In [1]:
import base64
import os
from google import genai
from google.genai import types
from pydantic import BaseModel
import json
import pprint as pp
import dotenv as env
import sys
import pandas
# Load environment variables from the local keys file before any client is created.
# Gemini_Models.__init__ reads GEMINI_API_KEY from os.environ (presumably supplied by this file - confirm).
# NOTE: the path is relative, so it is resolved against the kernel's current working directory.
env.load_dotenv('../keys/keys.env')

True

In [2]:
class HeadlineClassification(BaseModel):
    # Shape of one classified headline. Gemini_Models.classify_headlines passes
    # list[HeadlineClassification] as the response schema, so the model's JSON reply is
    # expected to be a list of objects with exactly these fields.
    # NOTE(review): documented with `#` comments on purpose - a class docstring may be
    # surfaced in the JSON schema generated from this model; verify before adding one.

    # Identifier of the headline's link - presumably echoed back from the input records; confirm.
    link_id: str
    # Do not rename this field: per the original author's note, the model only returns the
    # original text sent to it when the field name matches the name used in the input.
    # NOTE(review): that note was written about OpenAI; this notebook calls Gemini - confirm it still applies.
    # Downstream wrappers must map these field names onto the headers of the final output DataFrame.
    title: str
    # Boolean verdict for the headline; what True means is defined by the prompt, which is not visible here.
    classification: bool
    # Presumably the model's free-text justification for the verdict - confirm against the prompt.
    explanation: str

In [3]:
class Gemini_Models:
    """Wrapper around the Gemini client for structured headline classification.

    The model name and the prompt-file path are class-level settings. The API
    key is read from the ``GEMINI_API_KEY`` environment variable when an
    instance is created.
    """

    __model_name = 'gemini-2.5-pro-exp-03-25'          #produces most reliable output
    #__model_name = 'o4-mini-2025-04-16'
    __prompt_file_path = '../prompts/gemini_ai_prompts.json'

    def __init__(self):
        # os.environ.get yields None when GEMINI_API_KEY is not set.
        self.__client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))

    def __getPromptFromFile(self, type: str) -> str:
        """Return the prompt stored under key ``type`` in the prompts JSON file.

        Args:
            type: Key of the wanted prompt inside the JSON document.

        Returns:
            The value stored under that key.

        Raises:
            OSError: If the prompt file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
            KeyError: If the file has no (non-null) entry for ``type``.
        """
        # Explicit encoding so non-ASCII prompt text does not depend on the platform default.
        with open(Gemini_Models.__prompt_file_path, 'r', encoding='utf-8') as file:
            parsed_data = json.load(file)
        prompt = parsed_data.get(type)
        if prompt is None:
            # Fail fast instead of passing None on as the system instruction.
            raise KeyError(f'No prompt named {type!r} in {Gemini_Models.__prompt_file_path}')
        return prompt

    def classify_headlines(self, input: pandas.DataFrame, silent_mode=True) -> 'str | None':
        """Send the rows of ``input`` to the Gemini model for classification.

        The frame is serialised with ``to_json(orient='records')`` and sent as
        the user message; the prompt loaded from the prompt file is the system
        instruction, and the reply is requested as JSON matching
        ``list[HeadlineClassification]``.

        Args:
            input: DataFrame whose rows are the headlines to classify.
            silent_mode: When truthy, everything this method prints (including
                the error message on API failure) is discarded by temporarily
                pointing ``sys.stdout`` at ``os.devnull``.

        Returns:
            The raw text of the model reply, or ``None`` if the API call
            raised an exception.

        Raises:
            OSError, json.JSONDecodeError, KeyError: If the prompt cannot be
                loaded. ``sys.stdout`` is restored before these propagate.
        """
        original_stdout = sys.stdout   # saved so it can always be put back
        devnull = open(os.devnull, 'w') if silent_mode else None
        if devnull is not None:
            sys.stdout = devnull
        try:
            # Everything after the redirect lives inside this try so that a failure
            # while loading the prompt or building the request cannot leave stdout
            # pointing at devnull for the rest of the kernel session.
            prompt = self.__getPromptFromFile('headlines_classifier_real_estate')
            print('Sending titles for classification to Gemini model...')
            print(f'Input is of length: {len(input)}.')
            generation_config = types.GenerateContentConfig(
                temperature=0.8,
                system_instruction=[
                    types.Part.from_text(text=prompt),
                ],
                response_mime_type='application/json',
                response_schema=list[HeadlineClassification],    #force response in a structured format from Gemini
            )
            contents = [
                types.Content(
                    role="user",
                    parts=[
                        types.Part.from_text(text=input.to_json(orient='records')),
                    ],
                ),
            ]

            try:
                model_reply = self.__client.models.generate_content(
                    model=self.__model_name,
                    contents=contents,
                    config=generation_config,
                )
                return model_reply.text
            except Exception as e:
                # Best effort: API failures are reported and signalled with None.
                print(f'Gemini AI execution threw an exception: {e}')
                return None
        finally:
            if devnull is not None:
                # Restore first, then close the handle this call opened (not
                # whatever sys.stdout happens to be at this point).
                sys.stdout = original_stdout
                devnull.close()
        

In [4]:
# Quick manual smoke test - uncomment the lines below to run (this makes a live Gemini API call)

#llm = Gemini_Models()
#df = pandas.DataFrame([['Mumbai', 'Ghatkopar college student among two to drown i...'], 
#                       ['Mumbai', 'NCP-SP MLA Jitendra Awhad receives death threats'], 
#                       ['Mumbai', 'Mumbai court says life term for rapist dad too']], 
#                      columns=['sub-site', 'title'])
#response = llm.classify_headlines(df[['sub-site', 'title']], silent_mode=False)
#out_df = pandas.DataFrame(json.loads(response), columns=['title', 'classification', 'explanation'])
#out_df