In [1]:
import contextlib
import json
import os
import pprint as pp
import sys
from typing import Optional

import dotenv as env
import pandas
from openai import OpenAI
from pydantic import BaseModel

# Load the variables defined in the keys file into the process environment.
# OpenAIModels.__init__ reads OPENAI_API_KEY from os.environ, so this presumably
# has to run first (assumes keys.env defines that variable — confirm).
# The path is relative, so it depends on the notebook's working directory.
env.load_dotenv('../keys/keys.env')

True

In [2]:
class HeadlineClassification(BaseModel):
    # Schema for a single classified headline; it is the element type of
    # listClassifications.output.
    # NOTE: documented with '#' comments on purpose. A pydantic model docstring is
    # exported into the model's JSON schema as its description, which would change
    # the schema handed to the OpenAI client.

    # Do not rename this field. Per the original author's observation, the model
    # only echoes back the original text that was sent when this field carries the
    # same name as the corresponding input column ('title').
    # Any wrapper that builds the final output DataFrame must therefore take care
    # of mapping these field names to the desired column headers.
    title: str
    # Boolean verdict for the headline. Presumably True means the headline matches
    # the category described by the prompt — confirm against the prompt file.
    classification: bool
    # Free-text field; presumably the model's justification for the verdict —
    # confirm against the prompt file.
    explanation: str

In [3]:
class listClassifications(BaseModel):
    # Top-level schema passed as `text_format` by OpenAIModels.classify_headlines.
    # It wraps the list of per-headline results in a single 'output' field, so the
    # parsed JSON has the shape {"output": [ {title, classification, explanation}, ... ]}.
    # ('#' comments are used instead of a docstring so the JSON schema generated by
    # pydantic stays unchanged.)
    output: list[HeadlineClassification]

In [4]:
class OpenAIModels:
    """Small wrapper around the OpenAI client used to classify news headlines.

    The model name and the location of the prompt file are class-level settings.
    The API key is read from the OPENAI_API_KEY environment variable when an
    instance is created.
    """

    # Alternative model names kept for reference:
    #   'gpt-4.1-nano-2025-04-14', 'o4-mini-2025-04-16'
    __model_name = 'gpt-4o-mini-2024-07-18'          # produces most reliable output
    # JSON file mapping a prompt key to the prompt text; relative to the working directory.
    __prompt_file_path = '../prompts/open_ai_prompts.json'

    def __init__(self):
        """Create the OpenAI client using the OPENAI_API_KEY environment variable."""
        self.__client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

    def __getPromptFromFile(self, type: str) -> str:
        """Return the prompt stored under key `type` in the prompt JSON file.

        Raises:
            OSError: if the prompt file cannot be opened.
            json.JSONDecodeError: if the file is not valid JSON.
            KeyError: if the file has no (non-null) entry for `type`; this avoids
                silently sending a request without instructions.
        """
        # JSON text is UTF-8; do not rely on the platform's default encoding.
        with open(OpenAIModels.__prompt_file_path, 'r', encoding='utf-8') as file:
            parsed_data = json.load(file)
        prompt = parsed_data.get(type)
        if prompt is None:
            raise KeyError(
                f"Prompt '{type}' not found in {OpenAIModels.__prompt_file_path}"
            )
        return prompt

    def classify_headlines(self, input: pandas.DataFrame, silent_mode=True) -> Optional[str]:
        """Send the rows of `input` to the OpenAI model for classification.

        Args:
            input: DataFrame of headlines; it is serialised with
                ``to_json(orient='records')`` and sent as the request input.
            silent_mode: when True (default), everything printed to stdout during
                the call — including the error message printed on an API failure —
                is discarded. stdout is always restored afterwards, even if the
                call raises.

        Returns:
            The raw JSON text of the response, or None if the API call raised.
            The response follows the ``listClassifications`` schema, i.e. the
            per-headline results are nested under the top-level 'output' key, so
            callers need ``json.loads(result)['output']`` to get the inner list.

        Raises:
            OSError, json.JSONDecodeError, KeyError: if the prompt cannot be loaded
                (see ``__getPromptFromFile``). Only errors raised by the API call
                itself are converted into a None return value.
        """
        if not silent_mode:
            return self.__request_classification(input)
        # The context managers restore sys.stdout and close the devnull handle on
        # every exit path, including exceptions raised while loading the prompt.
        with open(os.devnull, 'w') as sink, contextlib.redirect_stdout(sink):
            return self.__request_classification(input)

    def __request_classification(self, input: pandas.DataFrame) -> Optional[str]:
        """Load the prompt, call the model and return its raw text output (None on API error)."""
        prompt = self.__getPromptFromFile('headlines_classifier_real_estate')
        print('Sending titles for classification to OpenAI model...')
        print(f'Input is of length: {len(input)}.')
        try:
            response = self.__client.responses.parse(
                model=OpenAIModels.__model_name,
                temperature=0.8,
                instructions=prompt,
                input=input.to_json(orient='records'),
                text_format=listClassifications,
            )
            return response.output_text
        except Exception as e:
            # Best effort: report the failure and signal it to the caller with None.
            print(f'Open AI execution threw an exception: {e}')
            return None

In [5]:
# Quick manual smoke test: uncomment the lines below to run it (this makes a live OpenAI API call).

#llm = OpenAIModels()
#df = pandas.DataFrame([['Mumbai', 'Ghatkopar college student among two to drown i...'], 
#                       ['Mumbai', 'NCP-SP MLA Jitendra Awhad receives death threats'], 
#                       ['Mumbai', 'Mumbai court says life term for rapist dad too']], 
#                      columns=['sub-site', 'title'])
#response = llm.classify_headlines(df[['sub-site', 'title']], silent_mode=False)
#out_df = pandas.DataFrame(json.loads(response)['output'], columns=['title', 'classification', 'explanation'])
#out_df