In [29]:
def create_prompt(prompt,model_a_response,model_b_response):
    ''' Build the system and user messages asking a judge model to compare two answers.

    The system message describes the judging procedure and shows one worked
    example whose expected output is a JSON object with the keys "Answer" and
    "Explication". The user message wraps the original prompt and both model
    responses in the same tags the system message refers to.

    Parameters:
    prompt (str) : The original prompt
    model_a_response (str)   : Answer generated by model a
    model_b_response (str)   : Answer generated by model b

    Returns:
    tuple : (content_system, content_user), two strings holding the system message and the user message.
    '''
    # The tag names quoted in Step 1 must match the tags used in the user
    # message, and the example output is kept as valid JSON so the judge is
    # shown exactly the key names and quoting it is expected to reproduce.
    content_system = '''
    You are an AI judge evaluating responses from two different models to determine which one provides a better answer to a given prompt. 
    Use the following step-by-step instructions to respond to user inputs.\n

    Step 1 - The user will provide you with the prompt delimited by <prompt>, the model a response delimited by <model_a_response> and the model b response delimited by <model_b_response>. Analyze the prompt. \n

    Step 2 - Compare the two answer and look for similarity and difference. For the difference choose which model is wrong. \n

    Step 3 - Decide which model as a more conscice, factual and pedagogial answer. Give the result in JSON format while explaining why you choose this model. If the 2 answer are similar you can answer "Tie". \n

    Here is an example. 
    Question : <prompt> What is an hexagone </prompt> \n
    Model A response:
    <model_a_response>
    "An hexagone is a plane figure with six straight sides and angles."
    </model_a_response> \n
    Model B response:
    <model_b_response>
    "An hexagone is a plane figure with seven straight sides and angles."
    </model_b_response> \n
    Answer: {
        "Answer": "A",
        "Explication": "Model b is giving a false information by explaning that an hexagone has seven sides"
    }
    '''
    content_user = f'''
    User prompt:
    <prompt>
    {prompt}
    </prompt> \n
    Model A response:
    <model_a_response>
    "{model_a_response}"
    </model_a_response> \n
    Model B response:
    <model_b_response>
    "{model_b_response}"
    </model_b_response> \n
     '''
    return content_system,content_user

In [33]:
import pandas as pd 
import numpy as np 



def load_dataset(path,n):
    ''' Load the prompt, the response of model a and the response of model b, and process them.

    Only the first n rows are parsed from the CSV file instead of reading the
    whole file and slicing afterwards.

    Parameters:
    path (str) : Path of the csv file containing the dataset
    n (int)   : Number of rows to load, starting from the top of the file.

    Returns:
    list  : List of lists, one per row, each holding the processed prompt, response a and response b (in that order).
    '''
    # Let pandas stop parsing after n rows. read_csv rejects a negative nrows,
    # so in that case read everything and let the slice below apply the usual
    # negative-stop semantics.
    nrows = n if (n is None or n >= 0) else None
    df = pd.read_csv(path, nrows=nrows)
    rows = df[['prompt','response_a','response_b']].iloc[0:n]
    # Every field loses 2 characters at each end; presumably this strips a
    # '["' / '"]' wrapper around each value -- TODO confirm against the CSV.
    return [
        [process_string(txt,2) for txt in row]
        for row in rows.itertuples(index = False,name = None)
    ]

def process_string(str,n):
    ''' Strip n characters from each end of a string.

    Parameters:
    str (str) : Text to trim
    n (int)   : How many characters to drop from the start and from the end

    Returns:
    str : The middle part of the text that remains after trimming
    '''
    start = n
    # Computed from the length (rather than using -n) so that n == 0 keeps the whole text.
    stop = len(str) - n
    return str[start:stop]



In [39]:
from openai import OpenAI
import os 

# Read the API key from the environment so it never appears in the notebook.
openai_api_key = os.environ.get('OPENAI_API_KEY')
# Shared client used by inference() for every request.
client = OpenAI(api_key = openai_api_key)

def inference(content):
    ''' Ask the OpenAI chat API to judge one (prompt, response a, response b) triple.

    Parameters:
    content (list) : Sequence whose first three items are the prompt, the model a response and the model b response.

    Returns:
    completion : The chat completion object returned by the API
    '''
    system_text,user_text = create_prompt(content[0],content[1],content[2])
    messages = [
        {"role": "system", "content": system_text},
        {"role": "user", "content": user_text},
    ]
    # The reply is requested as a JSON object.
    return client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        response_format={"type": "json_object"},
    )


In [53]:
from timeit import default_timer as timer
import json

def processing_answer(answer):
    ''' Turn a list of chat completions into a dataframe of verdicts.

    Parameters:
    answer (list) : List of chat completion

    Returns:
    dataframe : Dataframe with 2 columns, "Answer" and "Explication", and one row per completion
    '''
    def _extract(completion):
        # The content of the first choice is parsed as a JSON object.
        payload = json.loads(completion.choices[0].message.content)
        return (payload['Answer'],payload['Explication'])

    rows = [_extract(completion) for completion in answer]
    return pd.DataFrame(rows,columns=["Answer","Explication"])

def multiple_inference(path='data/train.csv',n=5):
    ''' Load the dataset, judge every row with the model and collect the verdicts.

    Progress and elapsed time of the loading and inference stages are printed.

    Parameters:
    path (str) : Path of the csv file containing the dataset (defaults to 'data/train.csv')
    n (int)   : Number of rows to load and judge (defaults to 5)

    Returns:
    dataframe : Dataframe with 2 columns, "Answer" and "Explication", and one row per inference, containing the model choice and the explication
    list : list of chat completion
    '''
    print("Start of data loading")
    start = timer()
    dataset = load_dataset(path,n)
    end = timer()
    print(f"End of data loading after {end-start}s")

    print(f"Start of Inference - Number of inference : {len(dataset)}")
    start = timer()
    # One API call per row; the raw completions are kept so they can be returned alongside the table.
    answer = [inference(data) for data in dataset]
    end = timer()
    print(f"End of Inference after {end-start}s")

    return processing_answer(answer),answer


In [55]:
# Run the whole pipeline: `df` is the verdict table, `raw` the list of unprocessed chat completions.
df,raw = multiple_inference()
df

Start of data loading
End of data loading after 1.6284766000007949s
Start of Inference - Number of inference : 5
End of Inference after 17.559931800000413s


Unnamed: 0,Answer,Explication
0,A,"Model A provides a more comprehensive, detaile..."
1,B,Model B provides a clearer and more organized ...
2,B,Model B provides a more comprehensive explanat...
3,A,Model A provides a more comprehensive and in-d...
4,B,Model B provides a more accurate and concise a...
