In [None]:
import FinFeedRAG
import random
#Generate evaluation prompts with gpt-3.5-turbo and store them in evaluation_prompts.txt
def generate_evaluation_prompts():
    """Ask gpt-3.5-turbo for a list of 100 economy questions and save the reply.

    The text content of the model's reply is written verbatim to
    ``Evaluation_experiments/evaluation_prompts.txt``, replacing any previous
    content of that file. Nothing is returned.
    """
    llm = ChatOpenAI(model="gpt-3.5-turbo")
    answer = llm.invoke("Please generate a list of 100 questions that someone may ask about the current state of the economy.")

    # The context manager closes (and therefore flushes) the file even if the
    # write raises, so the prompts are reliably on disk when this returns.
    with open("Evaluation_experiments/evaluation_prompts.txt", "w") as prompts_file:
        prompts_file.write(answer.content)

#Populate prompts from evaluation_prompts.txt
def load_evaluation_user_prompts():
    """Load the evaluation prompts, one per line, from the prompts file.

    Reads ``Evaluation_experiments/evaluation_prompts.txt``. Each line has
    leading and trailing digits, periods, spaces and newlines stripped, which
    removes list numbering such as ``"12. "``. Lines that are empty after
    stripping (e.g. blank separator lines) are skipped so that no empty
    prompt is handed to the evaluation.

    Returns:
        A list of prompt strings in file order.
    """
    prompts = []

    # The context manager guarantees the file handle is released.
    with open("Evaluation_experiments/evaluation_prompts.txt", "r") as prompts_file:
        for line in prompts_file:
            prompt = line.strip('0123456789. \n')
            if prompt:
                prompts.append(prompt)

    return prompts

def combined_bots(user_prompt):
    """Answer ``user_prompt`` by merging a plain gpt-3.5-turbo answer with the FinFeedRAG answer.

    Three steps are performed:
      1. gpt-3.5-turbo (temperature 0) answers the prompt directly.
      2. A FinFeedRAG bot, built with the 'latest-news' pinecone index and API
         keys read from the environment, answers via ``chain_for_eval``.
      3. gpt-3.5-turbo (temperature 0) is asked to combine both answers.

    Args:
        user_prompt: The question to answer, as a string.

    Returns:
        The text content of the combined answer.
    """
    # The base answer and the merge step use the same deterministic
    # configuration, so one client serves both calls.
    model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

    #Base answer
    base_answer = model.invoke(user_prompt).content

    #Our answer
    # NOTE(review): the visible import is `import FinFeedRAG`, which binds a
    # module; calling it only works if the class itself is bound to this name
    # elsewhere (e.g. `from FinFeedRAG import FinFeedRAG`) -- confirm.
    bot = FinFeedRAG(
        pine_cone_api_key=os.getenv('PINECONE_API_KEY'),
        openai_api_key=os.getenv('OPENAI_API_KEY'),
        pinecone_index='latest-news',
    )
    our_answer = bot.chain_for_eval(user_prompt)

    #Combine answers
    # The prompt and both answers are passed as template variables instead of
    # being concatenated into the template text: concatenated text is parsed
    # as a template, so any '{' or '}' in it would be treated as a placeholder
    # and break formatting.
    template = ChatPromptTemplate.from_messages([
        ("system", "You will be given two answers provided by different chatbots to the following user prompt:"),
        ("system", "User_prompt: {user_prompt}"),
        ("system", "The following are the answers provided by the chatbots:"),
        ("system", "Answer 1: {base_answer}"),
        ("system", "Answer 2: {our_answer}"),
        ("system", "Please combine both answers into a single, coherent, and comprehenive answer."),
    ])

    chaining = template | model
    combined_answer = chaining.invoke({
        "user_prompt": user_prompt,
        "base_answer": base_answer,
        "our_answer": our_answer,
    })

    return combined_answer.content
def _parse_preferred_answer(reply):
    """Return the first integer found in the evaluator's reply text."""
    import re

    match = re.search(r"\d+", reply)
    if match is None:
        # Same exception type that a failed int() conversion would raise.
        raise ValueError("No integer found in evaluator reply: " + repr(reply))
    return int(match.group())


#Evaluation function
#This function evaluates our integrated bot against chatgpt_3.5-turbo

def model_evaluation(user_prompts, evaluator_prompt):
    """Compare the combined bot against plain gpt-3.5-turbo on each prompt.

    For every prompt, the combined bot (``combined_bots``) and a benchmark
    gpt-3.5-turbo model (temperature 0) each produce an answer. The benchmark
    answer is shown to the evaluator as "Answer 1" and the combined bot's as
    "Answer 2". A second pass asks the model to reduce each free-text
    evaluation to a single integer naming the preferred answer.

    Args:
        user_prompts: Iterable of prompt strings to evaluate on.
        evaluator_prompt: Instruction text given to the evaluating model.

    Returns:
        A tuple ``(benchmark_answers, our_answers, evaluations,
        preferred_answers)`` of lists aligned by prompt; ``preferred_answers``
        holds integers.

    Raises:
        ValueError: If a reply in the second pass contains no integer.
    """
    our_answers = []
    benchmark_answers = []
    evaluations = []
    preferred_answers = []

    # One deterministic client is reused for the benchmark answers and both
    # evaluation passes; every call uses the same configuration.
    model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

    # Texts are passed as template variables rather than concatenated into the
    # template: concatenated text is parsed as a template, so any '{' or '}'
    # in a prompt or answer would be treated as a placeholder and fail.
    evaluation_chain = ChatPromptTemplate.from_messages([
        ("system", "{evaluator_prompt}"),
        ("system", "User_prompt: {user_prompt}"),
        ("system", "Answer 1: {benchmark_answer}"),
        ("system", "Answer 2: {our_answer}"),
    ]) | model

    extraction_chain = ChatPromptTemplate.from_messages([
        ("system", "Determine what is the user's preferred answer. Please restrict your output to a single integer."),
        ("system", "User_prompt: {evaluation}"),
    ]) | model

    for user_prompt in user_prompts:

        #Answers
        our_answer = combined_bots(user_prompt)

        #Benchmark answer
        benchmark_answer = model.invoke(user_prompt).content

        #Evaluation
        evaluation = evaluation_chain.invoke({
            "evaluator_prompt": evaluator_prompt,
            "user_prompt": user_prompt,
            "benchmark_answer": benchmark_answer,
            "our_answer": our_answer,
        })

        our_answers.append(our_answer)
        benchmark_answers.append(benchmark_answer)
        evaluations.append(evaluation.content)

    #Extract preferred answer as an integer
    for evaluation in evaluations:
        preferred_answer = extraction_chain.invoke({"evaluation": evaluation})
        # The model does not always reply with a bare digit (e.g. "Answer 2"),
        # so the first integer in the reply is extracted instead of calling
        # int() on the whole text.
        preferred_answers.append(_parse_preferred_answer(preferred_answer.content))

    return benchmark_answers, our_answers, evaluations, preferred_answers
    

In [None]:
#Evaluate model on prompts
prompts = load_evaluation_user_prompts()

#Instruction given to the evaluating model.
#An optional suffix ". Please provide a reason for your choice." is currently left out.
evaluator_prompt = "You are a helpful finance bot evaluator. You will be given a user prompt, and the corresponding answers of two different bots. The user prompt is likely concerned with very current matters, so the bots may or may not have access to relevant information. Please e valuate which answer provides the most relevant answer. Based on this, which  do you consider the best answer?."

#We only evaluate on 50 prompts chosen randomly.
#random.sample raises ValueError when asked for more items than exist, so the
#sample size is clamped to the number of prompts actually loaded.
sample_size = min(50, len(prompts))
benchmark_answers, our_answers, evaluations, preferred_answers = model_evaluation(random.sample(prompts, sample_size), evaluator_prompt)

In [None]:
#Store results of the evaluation
import os

#Set the experiment number for bookkeeping
experiment_number = 1

#All artefacts of one experiment live in the same directory. It is created up
#front so that saving cannot fail on a missing folder after the evaluation
#has already been run.
experiment_dir = 'Evaluation_experiments/Experiment_' + str(experiment_number)
os.makedirs(experiment_dir, exist_ok=True)

#Create a dataframe with results and save to a .pkl file
data = {'benchmark_answer':benchmark_answers,'our_answer': our_answers, 'llm_evaluation':evaluations,'preferred_model':preferred_answers}
df = pd.DataFrame(data=data)

#Add the evaluation prompt as an attribute to the dataframe
df.attrs['evaluator_prompt'] = evaluator_prompt

df.to_pickle(experiment_dir + '/results_experiment_' + str(experiment_number) + '.pkl')

#Create bar plot
#In the evaluation, answer 1 is the benchmark and answer 2 is our bot.
labels = ['Benchmark','FinFeed']
values = [(df['preferred_model']==1).sum(),(df['preferred_model']==2).sum()]

fig = plt.figure(figsize = (5, 5))

# creating the bar plot
plt.bar(labels, values)

plt.xlabel("Models")
plt.ylabel("Number of times chosen by the evaluator")
plt.title('Results of evaluation')

plt.savefig(experiment_dir + '/barplot_experiment_' + str(experiment_number) + '.png')