## This notebook demonstrates the pipeline and analytics functionality of LLMResponseMetrics. Please see the README for more information.


In [13]:
from functions import *  # this is pulling functions from the function.py file in the code subfolder
import pandas as pd
import numpy as np
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.express as px
import string

## This is the code for running the core pipeline functionality

In [2]:
# Configuration for the demo run, followed by construction and execution of
# LLMAnalysisPipeline (presumably provided by `from functions import *`; its
# definition is not visible in this notebook).
# NOTE: running this cell issues live API calls (see the HTTP log output below).

# Models under test, mapping model name -> provider label.
# Uncomment the Claude entry to test it as well.
models_dict = {
    #'claude-2.1':  "ANTHROPIC", 
    'gpt-3.5-turbo-0301': "OPENAI"
               }
# Path to the file that contains your question / target-answer pairs.
csv_file_path = '../data/prompt_target_answer_pairs.csv'
# Model used for analyzing semantic similarity.
similarity_model_name = 'sentence-transformers/paraphrase-mpnet-base-v2'
# "variable" varies the temperature across model responses; set a number
# between 0 and 1 instead if you want a fixed temperature.
temperature = "variable"
# Set to False to pass the data in directly instead of via a file.
is_file_path = True
# [model name, provider] of the model that compares your target answer to the actual responses.
llm_evaluation_model = ['gpt-4', "OPENAI"]
# Text passed to the pipeline as `instructions` (how it is combined with each prompt
# is decided inside the pipeline -- not visible here).
instructions = "Please answer thoroughly: "
# [model name, provider] used to generate prompt perturbations. A strong model is
# recommended; a weaker one may generate the wrong number (presumably of perturbations).
perturbation_model = ['gpt-4', "OPENAI"]
# Number of consecutive runs ("periods") for which the maximum score must stay stable
# across all evaluation criteria before a prompt stops being re-run.
stability_threshold= 3
# Upper bound on the number of runs per prompt if the stability threshold is never met.
max_runs= 6

pipeline = LLMAnalysisPipeline(
    input_data=csv_file_path, 
    models_dict=models_dict, 
    perturbation_model=perturbation_model, 
    llm_evaluation_model=llm_evaluation_model,
    temperature = temperature,
    max_runs= max_runs,
    is_file_path = is_file_path,
    similarity_model_name = similarity_model_name,
    instructions = instructions,
    stability_threshold = stability_threshold

)

# Run the pipeline; the returned responses table is reused by every analysis cell below.
df_responses = pipeline.run_pipeline()

2023-12-28 13:31:53,669 - INFO - Reading prompts from CSV file.
2023-12-28 13:32:03,963 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2023-12-28 13:32:03,970 - INFO - API call successful. Model: gpt-4, Provider: OPENAI
2023-12-28 13:32:17,597 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2023-12-28 13:32:17,601 - INFO - API call successful. Model: gpt-4, Provider: OPENAI
2023-12-28 13:32:28,707 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2023-12-28 13:32:28,711 - INFO - API call successful. Model: gpt-4, Provider: OPENAI
2023-12-28 13:32:51,772 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2023-12-28 13:32:51,776 - INFO - API call successful. Model: gpt-4, Provider: OPENAI
2023-12-28 13:33:09,319 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2023-12-28 13:33:09,324 - INFO 

## This is what the response table looks like

In [3]:
# Display the responses table returned by pipeline.run_pipeline()
df_responses

Unnamed: 0,model,original_prompt,response,temperature,actual_prompt,run_number,similarity_score,keyword_score,llm_rating,keywords
0,gpt-3.5-turbo-0301,What is the powerhouse of the cell and how doe...,"The cell's powerhouse is the mitochondria, whi...",0.391784,- Can you explain the function of the cell's p...,0,0.910657,0.642857,0.9,"[mitochondrion, mitochondria, atp, adenosine t..."
1,gpt-3.5-turbo-0301,What is the powerhouse of the cell and how doe...,The powerhouse of the cell is the mitochondria...,0.496671,What is the powerhouse of the cell and how doe...,1,0.926819,0.642857,0.9,"[mitochondrion, mitochondria, atp, adenosine t..."
2,gpt-3.5-turbo-0301,What is the powerhouse of the cell and how doe...,The energy production center of the cell is kn...,0.97708,- What is the energy production center of the ...,2,0.849657,0.5,0.9,"[mitochondrion, mitochondria, atp, adenosine t..."
3,gpt-3.5-turbo-0301,What is the powerhouse of the cell and how doe...,"The cell's energy factory is the mitochondria,...",0.361931,- Can you describe the cell's energy factory a...,3,0.88385,0.571429,0.9,"[mitochondrion, mitochondria, atp, adenosine t..."
4,gpt-3.5-turbo-0301,What is the basic unit of life and what does t...,The key unit of life is the cell. A cell is th...,0.336419,- What is the key unit of life and what is its...,0,0.815932,0.2,0.8,"[cell, biology, prokaryotic cells, eukaryotic ..."
5,gpt-3.5-turbo-0301,What is the basic unit of life and what does t...,The basic unit of life is the cell. This means...,0.525557,What is the basic unit of life and what does t...,1,0.911885,0.133333,0.9,"[cell, biology, prokaryotic cells, eukaryotic ..."
6,gpt-3.5-turbo-0301,What is the basic unit of life and what does t...,The fundamental unit of life is the cell. A ce...,0.261278,- What is the fundamental unit of life and can...,2,0.837828,0.2,0.9,"[cell, biology, prokaryotic cells, eukaryotic ..."
7,gpt-3.5-turbo-0301,What is the basic unit of life and what does t...,The elemental unit of life is the cell. It den...,0.592574,- What is the elemental unit of life and what ...,3,0.780932,0.333333,0.8,"[cell, biology, prokaryotic cells, eukaryotic ..."
8,gpt-3.5-turbo-0301,What is the basic unit of life and what does t...,The essential unit of life is the cell. A cell...,0.185147,- What is the essential unit of life and what ...,4,0.823436,0.266667,0.8,"[cell, biology, prokaryotic cells, eukaryotic ..."
9,gpt-3.5-turbo-0301,What is the basic unit of life and what does t...,The simplest unit of life is the cell. A cell ...,0.945666,- What is the simplest unit of life and what i...,5,0.845666,0.133333,0.8,"[cell, biology, prokaryotic cells, eukaryotic ..."


## This shows how many times each prompt ran and which run produced the best score

In [4]:
def analyze_best_scores(df_responses, methods=None):
    """Report, per model and prompt, which run scored best on each metric.

    Parameters
    ----------
    df_responses : pandas.DataFrame
        Response table containing the columns 'model', 'original_prompt',
        'run_number' and one column per entry in ``methods``.
    methods : sequence of str, optional
        Score columns to analyze. Defaults to
        ['similarity_score', 'keyword_score', 'llm_rating'].

    Returns
    -------
    pandas.DataFrame
        One row per (model, prompt) with the columns 'model', 'prompt',
        one 'best_run_<method>' column per method (the run_number whose top
        score is highest; the lowest run_number wins ties) and 'total_runs'
        (the number of distinct run numbers seen for that prompt).
    """
    if methods is None:
        methods = ['similarity_score', 'keyword_score', 'llm_rating']
    else:
        # A list is required for column selection on a groupby object.
        methods = list(methods)

    group_keys = ['model', 'original_prompt']

    # Top score reached within each run, indexed by (model, original_prompt, run_number).
    max_scores = df_responses.groupby(group_keys + ['run_number'])[methods].max()

    # For each (model, prompt), idxmax returns the full index tuple of the best run.
    best_runs = max_scores.groupby(level=[0, 1]).idxmax()

    # Keep only the run_number (third element of the index tuple). Series.map is
    # used column by column because DataFrame.map only exists in pandas >= 2.1.
    # A non-tuple value (e.g. NaN for an all-NaN group) becomes None.
    best_run_info = best_runs.apply(
        lambda column: column.map(
            lambda idx: idx[2] if isinstance(idx, tuple) else None
        )
    )
    best_run_info.columns = [f'best_run_{method}' for method in methods]

    # Total number of distinct runs per (model, prompt); aligned on the shared index.
    best_run_info['total_runs'] = (
        df_responses.groupby(group_keys)['run_number'].nunique()
    )

    # Return a fresh frame instead of mutating in place.
    return best_run_info.reset_index().rename(columns={'original_prompt': 'prompt'})

# Build the best-run summary from the pipeline responses and display it
df_best_scores = analyze_best_scores(df_responses)
df_best_scores


Unnamed: 0,model,prompt,best_run_similarity_score,best_run_keyword_score,best_run_llm_rating,total_runs
0,gpt-3.5-turbo-0301,How do astroflora perform photosynthesis in lo...,4,0,2,6
1,gpt-3.5-turbo-0301,What do genes consist of and how do they work?,5,4,0,6
2,gpt-3.5-turbo-0301,What is the basic unit of life and what does t...,1,3,1,6
3,gpt-3.5-turbo-0301,What is the powerhouse of the cell and how doe...,1,0,0,4
4,gpt-3.5-turbo-0301,What particle stabilizes the membrane of hover...,0,0,0,3
5,gpt-3.5-turbo-0301,Which organelle contains the cell's futurogeni...,0,0,0,3


## This shows the best response for each prompt by similarity score

In [5]:
# aggregate_best_scores is not defined in this notebook (presumably it comes from
# `from functions import *`). Per the rendered output it yields one row per prompt:
# the response with the highest similarity_score.
aggregate_best_scores(df_responses, "similarity_score")

Unnamed: 0,model,original_prompt,actual_prompt,response,similarity_score
26,gpt-3.5-turbo-0301,How do astroflora perform photosynthesis in lo...,- How do astroflora execute the process of pho...,"Astroflora, also known as space plants, are pl...",0.846397
15,gpt-3.5-turbo-0301,What do genes consist of and how do they work?,What do genes consist of and how do they work?,Genes are the basic units of heredity and cons...,0.816292
5,gpt-3.5-turbo-0301,What is the basic unit of life and what does t...,What is the basic unit of life and what does t...,The basic unit of life is the cell. This means...,0.911885
1,gpt-3.5-turbo-0301,What is the powerhouse of the cell and how doe...,What is the powerhouse of the cell and how doe...,The powerhouse of the cell is the mitochondria...,0.926819
19,gpt-3.5-turbo-0301,What particle stabilizes the membrane of hover...,- What is the particle that maintains the stab...,"As an AI language model, I cannot find any sci...",0.655885
16,gpt-3.5-turbo-0301,Which organelle contains the cell's futurogeni...,Which organelle contains the cell's futurogeni...,The organelle that contains the cell's genetic...,0.473274


## This shows the best response for each prompt by keyword score

In [6]:
# Same helper as the similarity cell (defined outside this notebook -- presumably in
# functions.py), here selecting the response with the highest keyword_score per prompt.
aggregate_best_scores(df_responses, 'keyword_score')

Unnamed: 0,model,original_prompt,actual_prompt,response,keyword_score
22,gpt-3.5-turbo-0301,How do astroflora perform photosynthesis in lo...,- What is the mechanism of photosynthesis in a...,"As an AI language model, I don't have informat...",0.363636
14,gpt-3.5-turbo-0301,What do genes consist of and how do they work?,- What are the constituents of genes and how d...,The constituents of genes are DNA (deoxyribonu...,0.846154
7,gpt-3.5-turbo-0301,What is the basic unit of life and what does t...,- What is the elemental unit of life and what ...,The elemental unit of life is the cell. It den...,0.333333
0,gpt-3.5-turbo-0301,What is the powerhouse of the cell and how doe...,- Can you explain the function of the cell's p...,"The cell's powerhouse is the mitochondria, whi...",0.642857
19,gpt-3.5-turbo-0301,What particle stabilizes the membrane of hover...,- What is the particle that maintains the stab...,"As an AI language model, I cannot find any sci...",0.111111
16,gpt-3.5-turbo-0301,Which organelle contains the cell's futurogeni...,Which organelle contains the cell's futurogeni...,The organelle that contains the cell's genetic...,0.0


## This shows the best response for each prompt by LLM rating

In [7]:
# Same helper as the similarity cell (defined outside this notebook -- presumably in
# functions.py), here selecting the response with the highest llm_rating per prompt.
aggregate_best_scores(df_responses, 'llm_rating')

Unnamed: 0,model,original_prompt,actual_prompt,response,llm_rating
24,gpt-3.5-turbo-0301,How do astroflora perform photosynthesis in lo...,- How is it possible for astroflora to perform...,"Astroflora, also known as space plants, are pl...",0.8
10,gpt-3.5-turbo-0301,What do genes consist of and how do they work?,- What constitutes genes and how do they perfo...,Genes are segments of DNA that contain the ins...,0.9
5,gpt-3.5-turbo-0301,What is the basic unit of life and what does t...,What is the basic unit of life and what does t...,The basic unit of life is the cell. This means...,0.9
0,gpt-3.5-turbo-0301,What is the powerhouse of the cell and how doe...,- Can you explain the function of the cell's p...,"The cell's powerhouse is the mitochondria, whi...",0.9
19,gpt-3.5-turbo-0301,What particle stabilizes the membrane of hover...,- What is the particle that maintains the stab...,"As an AI language model, I cannot find any sci...",0.1
16,gpt-3.5-turbo-0301,Which organelle contains the cell's futurogeni...,Which organelle contains the cell's futurogeni...,The organelle that contains the cell's genetic...,0.1


## This shows the spread (maximum and minimum) of response scores for each prompt and scoring method

In [8]:
def calculate_spread(df_responses, methods=None, stats=('max', 'min')):
    """Summarize the range of each score per model and prompt.

    Parameters
    ----------
    df_responses : pandas.DataFrame
        Response table containing the columns 'model', 'original_prompt'
        and one column per entry in ``methods``.
    methods : sequence of str, optional
        Score columns to summarize. Defaults to
        ['similarity_score', 'keyword_score', 'llm_rating'].
    stats : sequence of str, optional
        Aggregations applied to every score column. Defaults to
        ('max', 'min'); pass e.g. ('max', 'min', 'std') to add the
        standard deviation as well.

    Returns
    -------
    pandas.DataFrame
        One row per (model, original_prompt) with flat column names of the
        form '<method>_<stat>', e.g. 'similarity_score_max'.
    """
    if methods is None:
        methods = ['similarity_score', 'keyword_score', 'llm_rating']
    stats = list(stats)

    # Group by model and original prompt
    grouped = df_responses.groupby(['model', 'original_prompt'])

    # Apply the requested aggregations (max and min by default) to each method
    agg_functions = {method: stats for method in methods}
    spread_df = grouped.agg(agg_functions).reset_index()

    # Flatten the (column, stat) MultiIndex; the group keys have an empty
    # second level and keep their plain name.
    spread_df.columns = [
        '_'.join(col).strip() if col[1] else col[0]
        for col in spread_df.columns.values
    ]

    return spread_df

# Display the per-model, per-prompt max and min of each score
calculate_spread(df_responses)

Unnamed: 0,model,original_prompt,similarity_score_max,similarity_score_min,keyword_score_max,keyword_score_min,llm_rating_max,llm_rating_min
0,gpt-3.5-turbo-0301,How do astroflora perform photosynthesis in lo...,0.846397,0.664994,0.363636,0.272727,0.8,0.2
1,gpt-3.5-turbo-0301,What do genes consist of and how do they work?,0.816292,0.684582,0.846154,0.230769,0.9,0.9
2,gpt-3.5-turbo-0301,What is the basic unit of life and what does t...,0.911885,0.780932,0.333333,0.133333,0.9,0.8
3,gpt-3.5-turbo-0301,What is the powerhouse of the cell and how doe...,0.926819,0.849657,0.642857,0.5,0.9,0.9
4,gpt-3.5-turbo-0301,What particle stabilizes the membrane of hover...,0.655885,0.5079,0.111111,0.0,0.1,0.0
5,gpt-3.5-turbo-0301,Which organelle contains the cell's futurogeni...,0.473274,0.436505,0.0,0.0,0.1,0.1
