## This notebook demonstrates the pipeline and analytics functionality of LLMResponseMetrics. Please see the README for more information.


In [1]:
from functions import *  # this is pulling functions from the function.py file in the code subfolder
import pandas as pd
import numpy as np
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.express as px

## This is the code for running the core pipeline functionality

In [2]:
# Models under test, mapped from model name to provider.
models_dict = {
    "claude-2.1": "ANTHROPIC",
    "gpt-3.5-turbo-0301": "OPENAI",
}

# Path to the file that contains your question-answer pairs.
csv_file_path = "../data/prompt_target_answer_pairs.csv"

# Set to False if you want to input data directly instead of via a file.
is_file_path = True

# Model used for analyzing semantic similarity.
similarity_model_name = "sentence-transformers/paraphrase-mpnet-base-v2"

# Use "variable" to vary the temperature across model responses, or set a
# number between 0 and 1 to keep it fixed.
temperature = "variable"

# [model name, provider] of the model that compares your target answer to the
# actual responses.
llm_evaluation_model = ["gpt-4", "OPENAI"]

# Instruction text supplied to the pipeline alongside the prompts.
instructions = "Please answer thoroughly: "

# [model name, provider] used to generate prompt perturbations. A strong model
# is recommended, otherwise it may generate the wrong number of perturbations.
perturbation_model = ["gpt-4", "OPENAI"]

# Number of runs for which the maximum score has to stay stable across all
# evaluation criteria before a prompt stops being re-run.
stability_threshold = 3

# Maximum number of runs per prompt if the stability threshold is never met.
max_runs = 6

pipeline = LLMAnalysisPipeline(
    input_data=csv_file_path,
    models_dict=models_dict,
    perturbation_model=perturbation_model,
    llm_evaluation_model=llm_evaluation_model,
    temperature=temperature,
    max_runs=max_runs,
    is_file_path=is_file_path,
    similarity_model_name=similarity_model_name,
    instructions=instructions,
    stability_threshold=stability_threshold,
)

# Run the pipeline and keep the resulting responses table.
df_responses = pipeline.run_pipeline()

2023-12-21 15:25:31,603 - INFO - Reading prompts from CSV file.
2023-12-21 15:25:48,435 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2023-12-21 15:25:48,440 - INFO - API call successful. Model: gpt-4, Provider: OPENAI
2023-12-21 15:26:03,331 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2023-12-21 15:26:03,335 - INFO - API call successful. Model: gpt-4, Provider: OPENAI
2023-12-21 15:26:09,880 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2023-12-21 15:26:09,884 - INFO - API call successful. Model: gpt-4, Provider: OPENAI
2023-12-21 15:26:31,659 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2023-12-21 15:26:31,666 - INFO - API call successful. Model: gpt-4, Provider: OPENAI
2023-12-21 15:26:54,403 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2023-12-21 15:26:54,411 - INFO 

## This is what the response table looks like

In [4]:
# Display the table returned by pipeline.run_pipeline() as the cell output.
df_responses

Unnamed: 0,model,original_prompt,response,temperature,actual_prompt,run_number,similarity_score,keyword_score,llm_rating,true_or_false,keywords
0,claude-2.1,What is the powerhouse of the cell and how doe...,The cell's main energy generator is the mitoc...,0.277427,- What is the cell's main energy generator and...,0,0.772294,0.571429,0.9,True,"[mitochondrion, mitochondria, atp, adenosine t..."
1,claude-2.1,What is the powerhouse of the cell and how doe...,The main energy provider of the cell is the m...,0.846408,- What is the main energy provider of the cell...,1,0.798746,0.428571,0.9,True,"[mitochondrion, mitochondria, atp, adenosine t..."
2,claude-2.1,What is the powerhouse of the cell and how doe...,"The cell's energy factory, also known as the ...",0.957211,- Can you tell me about the cell's energy fact...,2,0.845962,0.428571,0.9,True,"[mitochondrion, mitochondria, atp, adenosine t..."
3,claude-2.1,What is the powerhouse of the cell and how doe...,The main energy provider of the cell is the m...,0.17794,- What is the main energy provider of the cell...,3,0.803669,0.428571,0.8,True,"[mitochondrion, mitochondria, atp, adenosine t..."
4,claude-2.1,What is the powerhouse of the cell and how doe...,The primary power source of a cell is the mit...,0.151483,- What is the primary power source of a cell a...,4,0.872189,0.571429,0.9,True,"[mitochondrion, mitochondria, atp, adenosine t..."
5,claude-2.1,What is the powerhouse of the cell and how doe...,The main energy provider of the cell is the m...,0.501044,- What is the main energy provider of the cell...,5,0.802129,0.571429,0.8,True,"[mitochondrion, mitochondria, atp, adenosine t..."
6,gpt-3.5-turbo-0301,What is the powerhouse of the cell and how doe...,The main energy provider of the cell is a mole...,0.051949,- What is the main energy provider of the cell...,0,0.768067,0.5,0.8,True,"[mitochondrion, mitochondria, atp, adenosine t..."
7,claude-2.1,What is the basic unit of life and what does t...,The foundational unit of life is the cell. Ce...,0.406898,- What is the foundational unit of life and wh...,0,0.801457,0.133333,0.7,True,"[cell, biology, prokaryotic cells, eukaryotic ..."
8,claude-2.1,What is the basic unit of life and what does t...,"There is no definitive ""basic component of li...",0.803332,- What is the basic component of life and what...,1,0.772475,0.2,0.7,True,"[cell, biology, prokaryotic cells, eukaryotic ..."
9,claude-2.1,What is the basic unit of life and what does t...,The elemental unit of life is the cell. A cel...,0.875615,- What is the elemental unit of life and what ...,2,0.830411,0.2,0.9,True,"[cell, biology, prokaryotic cells, eukaryotic ..."


## This shows how many times each prompt ran and which run produced the best score

In [11]:
def analyze_best_scores(df_responses):
    """Report, per model and prompt, which run scored best and how many runs occurred.

    Parameters
    ----------
    df_responses : pandas.DataFrame
        Responses table containing the columns ``model``, ``original_prompt``,
        ``run_number``, ``similarity_score``, ``keyword_score`` and
        ``llm_rating``.

    Returns
    -------
    pandas.DataFrame
        One row per (model, prompt) pair with the columns ``model``,
        ``prompt``, ``best_run_similarity_score``, ``best_run_keyword_score``,
        ``best_run_llm_rating`` and ``total_runs``. A ``best_run_*`` entry is
        ``None`` when no best run could be determined for that metric.
    """
    methods = ['similarity_score', 'keyword_score', 'llm_rating']
    group_keys = ['model', 'original_prompt']

    # Highest score of every metric within each (model, prompt, run) group.
    max_scores = df_responses.groupby(group_keys + ['run_number'])[methods].max()

    # For each (model, prompt), the index label of the best-scoring run per
    # metric; labels are (model, original_prompt, run_number) tuples.
    best_runs = max_scores.groupby(level=[0, 1]).idxmax()

    def _run_number(label):
        # Anything that is not an index tuple (e.g. NaN) means "no best run".
        return label[2] if isinstance(label, tuple) else None

    # Map column by column with Series.map: DataFrame.map only exists in
    # pandas >= 2.1, whereas Series.map is available in every version.
    best_run_info = best_runs.apply(lambda column: column.map(_run_number))
    best_run_info.columns = [f'best_run_{method}' for method in methods]

    # Number of distinct runs per (model, prompt); aligned on the shared index.
    best_run_info['total_runs'] = (
        df_responses.groupby(group_keys)['run_number'].nunique()
    )

    # Return a fresh frame instead of mutating in place.
    return best_run_info.reset_index().rename(columns={'original_prompt': 'prompt'})

# Build the best-run summary from the responses table and display it.
df_best_scores = analyze_best_scores(df_responses)
df_best_scores


Unnamed: 0,model,prompt,best_run_similarity_score,best_run_keyword_score,best_run_llm_rating,total_runs
0,claude-2.1,How do astroflora perform photosynthesis in lo...,1,0,1,4
1,claude-2.1,What do genes consist of and how do they work?,2,0,0,5
2,claude-2.1,What is the basic unit of life and what does t...,3,3,2,6
3,claude-2.1,What is the powerhouse of the cell and how doe...,4,0,0,6
4,claude-2.1,What particle stabilizes the membrane of hover...,2,1,0,5
5,claude-2.1,Which organelle contains the cell's futurogeni...,0,0,0,3
6,gpt-3.5-turbo-0301,How do astroflora perform photosynthesis in lo...,1,1,3,6
7,gpt-3.5-turbo-0301,What do genes consist of and how do they work?,0,0,0,3
8,gpt-3.5-turbo-0301,What is the basic unit of life and what does t...,0,0,0,1
9,gpt-3.5-turbo-0301,What is the powerhouse of the cell and how doe...,0,0,0,1


## This shows the best response for each prompt by similarity score

In [6]:
# aggregate_best_scores is not defined in the visible cells; presumably it is
# provided by `from functions import *`. Here it is called for similarity_score.
aggregate_best_scores(df_responses, "similarity_score")

Unnamed: 0,model,original_prompt,actual_prompt,response,true_or_false,similarity_score
37,claude-2.1,How do astroflora perform photosynthesis in lo...,How do astroflora perform photosynthesis in lo...,"Unfortunately, I do not have enough informati...",False,0.679007
16,claude-2.1,What do genes consist of and how do they work?,What do genes consist of and how do they work?,Genes consist of DNA and are the basic units ...,True,0.811088
10,claude-2.1,What is the basic unit of life and what does t...,- What is the basic component of life and what...,Here is a thorough answer:\n\nThe basic compo...,True,0.851599
4,claude-2.1,What is the powerhouse of the cell and how doe...,- What is the primary power source of a cell a...,The primary power source of a cell is the mit...,True,0.872189
30,claude-2.1,What particle stabilizes the membrane of hover...,- Can you identify the particle that ensures t...,"Unfortunately, I do not have enough context t...",False,0.611484
22,claude-2.1,Which organelle contains the cell's futurogeni...,- Can you identify the organelle that stores t...,"Sure, I'd be happy to provide a thorough expl...",False,0.4948
41,gpt-3.5-turbo-0301,How do astroflora perform photosynthesis in lo...,- How do astroflora process photosynthesis in ...,"Astroflora, also known as space plants, are pl...",False,0.865393
19,gpt-3.5-turbo-0301,What do genes consist of and how do they work?,- What are genes made of and how do they operate?,Genes are made up of DNA (Deoxyribonucleic Aci...,True,0.844998
13,gpt-3.5-turbo-0301,What is the basic unit of life and what does t...,- What is the essential building block of life...,The essential building block of life is the ce...,True,0.687081
6,gpt-3.5-turbo-0301,What is the powerhouse of the cell and how doe...,- What is the main energy provider of the cell...,The main energy provider of the cell is a mole...,True,0.768067


## This shows the best response for each prompt by keyword score

In [7]:
# aggregate_best_scores is not defined in the visible cells; presumably it is
# provided by `from functions import *`. Here it is called for keyword_score.
aggregate_best_scores(df_responses, 'keyword_score')

Unnamed: 0,model,original_prompt,actual_prompt,response,true_or_false,keyword_score
36,claude-2.1,How do astroflora perform photosynthesis in lo...,"- In low light conditions, how do astroflora c...","Unfortunately, I do not have enough informati...",False,0.272727
14,claude-2.1,What do genes consist of and how do they work?,- What are the components of genes and their f...,Here is a thorough explanation of the compone...,True,0.692308
10,claude-2.1,What is the basic unit of life and what does t...,- What is the basic component of life and what...,Here is a thorough answer:\n\nThe basic compo...,True,0.266667
0,claude-2.1,What is the powerhouse of the cell and how doe...,- What is the cell's main energy generator and...,The cell's main energy generator is the mitoc...,True,0.571429
29,claude-2.1,What particle stabilizes the membrane of hover...,- Which particle is responsible for stabilizin...,"Unfortunately, I do not have enough context t...",False,0.222222
22,claude-2.1,Which organelle contains the cell's futurogeni...,- Can you identify the organelle that stores t...,"Sure, I'd be happy to provide a thorough expl...",False,0.0
41,gpt-3.5-turbo-0301,How do astroflora perform photosynthesis in lo...,- How do astroflora process photosynthesis in ...,"Astroflora, also known as space plants, are pl...",False,0.363636
19,gpt-3.5-turbo-0301,What do genes consist of and how do they work?,- What are genes made of and how do they operate?,Genes are made up of DNA (Deoxyribonucleic Aci...,True,0.615385
13,gpt-3.5-turbo-0301,What is the basic unit of life and what does t...,- What is the essential building block of life...,The essential building block of life is the ce...,True,0.133333
6,gpt-3.5-turbo-0301,What is the powerhouse of the cell and how doe...,- What is the main energy provider of the cell...,The main energy provider of the cell is a mole...,True,0.5


## This shows the best response for each prompt by LLM rating

In [8]:
# aggregate_best_scores is not defined in the visible cells; presumably it is
# provided by `from functions import *`. Here it is called for llm_rating.
aggregate_best_scores(df_responses, 'llm_rating')

Unnamed: 0,model,original_prompt,actual_prompt,response,true_or_false,llm_rating
37,claude-2.1,How do astroflora perform photosynthesis in lo...,How do astroflora perform photosynthesis in lo...,"Unfortunately, I do not have enough informati...",False,0.2
14,claude-2.1,What do genes consist of and how do they work?,- What are the components of genes and their f...,Here is a thorough explanation of the compone...,True,1.0
9,claude-2.1,What is the basic unit of life and what does t...,- What is the elemental unit of life and what ...,The elemental unit of life is the cell. A cel...,True,0.9
0,claude-2.1,What is the powerhouse of the cell and how doe...,- What is the cell's main energy generator and...,The cell's main energy generator is the mitoc...,True,0.9
28,claude-2.1,What particle stabilizes the membrane of hover...,- Which particle aids in the stabilization of ...,"Unfortunately, I do not have enough context t...",False,0.0
22,claude-2.1,Which organelle contains the cell's futurogeni...,- Can you identify the organelle that stores t...,"Sure, I'd be happy to provide a thorough expl...",False,0.2
43,gpt-3.5-turbo-0301,How do astroflora perform photosynthesis in lo...,- How do astroflora execute the process of pho...,"Astroflora, also known as space plants, execut...",False,0.8
19,gpt-3.5-turbo-0301,What do genes consist of and how do they work?,- What are genes made of and how do they operate?,Genes are made up of DNA (Deoxyribonucleic Aci...,True,0.9
13,gpt-3.5-turbo-0301,What is the basic unit of life and what does t...,- What is the essential building block of life...,The essential building block of life is the ce...,True,0.8
6,gpt-3.5-turbo-0301,What is the powerhouse of the cell and how doe...,- What is the main energy provider of the cell...,The main energy provider of the cell is a mole...,True,0.8


## This shows the spread (maximum and minimum) of the response scores for each model and prompt, by scoring method

In [9]:
def calculate_spread(df_responses):
    """Return the highest and lowest score of each metric per model and prompt.

    The result has one row per (model, original_prompt) pair and, for each of
    ``similarity_score``, ``keyword_score`` and ``llm_rating``, a ``<metric>_max``
    and a ``<metric>_min`` column.
    """
    score_columns = ['similarity_score', 'keyword_score', 'llm_rating']

    # Max and min of every metric within each (model, prompt) group.
    per_prompt = df_responses.groupby(['model', 'original_prompt'])
    spread_df = per_prompt.agg(
        {column: ['max', 'min'] for column in score_columns}
    ).reset_index()

    # Collapse the two-level column labels into "<metric>_<stat>"; the grouping
    # keys have an empty second level and keep their plain name.
    flat_names = []
    for top, stat in spread_df.columns.values:
        flat_names.append(f'{top}_{stat}'.strip() if stat else top)
    spread_df.columns = flat_names

    return spread_df

# Display the per-model, per-prompt max/min table for the responses.
calculate_spread(df_responses)

Unnamed: 0,model,original_prompt,similarity_score_max,similarity_score_min,keyword_score_max,keyword_score_min,llm_rating_max,llm_rating_min
0,claude-2.1,How do astroflora perform photosynthesis in lo...,0.679007,0.569102,0.272727,0.181818,0.2,0.0
1,claude-2.1,What do genes consist of and how do they work?,0.811088,0.699829,0.692308,0.461538,1.0,0.9
2,claude-2.1,What is the basic unit of life and what does t...,0.851599,0.772475,0.266667,0.133333,0.9,0.7
3,claude-2.1,What is the powerhouse of the cell and how doe...,0.872189,0.772294,0.571429,0.428571,0.9,0.8
4,claude-2.1,What particle stabilizes the membrane of hover...,0.611484,0.562824,0.222222,0.111111,0.0,0.0
5,claude-2.1,Which organelle contains the cell's futurogeni...,0.4948,0.457664,0.0,0.0,0.2,0.0
6,gpt-3.5-turbo-0301,How do astroflora perform photosynthesis in lo...,0.865393,0.690112,0.363636,0.272727,0.8,0.4
7,gpt-3.5-turbo-0301,What do genes consist of and how do they work?,0.844998,0.636486,0.615385,0.384615,0.9,0.8
8,gpt-3.5-turbo-0301,What is the basic unit of life and what does t...,0.687081,0.687081,0.133333,0.133333,0.8,0.8
9,gpt-3.5-turbo-0301,What is the powerhouse of the cell and how doe...,0.768067,0.768067,0.5,0.5,0.8,0.8
