## This notebook demos the pipeline and analytics functionality of LLMResponseMetrics. Please see the README for more information.


In [1]:
from functions import *  # this is pulling functions from the function.py file in the code subfolder
import pandas as pd
import numpy as np
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.express as px
import string
import itertools

## This is the code for running the core pipeline functionality

In [3]:
# --- Models -------------------------------------------------------------------
# Models to test, keyed by model name with the provider as the value.
models_dict = {
    # 'claude-2.1': "ANTHROPIC",
    'gpt-3.5-turbo-0301': "OPENAI",
}
# Helper models are given as [model name, provider].
# A good model is recommended for perturbations; a weaker one may generate the wrong number of them.
perturbation_model = ['gpt-4', "OPENAI"]
# Model that compares your target answer to the actual responses.
llm_evaluation_model = ['gpt-4', "OPENAI"]
# Model used for analyzing semantic similarity.
similarity_model_name = 'sentence-transformers/paraphrase-mpnet-base-v2'

# --- Input data ---------------------------------------------------------------
# File that contains your question-answer pairs.
csv_file_path = '../data/prompt_target_answer_pairs.csv'
# Set to False if you want to input data directly instead of via a file.
is_file_path = True

# --- Prompting ----------------------------------------------------------------
instructions = "Please answer thoroughly: "
# Set to a number between 0 and 1 if you don't want to vary temperature for the model responses.
temperature = "variable"

# --- Stopping criteria --------------------------------------------------------
# Number of periods the maximum score has to stay stable across all evaluation
# criteria before a prompt stops running.
stability_threshold = 2
# Maximum number of runs per prompt if the stability threshold is never met.
max_runs = 3

pipeline = LLMAnalysisPipeline(
    input_data=csv_file_path,
    is_file_path=is_file_path,
    models_dict=models_dict,
    perturbation_model=perturbation_model,
    llm_evaluation_model=llm_evaluation_model,
    similarity_model_name=similarity_model_name,
    instructions=instructions,
    temperature=temperature,
    stability_threshold=stability_threshold,
    max_runs=max_runs,
)

# Run the pipeline
df_responses = pipeline.run_pipeline()

2024-01-03 12:06:24,366 - INFO - Reading prompts from CSV file.
2024-01-03 12:06:34,081 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-01-03 12:06:34,089 - INFO - API call successful. Model: gpt-4, Provider: OPENAI
2024-01-03 12:06:53,633 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-01-03 12:06:53,637 - INFO - API call successful. Model: gpt-4, Provider: OPENAI
2024-01-03 12:07:08,869 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-01-03 12:07:08,873 - INFO - API call successful. Model: gpt-4, Provider: OPENAI
2024-01-03 12:07:32,150 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-01-03 12:07:32,174 - INFO - API call successful. Model: gpt-4, Provider: OPENAI
2024-01-03 12:07:58,147 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-01-03 12:07:58,154 - INFO 

## This is what the response table looks like

In [4]:
# Bare last expression so the notebook renders the responses table returned by pipeline.run_pipeline().
df_responses

Unnamed: 0,model,original_prompt,response,temperature,actual_prompt,run_number,similarity_score,response_embedding,keyword_score,llm_rating,keywords
0,gpt-3.5-turbo-0301,What is the powerhouse of the cell and how doe...,The powerhouse of the cell is referred to as t...,0.983085,What is the powerhouse of the cell and how doe...,0,0.894511,"[0.023045534268021584, -0.2238667607307434, 0....",0.714286,0.9,"[mitochondrion, mitochondria, atp, adenosine t..."
1,gpt-3.5-turbo-0301,What is the powerhouse of the cell and how doe...,The powerhouse of the cell is the mitochondria...,0.413742,What is the powerhouse of the cell and how doe...,1,0.917254,"[0.0028413068503141403, -0.1677742451429367, 0...",0.5,0.9,"[mitochondrion, mitochondria, atp, adenosine t..."
2,gpt-3.5-turbo-0301,What is the powerhouse of the cell and how doe...,"Yes, the cell's power station is called the mi...",0.290401,- Can you describe the cell's power station an...,2,0.89678,"[-0.007078335154801607, -0.21583737432956696, ...",0.571429,0.9,"[mitochondrion, mitochondria, atp, adenosine t..."
3,gpt-3.5-turbo-0301,What is the basic unit of life and what does t...,The basic unit of life is the cell. This means...,0.215484,What is the basic unit of life and what does t...,0,0.933406,"[0.01846599206328392, -0.30291128158569336, 0....",0.133333,0.8,"[cell, biology, prokaryotic cells, eukaryotic ..."
4,gpt-3.5-turbo-0301,What is the basic unit of life and what does t...,The essential unit of life is the cell. Cells ...,0.405299,- What is the essential unit of life and what ...,1,0.814959,"[-0.03705814108252525, -0.2848859429359436, 0....",0.266667,0.8,"[cell, biology, prokaryotic cells, eukaryotic ..."
5,gpt-3.5-turbo-0301,What is the basic unit of life and what does t...,"As an AI language model, I can help you answer...",0.560118,- Can you identify the primary unit of life an...,2,0.815886,"[0.05861984193325043, -0.30094394087791443, 0....",0.333333,0.8,"[cell, biology, prokaryotic cells, eukaryotic ..."
6,gpt-3.5-turbo-0301,What do genes consist of and how do they work?,The constituents of genes are DNA (deoxyribonu...,0.649924,- What are the constituents of genes and how d...,0,0.772437,"[-0.06517020612955093, 0.046741340309381485, 0...",0.461538,0.9,"[genes, dna, deoxyribonucleic acid, nucleotide..."
7,gpt-3.5-turbo-0301,What do genes consist of and how do they work?,Genes are segments of DNA that contain the ins...,0.424485,- What constitutes genes and how do they perfo...,1,0.802081,"[-0.11767256259918213, -0.059758543968200684, ...",0.307692,0.9,"[genes, dna, deoxyribonucleic acid, nucleotide..."
8,gpt-3.5-turbo-0301,What do genes consist of and how do they work?,Genes are segments of DNA (deoxyribonucleic ac...,0.325109,- What makes up genes and how do they function?,2,0.868422,"[-0.032052263617515564, -0.003288829233497381,...",0.769231,1.0,"[genes, dna, deoxyribonucleic acid, nucleotide..."
9,gpt-3.5-turbo-0301,Which organelle contains the cell's futurogeni...,The organelle that contains the cell's future ...,0.84846,- Can you specify the organelle that contains ...,0,0.498125,"[-0.03843337297439575, 0.00022915206500329077,...",0.0,0.2,"[chronocytum, futurogenic material, temporal a..."


## This shows how many times each prompt ran and which run produced the best score

In [None]:
def analyze_best_scores(df_responses):
    """Report, per model and prompt, which run scored best on each metric.

    Args:
        df_responses: DataFrame with the columns 'model', 'original_prompt',
            'run_number', 'similarity_score', 'keyword_score' and 'llm_rating'.

    Returns:
        DataFrame with one row per (model, prompt) pair and the columns
        'model', 'prompt', 'best_run_similarity_score',
        'best_run_keyword_score', 'best_run_llm_rating' and 'total_runs'.
        Each 'best_run_*' value is the run_number with the highest score for
        that metric (the first such run on ties, None if no best run could be
        determined); 'total_runs' is the number of distinct run numbers.
    """
    methods = ['similarity_score', 'keyword_score', 'llm_rating']
    group_keys = ['model', 'original_prompt']

    # Highest score of each metric per (model, prompt, run).
    max_scores = df_responses.groupby(group_keys + ['run_number'])[methods].max()

    # idxmax returns the full (model, prompt, run_number) index label of the
    # best run for each metric within every (model, prompt) group.
    best_runs = max_scores.groupby(level=[0, 1]).idxmax()

    def _run_number(label):
        # A usable label is an index tuple; anything else (e.g. NaN) means
        # no best run was found for that group.
        return label[2] if isinstance(label, tuple) else None

    # Series.map is used column by column because DataFrame.map only exists
    # in pandas >= 2.1 (it was called applymap before that).
    best_run_info = pd.DataFrame(
        {f'best_run_{method}': best_runs[method].map(_run_number) for method in methods}
    )

    # Number of distinct runs per (model, prompt); aligned on the shared index.
    best_run_info['total_runs'] = (
        df_responses.groupby(group_keys)['run_number'].nunique()
    )

    return best_run_info.reset_index().rename(columns={'original_prompt': 'prompt'})

# Build the per-prompt best-run summary, then leave it as the last expression so the notebook renders it.
df_best_scores = analyze_best_scores(df_responses)
df_best_scores


## This shows the best response for each prompt by similarity score

In [None]:
# aggregate_best_scores is not defined in this notebook; presumably it comes from the `functions` star import — verify there what it returns.
aggregate_best_scores(df_responses, "similarity_score")

## This shows the best response for each prompt by keyword score

In [None]:
# aggregate_best_scores is not defined in this notebook; presumably it comes from the `functions` star import — verify there what it returns.
aggregate_best_scores(df_responses, 'keyword_score')

## This shows the best response for each prompt by LLM rating

In [None]:
# aggregate_best_scores is not defined in this notebook; presumably it comes from the `functions` star import — verify there what it returns.
aggregate_best_scores(df_responses, 'llm_rating')

## This shows the spread of response scores by prompt and method

In [None]:
def calculate_spread(df_responses):
    """Summarise the highest and lowest score of each metric per model and prompt.

    Args:
        df_responses: DataFrame with the columns 'model', 'original_prompt',
            'similarity_score', 'keyword_score' and 'llm_rating'.

    Returns:
        DataFrame with one row per (model, original_prompt) pair and the
        columns 'model', 'original_prompt' plus '<metric>_max' and
        '<metric>_min' for each of the three metrics.
    """
    score_columns = ['similarity_score', 'keyword_score', 'llm_rating']

    # Request the maximum and minimum of every score column.
    stats_per_column = {}
    for column in score_columns:
        stats_per_column[column] = ['max', 'min']

    per_prompt = df_responses.groupby(['model', 'original_prompt'])
    summary = per_prompt.agg(stats_per_column).reset_index()

    # The aggregation yields two-level column labels such as
    # ('similarity_score', 'max'); the group keys carry an empty second level.
    flat_names = []
    for label in summary.columns.values:
        if label[1]:
            flat_names.append('_'.join(label).strip())
        else:
            flat_names.append(label[0])
    summary.columns = flat_names

    return summary

# Render the per-model, per-prompt max/min score table for all responses.
calculate_spread(df_responses)

### Example Graph: Max Scores by Run

In [None]:
def create_cumulative_max_graph(df, eval_metrics):
    """Plot the running maximum of each metric against the run number.

    Args:
        df: DataFrame with a 'run_number' column and one column per entry of
            eval_metrics. It is copied first, so the caller's frame is left
            unchanged.
        eval_metrics: Names of the score columns to plot.

    Returns:
        None. The figure is displayed with fig.show().
    """
    # Work on a private copy so the running maxima never leak into the input.
    running = df.copy()

    # Replace each metric by its running maximum. cummax follows the frame's
    # existing row order; no sorting by run_number happens here.
    for name in eval_metrics:
        running.loc[:, name] = running[name].cummax()

    # One line per metric, taking colours from plotly's "Bold" palette and
    # starting over if there are more metrics than colours.
    palette = itertools.cycle(px.colors.qualitative.Bold)
    fig = go.Figure()
    for name in eval_metrics:
        trace = go.Scatter(
            x=running['run_number'],
            y=running[name],
            mode='lines+markers',
            name=name,
            line=dict(color=next(palette)),
        )
        fig.add_trace(trace)

    # Plain white background with black axis lines and no grid.
    fig.update_layout(
        title='Cumulative Max Scores by Run Number',
        xaxis_title='Run Number',
        yaxis_title='Cumulative Max Score',
        legend_title='Metrics',
        plot_bgcolor='white',
        xaxis=dict(showline=True, showgrid=False, linecolor='black'),
        yaxis=dict(showline=True, showgrid=False, linecolor='black'),
        font=dict(size=16),
    )

    fig.show()
# List of evaluation metrics
eval_metrics = ['similarity_score', 'keyword_score', 'llm_rating']

# NOTE(review): despite the name `first_prompt`, this keeps every row whose
# prompt matches the row at position 10 of df_responses, which need not be the
# first prompt and requires at least 11 rows — confirm which prompt is intended.
selected_prompt = df_responses['original_prompt'].iloc[10]
first_prompt = df_responses.loc[df_responses['original_prompt'] == selected_prompt]

# Create and show the plot
create_cumulative_max_graph(first_prompt, eval_metrics)
