# Generate Comparison Report

In this notebook, we compare all evaluation metrics, including cost and latency, for the source and target models and generate the final report.

![CompareAndSelect.png](../images/CompareAndSelect.png)

### Parameters

Define the source and target models and the prompt variant used for each.

In [3]:
# Extend the module search path so the project's local modules under ../src/
# can be imported from this notebook (the path is relative to the kernel's
# working directory).
import sys

sys.path.append("../src/")
# Wildcard import: pulls every public name of `config` into the notebook
# namespace. NOTE(review): get_report() reads OPENAI_MODEL_ID, MISTRAL_MODEL_ID,
# CLAUDE_MODEL_ID and META_MODEL_ID — presumably they come from here; confirm.
from config import *

# Source side of the comparison: model key and prompt variant.
src_model, src_prompt = "openai", "raw"

# Target side of the comparison: model key and prompt variant.
# The labels are combined as "<prompt>_<model>" to locate each run's results file.
target_model, target_prompt = "anthropic", "optimized"
# Alternative: set target_prompt = "raw" to use the raw-prompt run for the target model.

### Libraries

Import basic libraries

In [5]:
from pricing import *
from config import *
import pandas as pd

In [6]:
def get_report(model, prompt, data):
    """Summarize one evaluation run as a single-row DataFrame.

    Args:
        model: Key of the evaluated model. "mistral", "anthropic" and "meta"
            select MISTRAL_MODEL_ID, CLAUDE_MODEL_ID and META_MODEL_ID
            respectively; any other value falls back to OPENAI_MODEL_ID.
        prompt: Label of the prompt variant; copied into the report unchanged.
        data: DataFrame of per-sample evaluation results. Must contain the
            columns "alignment", "coverage", "metric_summary_input_tokens",
            "metric_summary_output_tokens" and "metric_summary_latency".

    Returns:
        A one-row pandas DataFrame with the model and prompt labels, the mean
        and median of alignment and coverage, the summed input and output
        token counts, the estimated cost ("costs (USD)") and the mean and
        median latency.

    Raises:
        KeyError: If `data` lacks one of the required columns.
    """
    rec = {}
    rec["model"] = model
    rec["prompt"] = prompt
    rec["alignment (mean)"] = data["alignment"].mean()
    rec["coverage (mean)"] = data["coverage"].mean()
    rec["alignment (median)"] = data["alignment"].median()
    rec["coverage (median)"] = data["coverage"].median()
    rec["input_tokens"] = data["metric_summary_input_tokens"].sum()
    rec["output_tokens"] = data["metric_summary_output_tokens"].sum()

    # Resolve the model id used for pricing; unknown keys keep the OpenAI default.
    model_id = OPENAI_MODEL_ID
    if model == "mistral":
        model_id = MISTRAL_MODEL_ID
    elif model == "anthropic":
        model_id = CLAUDE_MODEL_ID
    elif model == "meta":
        model_id = META_MODEL_ID

    input_cost = calculate_input_price(rec["input_tokens"], model_id)
    # Output tokens used to be priced with the input-token helper, which bills
    # them at the input rate. Use the output-token helper when the notebook
    # namespace provides one (expected from `from pricing import *`); otherwise
    # keep the previous behavior so the cell still runs.
    # NOTE(review): confirm `pricing` exports calculate_output_price.
    price_output = globals().get("calculate_output_price", calculate_input_price)
    output_cost = price_output(rec["output_tokens"], model_id)
    rec["costs (USD)"] = input_cost + output_cost

    rec["latency (mean)"] = data["metric_summary_latency"].mean()
    rec["latency (median)"] = data["metric_summary_latency"].median()

    return pd.DataFrame([rec])

### Specify source evaluation results

In [7]:
# Results-file suffix for the source run: "<prompt>_<model>".
src_eval = "_".join([src_prompt, src_model])

# Read the source run's evaluation CSV and preview the first rows.
df = pd.read_csv(f"../outputs/call_summarization_eval_{src_eval}.csv", encoding="UTF-8")
df.head()

Unnamed: 0,transcripts,summary,alignment,coverage,overal_score,metric_summary_input_tokens,metric_summary_output_tokens,metric_summary_latency
0,"[""\nAgent: Good morning, thank you for calling...","Sarah, the customer, called SB Bank to inquire...",0.5,0.8,0.5,767,150,4.529721
1,"[""Agent: Good morning, thank you for calling S...","Sarah Thompson, a customer who applied for a c...",1.0,0.5,0.5,635,184,11.220361
2,"[""\nAgent: Good morning, thank you for calling...","The customer, Sarah, was affected by recent fl...",1.0,1.0,1.0,676,124,7.740885
3,"[""\nAgent: Good morning, thank you for calling...","The customer, Sarah, was incorrectly charged a...",0.666667,1.0,0.666667,680,146,8.956748
4,"[""\nAgent: Good morning, thank you for calling...",Sarah Thompson reported a fraudulent transacti...,0.75,1.0,0.75,850,160,4.660645


In [8]:
# Collapse the source run's per-sample results into a one-row summary.
src_report = get_report(model=src_model, prompt=src_prompt, data=df)
src_report

Unnamed: 0,model,prompt,alignment (mean),coverage (mean),alignment (median),coverage (median),input_tokens,output_tokens,costs (USD),latency (mean),latency (median)
0,mistral,raw,0.783333,0.86,0.75,1.0,3608,764,0.034976,7.421672,7.740885


### Specify target evaluation results

In [10]:
# Results-file suffix for the target run: "<prompt>_<model>".
target_eval = "_".join([target_prompt, target_model])

# Read the target run's evaluation CSV and preview the first rows.
# Note: this rebinds `df`, which previously held the source results.
df = pd.read_csv(f"../outputs/call_summarization_eval_{target_eval}.csv", encoding="UTF-8")
df.head()

Unnamed: 0,transcripts,summary,alignment,coverage,overal_score,metric_summary_input_tokens,metric_summary_output_tokens,metric_summary_latency
0,"\nAgent: Good morning, thank you for calling S...",The transcript summarizes a conversation betwe...,1.0,1.0,1.0,816,256,2.131492
1,"Agent: Good morning, thank you for calling SB ...",The summary of the transcript is that the cust...,1.0,0.5,0.5,678,256,2.66062
2,"\nAgent: Good morning, thank you for calling S...",The transcript summarizes a call between a cus...,0.666667,1.0,0.666667,701,256,2.843184
3,"\nAgent: Good morning, thank you for calling S...",The summary of the transcript is that the cust...,1.0,1.0,1.0,733,256,2.811654
4,"\nAgent: Good morning, thank you for calling S...",The summary of the transcript is that the cust...,0.5,1.0,0.5,880,238,2.3392


In [12]:
# Collapse the target run's per-sample results into a one-row summary.
dest_report = get_report(model=target_model, prompt=target_prompt, data=df)
dest_report

Unnamed: 0,model,prompt,alignment (mean),coverage (mean),alignment (median),coverage (median),input_tokens,output_tokens,costs (USD),latency (mean),latency (median)
0,anthropic,optimized,0.833333,0.9,1.0,1.0,3808,1262,0.01521,2.55723,2.66062


In [13]:
# Stack the source and target summaries into one comparison table.
# ignore_index=True renumbers the rows 0..n-1; without it both rows keep the
# label 0 from their one-row frames, which makes label-based lookups ambiguous.
final_report = pd.concat([src_report, dest_report], ignore_index=True)
final_report

Unnamed: 0,model,prompt,alignment (mean),coverage (mean),alignment (median),coverage (median),input_tokens,output_tokens,costs (USD),latency (mean),latency (median)
0,mistral,raw,0.783333,0.86,0.75,1.0,3608,764,0.034976,7.421672,7.740885
0,anthropic,optimized,0.833333,0.9,1.0,1.0,3808,1262,0.01521,2.55723,2.66062


### Compare coverage/alignment, latency, and cost. Based on these factors, decide whether to migrate.