# Generate Comparison Report

![OVERALL FLOW PROCESS](../images/workflow.png)

In this notebook, we compare all evaluation metrics, including cost and latency, for the source and target models and generate the final report.

### Parameters

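Define the source and target models to compare. These are short model keys (for example `openai` or `claude`); each key is mapped to its model ID when the reports are built.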

In [1]:
import sys

sys.path.append("../src/")
from config import *

# Short model keys for the two sides of the comparison. Each key picks the
# model ID inside the report helpers and the "<key>.csv" evaluation files.
src_model, target_model = "openai", "claude"


### Libraries

Import basic libraries

In [2]:
from pricing import *
from config import *
import pandas as pd

## Generate RAGAS Report

In [3]:
def get_report_ragas(model, data):
    """Summarise one model's RAGAS evaluation results as a one-row DataFrame.

    Parameters
    ----------
    model : str
        Short model key. "mistral", "claude" and "meta" select the matching
        ``*_MODEL_ID`` constant; any other value falls back to
        ``OPENAI_MODEL_ID``.
    data : pandas.DataFrame
        Per-question results. Must contain the columns ``answer_precision``,
        ``answer_recall``, ``answer_correctness``, ``answer_similarity``,
        ``input_tokens``, ``output_tokens`` and the four ``latency_meta_*``
        columns summed below. The frame is only read, never modified.

    Returns
    -------
    pandas.DataFrame
        A single row with the model key, the model ID, the mean and median of
        every metric, the mean token counts, the average cost and the
        mean/median total latency.
    """
    model_id = OPENAI_MODEL_ID
    if model == "mistral":
        model_id = MISTRAL_MODEL_ID
    elif model == "claude":
        model_id = CLAUDE_MODEL_ID
    elif model == "meta":
        model_id = META_MODEL_ID

    metrics = (
        "answer_precision",
        "answer_recall",
        "answer_correctness",
        "answer_similarity",
    )

    # Insertion order defines the column order of the report: all means first,
    # then all medians, then tokens, cost and latency.
    rec = {"model": model, "model_id": model_id}
    for name in metrics:
        rec[f"{name} (mean)"] = data[name].mean()
    for name in metrics:
        rec[f"{name} (median)"] = data[name].median()
    rec["input_tokens(mean)"] = data["input_tokens"].mean()
    rec["output_tokens(mean)"] = data["output_tokens"].mean()

    # NOTE(review): output tokens are priced with calculate_input_price, the
    # same helper used for input tokens. If the pricing module provides a
    # separate output-token rate, the cost below is likely off - confirm.
    rec["average_cost (USD)"] = calculate_input_price(
        rec["input_tokens(mean)"], model_id
    ) + calculate_input_price(rec["output_tokens(mean)"], model_id)

    # Per-row end-to-end latency, kept in a local Series so the caller's
    # DataFrame does not silently gain a "total_latency" column.
    total_latency = data[
        [
            "latency_meta_time",
            "latency_meta_kwd",
            "latency_meta_comb",
            "latency_meta_ans_gen",
        ]
    ].sum(axis=1)
    rec["latency (mean)"] = total_latency.mean()
    rec["latency (median)"] = total_latency.median()

    return pd.DataFrame([rec])

### Specify source evaluation results

In [4]:
# Per-question RAGAS results for the source model; the file name ends in the
# source model key.
df_src_ragas = pd.read_csv(
    f"../outputs/evaluation_reports/ragas/ragas_eval_{src_model}.csv",
    encoding="UTF-8",
)
df_src_ragas.head()

Unnamed: 0,doc_name,doc_link,doc_period,question_type,question,ground_truths,evidence_text,page_number,llm_answer,llm_contexts,latency_meta_time,latency_meta_kwd,latency_meta_comb,latency_meta_ans_gen,input_tokens,output_tokens,answer_precision,answer_recall,answer_correctness,answer_similarity
0,3M_2018_10K,https://investors.3m.com/financials/sec-filing...,2018,metrics-generated,What is the FY2018 capital expenditure amount ...,"[""['$1577.00']""]",Table of Contents \n3M Company and Subsidiarie...,60,The Fiscal Year 2018 capital expenditure amoun...,['<<Paragraph>> [Source File: 3M_2018_10K] \n ...,4.99046,1.629424,4.226844,5.914828,26221,280,0.333333,1.0,0.24687,0.160406
1,3M_2022_10K,https://investors.3m.com/financials/sec-filing...,2022,domain-relevant,Is 3M a capital-intensive business based on FY...,"[""['No, the company is managing its CAPEX and ...",3M Company and Subsidiaries\n Consolidated Sta...,485052,"Yes, 3M is a capital-intensive business based ...","[""<<Paragraph>> [Source File: 3M_2022_10K] \n ...",3.524989,1.346899,4.587813,9.920022,29931,335,0.0,0.0,0.177496,0.354993
2,ADOBE_2022_10K,https://www.adobe.com/pdf-page.html?pdfTarget=...,2022,domain-relevant,Does Adobe have an improving operating margin ...,"[""['No the operating margins of Adobe have rec...",ADOBE INC.\nCONSOLIDATED STATEMENTS OF INCOME\...,54,Adobe's operating margin profile for fiscal ye...,"[""<<Paragraph>> [Source File: AMD_2022_10K] \n...",3.909886,1.441947,4.783492,18.509396,32576,543,0.571429,0.0,0.39065,0.352729
3,ADOBE_2022_10K,https://www.adobe.com/pdf-page.html?pdfTarget=...,2022,novel-generated,Does Adobe have an improving Free cashflow con...,"[""['Yes, the FCF conversion (using net income ...",ADOBE INC.\n CONSOLIDATED STATEMENTS OF CASH F...,57,Adobe's Free Cash Flow (FCF) conversion for fi...,['<<Paragraph>> [Source File: 3M_2022_10K] \n ...,3.856029,1.393259,4.001557,14.289113,22079,445,0.666667,0.0,0.646074,0.625482
4,AMD_2015_10K,https://ir.amd.com/sec-filings/filter/annual-f...,2015,metrics-generated,Answer the following question as if you are an...,"[""['4.2%']""]",ITEM 8.\nFINANCIAL STATEMENTS AND SUPPLEMENTAR...,5660,To calculate the FY2015 Depreciation and Amort...,['<<Paragraph>> [Source File: 3M_2022_10K] \n ...,3.839135,2.39158,5.642941,16.71771,24301,601,0.333333,0.0,0.162183,0.324365


In [5]:
# One-row RAGAS summary for the source model.
src_report_ragas = get_report_ragas(model=src_model, data=df_src_ragas)
src_report_ragas

Unnamed: 0,model,model_id,answer_precision (mean),answer_recall (mean),answer_correctness (mean),answer_similarity (mean),answer_precision (median),answer_recall (median),answer_correctness (median),answer_similarity (median),input_tokens(mean),output_tokens(mean),average_cost (USD),latency (mean),latency (median)
0,openai,gpt-4-turbo,0.365476,0.4,0.341078,0.364299,0.333333,0.0,0.231155,0.353861,25558.8,400.0,0.259588,22.139643,21.45984


### Specify target evaluation results

In [6]:
# Per-question RAGAS results for the target model; the file name ends in the
# target model key.
df_dest_ragas = pd.read_csv(
    f"../outputs/evaluation_reports/ragas/ragas_eval_{target_model}.csv",
    encoding="UTF-8",
)
df_dest_ragas.head()

Unnamed: 0,doc_name,doc_link,doc_period,question_type,question,ground_truths,evidence_text,page_number,llm_answer,llm_contexts,latency_meta_time,latency_meta_kwd,latency_meta_comb,latency_meta_ans_gen,input_tokens,output_tokens,answer_precision,answer_recall,answer_correctness,answer_similarity
0,3M_2018_10K,https://investors.3m.com/financials/sec-filing...,2018,metrics-generated,What is the FY2018 capital expenditure amount ...,"[""['$1577.00']""]",Table of Contents \n3M Company and Subsidiarie...,60,According to the cash flow statement in the 3M...,['<<Paragraph>> [Source File: 3M_2018_10K] \n ...,0.927055,0.606658,1.448762,2.483714,21147,401,0.0,1.0,0.168175,0.336351
1,3M_2022_10K,https://investors.3m.com/financials/sec-filing...,2022,domain-relevant,Is 3M a capital-intensive business based on FY...,"[""['No, the company is managing its CAPEX and ...",3M Company and Subsidiaries\n Consolidated Sta...,485052,Based on the financial information provided in...,['<<Paragraph>> [Source File: 3M_2022_10K] \n ...,0.714294,0.503733,1.938756,5.232772,23180,635,0.0,0.0,0.184868,0.369736
2,ADOBE_2022_10K,https://www.adobe.com/pdf-page.html?pdfTarget=...,2022,domain-relevant,Does Adobe have an improving operating margin ...,"[""['No the operating margins of Adobe have rec...",ADOBE INC.\nCONSOLIDATED STATEMENTS OF INCOME\...,54,Based on the financial information provided in...,['<<Paragraph>> [Source File: ADOBE_2022_10K] ...,0.774023,3.742355,1.554743,4.361795,13324,1038,0.2,0.0,0.306414,0.412829
3,ADOBE_2022_10K,https://www.adobe.com/pdf-page.html?pdfTarget=...,2022,novel-generated,Does Adobe have an improving Free cashflow con...,"[""['Yes, the FCF conversion (using net income ...",ADOBE INC.\n CONSOLIDATED STATEMENTS OF CASH F...,57,Based on the financial information provided in...,['<<Paragraph>> [Source File: ADOBE_2022_10K] ...,0.891798,0.576164,1.379593,3.781731,12957,526,0.0,0.0,0.198792,0.397583
4,AMD_2015_10K,https://ir.amd.com/sec-filings/filter/annual-f...,2015,metrics-generated,Answer the following question as if you are an...,"[""['4.2%']""]",ITEM 8.\nFINANCIAL STATEMENTS AND SUPPLEMENTAR...,5660,According to the details in the Profit and Los...,['<<Paragraph>> [Source File: AMD_2015_10K] \n...,0.821045,0.420084,1.686061,2.92591,24266,494,0.0,0.0,0.146506,0.293012


In [7]:
# One-row RAGAS summary for the target model.
dest_report_ragas = get_report_ragas(model=target_model, data=df_dest_ragas)
dest_report_ragas

Unnamed: 0,model,model_id,answer_precision (mean),answer_recall (mean),answer_correctness (mean),answer_similarity (mean),answer_precision (median),answer_recall (median),answer_correctness (median),answer_similarity (median),input_tokens(mean),output_tokens(mean),average_cost (USD),latency (mean),latency (median)
0,claude,anthropic.claude-3-haiku-20240307-v1:0,0.186667,0.45,0.268005,0.349344,0.0,0.25,0.19183,0.353043,18693.3,613.6,0.004827,7.555861,6.935292


In [8]:
# Stack the source and target summaries into one comparison table.
# ignore_index=True relabels the rows 0..n-1; without it both rows keep the
# label 0 from their one-row frames, so label-based lookups are ambiguous.
final_report_ragas = pd.concat(
    [src_report_ragas, dest_report_ragas], ignore_index=True
)
final_report_ragas

Unnamed: 0,model,model_id,answer_precision (mean),answer_recall (mean),answer_correctness (mean),answer_similarity (mean),answer_precision (median),answer_recall (median),answer_correctness (median),answer_similarity (median),input_tokens(mean),output_tokens(mean),average_cost (USD),latency (mean),latency (median)
0,openai,gpt-4-turbo,0.365476,0.4,0.341078,0.364299,0.333333,0.0,0.231155,0.353861,25558.8,400.0,0.259588,22.139643,21.45984
0,claude,anthropic.claude-3-haiku-20240307-v1:0,0.186667,0.45,0.268005,0.349344,0.0,0.25,0.19183,0.353043,18693.3,613.6,0.004827,7.555861,6.935292


## Generate DeepEval Report

In [9]:
def get_report_deepeval(model, data):
    """Summarise one model's DeepEval evaluation results as a one-row DataFrame.

    Parameters
    ----------
    model : str
        Short model key. "mistral", "claude" and "meta" select the matching
        ``*_MODEL_ID`` constant; any other value falls back to
        ``OPENAI_MODEL_ID``.
    data : pandas.DataFrame
        Per-question results. Must contain the columns ``answer_relevance``,
        ``faithfulness``, ``bias``, ``toxicity``, ``input_tokens``,
        ``output_tokens`` and the four ``latency_meta_*`` columns summed
        below. The frame is only read, never modified.

    Returns
    -------
    pandas.DataFrame
        A single row with the model key, the model ID, the mean and median of
        every metric, the mean token counts, the average cost and the
        mean/median total latency.
    """
    model_id = OPENAI_MODEL_ID
    if model == "mistral":
        model_id = MISTRAL_MODEL_ID
    elif model == "claude":
        model_id = CLAUDE_MODEL_ID
    elif model == "meta":
        model_id = META_MODEL_ID

    metrics = ("answer_relevance", "faithfulness", "bias", "toxicity")

    # Insertion order defines the column order of the report: all means first,
    # then all medians, then tokens, cost and latency.
    rec = {"model": model, "model_id": model_id}
    for name in metrics:
        rec[f"{name} (mean)"] = data[name].mean()
    for name in metrics:
        rec[f"{name} (median)"] = data[name].median()
    rec["input_tokens(mean)"] = data["input_tokens"].mean()
    rec["output_tokens(mean)"] = data["output_tokens"].mean()

    # NOTE(review): output tokens are priced with calculate_input_price, the
    # same helper used for input tokens. If the pricing module provides a
    # separate output-token rate, the cost below is likely off - confirm.
    rec["average_cost (USD)"] = calculate_input_price(
        rec["input_tokens(mean)"], model_id
    ) + calculate_input_price(rec["output_tokens(mean)"], model_id)

    # Per-row end-to-end latency, kept in a local Series so the caller's
    # DataFrame does not silently gain a "total_latency" column.
    total_latency = data[
        [
            "latency_meta_time",
            "latency_meta_kwd",
            "latency_meta_comb",
            "latency_meta_ans_gen",
        ]
    ].sum(axis=1)
    rec["latency (mean)"] = total_latency.mean()
    rec["latency (median)"] = total_latency.median()

    return pd.DataFrame([rec])

### Specify source evaluation results

In [10]:
# Per-question DeepEval results for the source model; the file name ends in
# the source model key.
src_eval_df_deepeval = pd.read_csv(
    f"../outputs/evaluation_reports/deepeval/deepeval_{src_model}.csv",
    encoding="UTF-8",
)
src_eval_df_deepeval.head()

Unnamed: 0,doc_name,doc_link,doc_period,question_type,question,ground_truths,evidence_text,page_number,llm_answer,llm_contexts,latency_meta_time,latency_meta_kwd,latency_meta_comb,latency_meta_ans_gen,input_tokens,output_tokens,answer_relevance,faithfulness,bias,toxicity
0,3M_2018_10K,https://investors.3m.com/financials/sec-filing...,2018,metrics-generated,What is the FY2018 capital expenditure amount ...,['$1577.00'],Table of Contents \n3M Company and Subsidiarie...,60,The Fiscal Year 2018 capital expenditure amoun...,['<<Paragraph>> [Source File: 3M_2018_10K] \n ...,4.99046,1.629424,4.226844,5.914828,26221,280,1.0,1.0,0.0,0.0
1,3M_2022_10K,https://investors.3m.com/financials/sec-filing...,2022,domain-relevant,Is 3M a capital-intensive business based on FY...,"['No, the company is managing its CAPEX and Fi...",3M Company and Subsidiaries\n Consolidated Sta...,485052,"Yes, 3M is a capital-intensive business based ...","[""<<Paragraph>> [Source File: 3M_2022_10K] \n ...",3.524989,1.346899,4.587813,9.920022,29931,335,1.0,1.0,0.0,0.0
2,ADOBE_2022_10K,https://www.adobe.com/pdf-page.html?pdfTarget=...,2022,domain-relevant,Does Adobe have an improving operating margin ...,['No the operating margins of Adobe have recen...,ADOBE INC.\nCONSOLIDATED STATEMENTS OF INCOME\...,54,Adobe's operating margin profile for fiscal ye...,"[""<<Paragraph>> [Source File: AMD_2022_10K] \n...",3.909886,1.441947,4.783492,18.509396,32576,543,1.0,1.0,0.0,0.0
3,ADOBE_2022_10K,https://www.adobe.com/pdf-page.html?pdfTarget=...,2022,novel-generated,Does Adobe have an improving Free cashflow con...,"['Yes, the FCF conversion (using net income as...",ADOBE INC.\n CONSOLIDATED STATEMENTS OF CASH F...,57,Adobe's Free Cash Flow (FCF) conversion for fi...,['<<Paragraph>> [Source File: 3M_2022_10K] \n ...,3.856029,1.393259,4.001557,14.289113,22079,445,0.428571,1.0,0.0,0.0
4,AMD_2015_10K,https://ir.amd.com/sec-filings/filter/annual-f...,2015,metrics-generated,Answer the following question as if you are an...,['4.2%'],ITEM 8.\nFINANCIAL STATEMENTS AND SUPPLEMENTAR...,5660,To calculate the FY2015 Depreciation and Amort...,['<<Paragraph>> [Source File: 3M_2022_10K] \n ...,3.839135,2.39158,5.642941,16.71771,24301,601,0.545455,1.0,0.0,0.0


In [11]:
# One-row DeepEval summary for the source model.
src_report_deepeval = get_report_deepeval(model=src_model, data=src_eval_df_deepeval)
src_report_deepeval

Unnamed: 0,model,model_id,answer_relevance (mean),faithfulness (mean),bias (mean),toxicity (mean),answer_relevance (median),faithfulness (median),bias (median),toxicity (median),input_tokens(mean),output_tokens(mean),average_cost (USD),latency (mean),latency (median)
0,openai,gpt-4-turbo,0.87518,1.0,0.0,0.0,1.0,1.0,0.0,0.0,25558.8,400.0,0.259588,22.139643,21.45984


### Specify target evaluation results

In [12]:
# Per-question DeepEval results for the target model; the file name ends in
# the target model key.
target_eval_df_deepeval = pd.read_csv(
    f"../outputs/evaluation_reports/deepeval/deepeval_{target_model}.csv",
    encoding="UTF-8",
)
target_eval_df_deepeval.head()

Unnamed: 0,doc_name,doc_link,doc_period,question_type,question,ground_truths,evidence_text,page_number,llm_answer,llm_contexts,latency_meta_time,latency_meta_kwd,latency_meta_comb,latency_meta_ans_gen,input_tokens,output_tokens,answer_relevance,faithfulness,bias,toxicity
0,3M_2018_10K,https://investors.3m.com/financials/sec-filing...,2018,metrics-generated,What is the FY2018 capital expenditure amount ...,['$1577.00'],Table of Contents \n3M Company and Subsidiarie...,60,According to the cash flow statement in the 3M...,['<<Paragraph>> [Source File: 3M_2018_10K] \n ...,0.927055,0.606658,1.448762,2.483714,21147,401,0.8,1.0,0.0,0.0
1,3M_2022_10K,https://investors.3m.com/financials/sec-filing...,2022,domain-relevant,Is 3M a capital-intensive business based on FY...,"['No, the company is managing its CAPEX and Fi...",3M Company and Subsidiaries\n Consolidated Sta...,485052,Based on the financial information provided in...,['<<Paragraph>> [Source File: 3M_2022_10K] \n ...,0.714294,0.503733,1.938756,5.232772,23180,635,0.75,1.0,0.0,0.0
2,ADOBE_2022_10K,https://www.adobe.com/pdf-page.html?pdfTarget=...,2022,domain-relevant,Does Adobe have an improving operating margin ...,['No the operating margins of Adobe have recen...,ADOBE INC.\nCONSOLIDATED STATEMENTS OF INCOME\...,54,Based on the financial information provided in...,['<<Paragraph>> [Source File: ADOBE_2022_10K] ...,0.774023,3.742355,1.554743,4.361795,13324,1038,0.666667,1.0,0.0,0.0
3,ADOBE_2022_10K,https://www.adobe.com/pdf-page.html?pdfTarget=...,2022,novel-generated,Does Adobe have an improving Free cashflow con...,"['Yes, the FCF conversion (using net income as...",ADOBE INC.\n CONSOLIDATED STATEMENTS OF CASH F...,57,Based on the financial information provided in...,['<<Paragraph>> [Source File: ADOBE_2022_10K] ...,0.891798,0.576164,1.379593,3.781731,12957,526,0.888889,1.0,0.0,0.0
4,AMD_2015_10K,https://ir.amd.com/sec-filings/filter/annual-f...,2015,metrics-generated,Answer the following question as if you are an...,['4.2%'],ITEM 8.\nFINANCIAL STATEMENTS AND SUPPLEMENTAR...,5660,According to the details in the Profit and Los...,['<<Paragraph>> [Source File: AMD_2015_10K] \n...,0.821045,0.420084,1.686061,2.92591,24266,494,0.8,1.0,0.0,0.0


In [13]:
# One-row DeepEval summary for the target model.
dest_report_deepeval = get_report_deepeval(
    model=target_model, data=target_eval_df_deepeval
)
dest_report_deepeval

Unnamed: 0,model,model_id,answer_relevance (mean),faithfulness (mean),bias (mean),toxicity (mean),answer_relevance (median),faithfulness (median),bias (median),toxicity (median),input_tokens(mean),output_tokens(mean),average_cost (USD),latency (mean),latency (median)
0,claude,anthropic.claude-3-haiku-20240307-v1:0,0.800556,1.0,0.0,0.0,0.8,1.0,0.0,0.0,18693.3,613.6,0.004827,7.555861,6.935292


In [14]:
# Stack the source and target summaries into one comparison table.
# ignore_index=True relabels the rows 0..n-1; without it both rows keep the
# label 0 from their one-row frames, so label-based lookups are ambiguous.
final_report_deepeval = pd.concat(
    [src_report_deepeval, dest_report_deepeval], ignore_index=True
)
final_report_deepeval

Unnamed: 0,model,model_id,answer_relevance (mean),faithfulness (mean),bias (mean),toxicity (mean),answer_relevance (median),faithfulness (median),bias (median),toxicity (median),input_tokens(mean),output_tokens(mean),average_cost (USD),latency (mean),latency (median)
0,openai,gpt-4-turbo,0.87518,1.0,0.0,0.0,1.0,1.0,0.0,0.0,25558.8,400.0,0.259588,22.139643,21.45984
0,claude,anthropic.claude-3-haiku-20240307-v1:0,0.800556,1.0,0.0,0.0,0.8,1.0,0.0,0.0,18693.3,613.6,0.004827,7.555861,6.935292


### Compare metrics, latency, and cost. Based on these factors, decide whether to migrate.