In [1]:
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error as mape

In [2]:
# Judge ratings from the Claude 3.5 Sonnet result file (one CSV per judge).
claude = pd.read_csv(
    filepath_or_buffer="../../../sustainable-trs-with-rag/european-city-data/rag-sustainability/results/results-combined_prompts/llm-judge/judged_cities_Claude3Point5Sonnet.csv",
)

In [3]:
# Judge ratings from the GPT-4 result file; preview the first five rows.
gpt = pd.read_csv(
    filepath_or_buffer="../../../sustainable-trs-with-rag/european-city-data/rag-sustainability/results/results-combined_prompts/llm-judge/judged_cities_GPT-4.csv",
)
gpt.head(5)

Unnamed: 0,prompt_id,model,prompt,response,response_sustainable,llm_judge_response,llm_judge_response_sustainable,total_rating,total_rating_sustainable
0,0_gpt-4o-mini,llama3point1-instruct,Suggest some places to visit during spring. I ...,"I recommend Kaunas, Lithuania. I recommend Ka...","I recommend Kaunas, Lithuania because it is a...",Feedback:::\nTotal rating: 8.5,Feedback:::\nTotal rating: 9.0,8.5,9.0
1,0_gpt-4o-mini,mistral-instruct,Suggest some places to visit during spring. I ...,1. I recommend Sibiu in Romania.\n\nSibiu is a...,1. I recommend Sibiu in Romania.\n\nSibiu is a...,Feedback:::\nTotal rating: 8.5,Feedback:::\nTotal rating: 8.0,8.5,8.0
2,1_gpt-4o-mini,llama3point1-instruct,"I'm planning a trip in July and enjoy beaches,...","I recommend Copenhagen, Denmark because it ha...",I recommend Copenhagen and why you recommende...,Feedback:::\nTotal rating: 9.0,Feedback:::\nTotal rating: 8.0,9.0,8.0
3,1_gpt-4o-mini,mistral-instruct,"I'm planning a trip in July and enjoy beaches,...",1. Option 1: The city of Thessaloniki in Greec...,1. I recommend Varna for your summer holiday i...,Feedback:::\nTotal rating: 8.5,Feedback:::\nTotal rating: 7.0,8.5,7.0
4,2_gpt-4o-mini,llama3point1-instruct,What are some good destinations for a family v...,I recommend Samsun as the best destination fo...,"I recommend Samsun, Turkey because it has a v...",Feedback:::\nTotal rating: 8.5,Feedback:::\nTotal rating: 6.0,8.5,6.0


In [4]:
def compute_mean(df):
    """Print the mean judge rating for each recommender model.

    Rows are split on the ``model`` column into ``llama3point1-instruct``
    and ``mistral-instruct``. For each group the mean of ``total_rating``
    and of ``total_rating_sustainable`` is printed, llama first.

    Args:
        df: DataFrame with ``model``, ``total_rating`` and
            ``total_rating_sustainable`` columns.

    Returns:
        None. The four means are written to stdout.
    """
    llama_rows = df.loc[df["model"].eq("llama3point1-instruct")]
    mistral_rows = df.loc[df["model"].eq("mistral-instruct")]

    # (label, rows, rating column) in the order the lines are printed.
    report = (
        ("\t llama results mean (Non sustainable)", llama_rows, "total_rating"),
        ("\t llama results mean (Sustainable)", llama_rows, "total_rating_sustainable"),
        ("\t Mistral results mean (Non sustainable)", mistral_rows, "total_rating"),
        ("\t Mistral results mean (Sustainable)", mistral_rows, "total_rating_sustainable"),
    )
    for label, rows, rating_column in report:
        # The column is looked up only when its line is printed.
        print(label, getattr(rows, rating_column).mean())
    
    
    

In [5]:
def compute_sd(df):
    """Print the standard deviation of the judge ratings for each model.

    Rows are split on the ``model`` column into ``llama3point1-instruct``
    and ``mistral-instruct``. For each group the ``.std()`` of
    ``total_rating`` and of ``total_rating_sustainable`` is printed.

    Args:
        df: DataFrame with ``model``, ``total_rating`` and
            ``total_rating_sustainable`` columns.

    Returns:
        None. The four standard deviations are written to stdout.
    """
    llama_results = df.loc[df["model"] == "llama3point1-instruct"]
    mistral_results = df.loc[df["model"] == "mistral-instruct"]

    # Labels say "SD": the previous text said "mean" although .std() is printed.
    print("\t llama results SD (Non sustainable)", llama_results.total_rating.std())
    print("\t llama results SD (Sustainable)", llama_results.total_rating_sustainable.std())

    print("\t Mistral results SD (Non sustainable)", mistral_results.total_rating.std())
    print("\t Mistral results SD (Sustainable)", mistral_results.total_rating_sustainable.std())
    
   

In [6]:
# Mean ratings per model, first with GPT-4 as the judge, then Claude.
print("Mean", "GPT-4 as a judge", sep="\n")
compute_mean(gpt)

print("Claude-3.5-sonnet as a judge")
compute_mean(claude)

Mean
GPT-4 as a judge
	 llama results mean (Non sustainable) 8.1625
	 llama results mean (Sustainable) 8.11
	 Mistral results mean (Non sustainable) 3.8525
	 Mistral results mean (Sustainable) 3.78
Claude-3.5-sonnet as a judge
	 llama results mean (Non sustainable) 6.296482412060302
	 llama results mean (Sustainable) 6.3335
	 Mistral results mean (Non sustainable) 3.056
	 Mistral results mean (Sustainable) 3.0275


In [7]:
# Rating spread per model, first with GPT-4 as the judge, then Claude.
print("SD", "GPT-4 as a judge", sep="\n")
compute_sd(gpt)

print("Claude-3.5-sonnet as a judge")
compute_sd(claude)

SD
GPT-4 as a judge
	 llama results mean (Non sustainable) 1.7806451741379878
	 llama results mean (Sustainable) 1.6290083921541174
	 Mistral results mean (Non sustainable) 2.7233392431288888
	 Mistral results mean (Sustainable) 2.5596599962656
Claude-3.5-sonnet as a judge
	 llama results mean (Non sustainable) 2.278032486275708
	 llama results mean (Sustainable) 2.1422356588597933
	 Mistral results mean (Non sustainable) 2.4549332959241856
	 Mistral results mean (Sustainable) 2.4847259537676654


In [8]:
# Show the column labels of the GPT-4 judge frame as a plain list.
list(gpt.columns)

['prompt_id',
 'model',
 'prompt',
 'response',
 'response_sustainable',
 'llm_judge_response',
 'llm_judge_response_sustainable',
 'total_rating',
 'total_rating_sustainable']

In [12]:
def compute_mape(combined):
    """Print the MAPE between the two judges' ratings, per recommender model.

    Rows are split on the ``model`` column into ``llama3point1-instruct``
    and ``mistral-instruct``. For each group ``mape`` is called twice, with
    the ``*_gpt4`` column as its first argument and the matching
    ``*_claude`` column as its second: once for ``total_rating`` and once
    for ``total_rating_sustainable``.

    Args:
        combined: DataFrame with ``model``, ``total_rating_gpt4``,
            ``total_rating_claude``, ``total_rating_sustainable_gpt4`` and
            ``total_rating_sustainable_claude`` columns.

    Returns:
        None. The four scores are written to stdout.
    """
    llama_rows = combined.loc[combined["model"] == "llama3point1-instruct"]
    mistral_rows = combined.loc[combined["model"] == "mistral-instruct"]

    # Every score is computed before the first line is printed.
    llama_plain = mape(llama_rows.total_rating_gpt4, llama_rows.total_rating_claude)
    llama_sustainable = mape(
        llama_rows.total_rating_sustainable_gpt4,
        llama_rows.total_rating_sustainable_claude,
    )
    mistral_plain = mape(mistral_rows.total_rating_gpt4, mistral_rows.total_rating_claude)
    mistral_sustainable = mape(
        mistral_rows.total_rating_sustainable_gpt4,
        mistral_rows.total_rating_sustainable_claude,
    )

    # The leading "\n" on the first Mistral label separates the two groups.
    report = [
        ("\t LLama MAPE (Non Sustainable)", llama_plain),
        ("\t LLama MAPE (Sustainable)", llama_sustainable),
        ("\n\t Mistral MAPE (Non Sustainable)", mistral_plain),
        ("\t Mistral MAPE (Sustainable)", mistral_sustainable),
    ]
    for label, score in report:
        print(label, score)

In [14]:
# Join keys plus the two rating columns kept from each judge's frame.
columns = ["prompt_id", "model", "total_rating", "total_rating_sustainable"]

# On a re-run the rating columns have already been renamed below, so the raw
# selection is skipped explicitly rather than by swallowing a KeyError. Each
# frame is checked on its own so one cannot block trimming the other.
if set(columns).issubset(gpt.columns):
    gpt = gpt[columns]
if set(columns).issubset(claude.columns):
    claude = claude[columns]

# Tag the rating columns with their judge so both sets survive the merge.
# rename() skips labels that are absent, so this is a no-op on a re-run.
gpt = gpt.rename(
    columns={
        "total_rating": "total_rating_gpt4",
        "total_rating_sustainable": "total_rating_sustainable_gpt4",
    }
)
claude = claude.rename(
    columns={
        "total_rating": "total_rating_claude",
        "total_rating_sustainable": "total_rating_sustainable_claude",
    }
)

# Left join keeps every GPT-4 row; rows without a Claude match get NaN.
# NOTE(review): fillna(0) scores missing ratings as 0 instead of excluding
# them, which feeds into any downstream error metric - confirm intended.
combined = pd.merge(gpt, claude, on=["prompt_id", "model"], how="left").fillna(0)
combined

Unnamed: 0,prompt_id,model,total_rating_gpt4,total_rating_sustainable_gpt4,total_rating_claude,total_rating_sustainable_claude
0,0_gpt-4o-mini,llama3point1-instruct,8.5,9.0,7.5,7.5
1,0_gpt-4o-mini,mistral-instruct,8.5,8.0,5.5,4.5
2,1_gpt-4o-mini,llama3point1-instruct,9.0,8.0,6.5,3.5
3,1_gpt-4o-mini,mistral-instruct,8.5,7.0,6.5,5.5
4,2_gpt-4o-mini,llama3point1-instruct,8.5,6.0,5.5,3.5
...,...,...,...,...,...,...
395,54_gemini-1.5-pro-001,mistral-instruct,0.0,3.0,0.0,3.5
396,55_gemini-1.5-pro-001,llama3point1-instruct,9.5,8.0,8.5,5.5
397,55_gemini-1.5-pro-001,mistral-instruct,2.0,4.0,2.0,2.0
398,56_gemini-1.5-pro-001,llama3point1-instruct,3.0,2.0,2.5,1.5


In [15]:
compute_mape(combined)

	 LLama MAPE (Non Sustainable) 0.25757350554931824
	 LLama MAPE (Sustainable) 0.24001360372084057

	 Mistral MAPE (Non Sustainable) 0.3465437958584243
	 Mistral MAPE (Sustainable) 0.4222785661835584
