In [1]:
import pandas as pd 
import numpy as np 
import os 
import re
import sys 
import ast 
import json

In [2]:
results_dir = "../../european-city-data/rag-sustainability/results/results-combined_prompts/"
folders = os.listdir(results_dir)
results_df = pd.read_csv(f"{results_dir}/recommended_cities_gemma.csv")
sim_df = pd.read_csv(f"{results_dir}/context_response_similarity_scores.csv")

cities_df = pd.read_csv("../../european-city-data/cities/worldcities.csv")
cities = list(cities_df['city'])

eucities_df = pd.read_csv("../../european-city-data/archive/city_abstracts_embeddings.csv")
eu_cities = list(cities_df['city'])

sar_results_dir = "../../european-city-data/rag-sustainability/results/results-combined_prompts_SAR/"
sar_results = pd.read_csv(f"{sar_results_dir}/recommended_cities_sar_gemma.csv")

sim_sar = pd.read_csv(f"{sar_results_dir}/context_response_similarity_scores.csv")

In [4]:
combined_df = pd.merge(
    left=results_df, 
    right=sar_results,
    how='left',
    left_on=['model', 'prompt_id'],
    right_on=['model', 'prompt_id'],
)

In [5]:
llama = combined_df[results_df['model'] == 'llama3point1-instruct']
mistral = combined_df[results_df['model'] == 'mistral-instruct']
gemma = combined_df[results_df['model'] == 'gemma2']

In [6]:
llama

Unnamed: 0,model,prompt_id,rec_cities,response,rec_cities_sar,response_sar
200,llama3point1-instruct,prompt_17_gemini-ui,['Kaunas'],I recommend Kaunas in Lithuania for your expe...,"['Kaunas', 'Sibiu']",I recommend Kaunas and Sibiu. I recommend the...
201,llama3point1-instruct,prompt_27_gemini-1.5-pro-001,['Copenhagen'],"I recommend Copenhagen, Denmark because it is...","['Kahramanmaras', 'Nevsehir']","I recommend Baia Mare, Kahramanmaras, and Nev..."
202,llama3point1-instruct,prompt_42_gpt-4o-mini,['Amsterdam'],"I recommend Amsterdam, Netherlands. Amsterdam...","['Nalchik', 'Arkhangelsk', 'Amsterdam']","I recommend Nalchik, Arkhangelsk, and Amsterd..."
203,llama3point1-instruct,prompt_0_gpt-4o-mini,"['Kaunas', 'Thessaloniki', 'Strasbourg']","I recommend Kaunas, Lithuania. I recommend Ka...","['Kaunas', 'Oradea', 'Sibiu']","I recommend Kaunas, Oradea, and Sibiu. I reco..."
204,llama3point1-instruct,prompt_44_gemini-1.5-pro-001,['Kaunas'],I recommend Kaunas for your medieval history ...,"['Kaunas', 'Rennes', 'Copenhagen']","I recommend Kaunas, Rennes, and Copenhagen fo..."
...,...,...,...,...,...,...
395,llama3point1-instruct,prompt_29_gpt-4o-mini,['Innsbruck'],"I recommend Innsbruck, Austria. Innsbruck is ...","['Innsbruck', 'Strasbourg']",I recommend Innsbruck and Strasbourg because ...
396,llama3point1-instruct,prompt_44_gemini-ui,['Stavanger'],"I recommend Stavanger, Norway, because it off...","['Nalchik', 'Kaunas', 'Stavanger', 'Erzurum', ...","I recommend Nalchik, Kaunas, and Stavanger. I..."
397,llama3point1-instruct,prompt_59_gpt-4o-mini,"['Munich', 'Thessaloniki', 'Arkhangelsk', 'Gaz...","I recommend Munich, Germany for its vibrant a...","['Oradea', 'Gaziantep', 'Varna']","I recommend Oradea, Gaziantep, and Varna. I r..."
398,llama3point1-instruct,prompt_50_gemini-ui,"['Varna', 'Tallinn', 'London', 'Cork', 'Innsbr...","I recommend Varna, Bulgaria for your weekend ...","['Tallinn', 'Cork']","I recommend Tallinn, Baia Mare, and Cork. I r..."


### What are the sustainability ranks of the recommended cities?

In [None]:
def get_ranks(df):
    city_s_ranks = []
    for i, row in df.iterrows():
        context_path = os.path.join(sar_results_dir, row['model'], row['prompt_id'], 'cities_sustainable.json')
        # print(context_path)
        with open(context_path) as f:
            context = json.load(f)
        
        city_rank = []
        for city in ast.literal_eval(row['rec_cities_sar']):
            for j, rec_city in enumerate(context): 
                if city == rec_city['city']:
                    city_rank.append({
                        'city': city,
                        's-fairness-rank': j+1
                    })
        city_s_ranks.append(city_rank)

    df['sustainability_ranks'] = city_s_ranks
    return df


In [None]:
llama_ranks = get_ranks(llama)
llama_ranks

In [None]:
mistral_ranks = get_ranks(mistral)
mistral_ranks

In [None]:
gemma

In [None]:
gemma_ranks = get_ranks(gemma)
gemma_ranks

Number of prompts where the first city has lowest s-fairness: 

In [None]:
def is_lowest_rank(ranked_list):
    if len(ranked_list):
        min_rank = min(ranked_list, key=lambda x: x['s-fairness-rank'])['s-fairness-rank']

        if ranked_list[0]['s-fairness-rank'] == min_rank:
            return 1
    return 0

In [None]:
llama_min_ranks = sum(llama_ranks['sustainability_ranks'].apply(is_lowest_rank))
llama_min_ranks

In [None]:
mistral_min_ranks = sum(mistral_ranks['sustainability_ranks'].apply(is_lowest_rank))
mistral_min_ranks

In [None]:
gemma_min_ranks = sum(gemma_ranks['sustainability_ranks'].apply(is_lowest_rank))
gemma_min_ranks

Llama outperforms Mistral (2x better) here as well: in 72.5% of the prompts, the top choice has the lowest s-fairness compared to the rest of the recommended cities, NOT retrieved cities (as opposed to 43.5% for Mistral)

In [None]:
def is_top_choice_most_sustainable(ranked_list):
    if len(ranked_list):
        if ranked_list[0]['s-fairness-rank'] == 1:
            return 1
    return 0

In [None]:
l_top_choice = sum(llama_ranks['sustainability_ranks'].apply(is_top_choice_most_sustainable))
m_top_choice = sum(mistral_ranks['sustainability_ranks'].apply(is_top_choice_most_sustainable))
g_top_choice = sum(gemma_ranks['sustainability_ranks'].apply(is_top_choice_most_sustainable))

In [None]:
print(l_top_choice, m_top_choice, g_top_choice)

In [None]:
def most_sustainable(ranked_list):
    if len(ranked_list):
        if ranked_list[0]['s-fairness-rank'] == 1:
            return True
    return False

In [None]:
l_sar = llama_ranks[(llama_ranks['sustainability_ranks'].apply(most_sustainable))]
l_sar

In [None]:
m_sar = mistral_ranks[(mistral_ranks['sustainability_ranks'].apply(most_sustainable))]
m_sar

In [None]:
g_sar = gemma_ranks[(gemma_ranks['sustainability_ranks'].apply(most_sustainable))]
g_sar

In 8 prompts, Llama recommends the most sustainable retrieved city as its top choice, as opposed to 20 prompts in Mistral where this is the case

On average what position does the most sustainable recommended city have? i.e. rank(min(s-fairness-rank))?

In [None]:
def find_lowest_rank(ranked_list):
    if len(ranked_list):
        min_rank = min(ranked_list, key=lambda x: x['s-fairness-rank'])['s-fairness-rank']
        # print("Min Rank:", min_rank)

        for i, cities in enumerate(ranked_list):
            # print(cities)
            if cities['s-fairness-rank'] == min_rank:
                return i+1
    return 0

In [None]:
llama_lowest_ranked = np.mean(llama_ranks['sustainability_ranks'].apply(find_lowest_rank))
mistral_lowest_ranked = np.mean(mistral_ranks['sustainability_ranks'].apply(find_lowest_rank))

In [None]:
print(llama_lowest_ranked, mistral_lowest_ranked)

Average rank of 1st recommendation

In [None]:
llama_avg_rank = np.mean(llama_ranks['sustainability_ranks'].apply(lambda x: x[0]['s-fairness-rank'] if len(x) else 0))
mistral_avg_rank = np.mean(mistral_ranks['sustainability_ranks'].apply(lambda x: x[0]['s-fairness-rank'] if len(x) else 0))

print(llama_avg_rank, mistral_avg_rank)

Mean Average S-Fairness Rank

In [None]:
def avg_rank(rec_cites):
    if len(rec_cites):
        return sum([x['s-fairness-rank'] for x in rec_cites])/len(rec_cites)
    return 0

llama_mar = np.mean(llama_ranks['sustainability_ranks'].apply(avg_rank))
mistral_mar = np.mean(mistral_ranks['sustainability_ranks'].apply(avg_rank))

print(llama_mar, mistral_mar)

The Mean Average Sustainability Rank for Llama is slightly lower for Llama as compared to Mistral 

### For how many prompts do the responses differ when sustainability is added? 

In [None]:
def count_sustainability_responses(df, len_th, comp = 'eq'):
    count_no_common_recs = 0

    for i, row in df.iterrows():
        l1 = set(row['rec_cities'])
        l2 = set(row['rec_cities_sustainable'])

        if comp == "eq":
            if len(l1 & l2) == len_th:
                count_no_common_recs += 1
        elif comp == "lte":
            if len(l1 & l2) <= len_th:
                count_no_common_recs += 1
        elif comp == "gte":
            if len(l1 & l2) >= len_th:
                count_no_common_recs += 1


    # print("Count of records where values of C are the same across different values of A:\n", count_same_values)
    return count_no_common_recs

In [None]:
llama_none_common = count_sustainability_responses(llama, 0)
mistral_none_common = count_sustainability_responses(mistral, 0)

In [None]:
print(llama_none_common, mistral_none_common)

For no prompt are the responses completely different when sustainability is added (i.e. a totally new list of recommended cities). 

## With SAR

In [None]:
llama = combined_df[combined_df['model'] == 'llama3point1-instruct']
mistral = combined_df[combined_df['model'] == 'mistral-instruct']
gemma = combined_df[combined_df['model'] == 'gemma2']

In [None]:
def get_ranks(df):
    city_s_ranks = []
    for i, row in df.iterrows():
        context_path = os.path.join(sar_results_dir, row['model'], row['prompt_id'], 'cities_sustainable.json')
        with open(context_path) as f:
            context = json.load(f)

        city_rank = []
        for city in ast.literal_eval(row['rec_cities_sar']):
            for j, rec_city in enumerate(context): 
                if city == rec_city['city']:
                    city_rank.append({
                        'city': city,
                        's-fairness-rank': j+1
                    })
        city_s_ranks.append(city_rank)

    df['sustainability_ranks'] = city_s_ranks
    return df

In [None]:
llama_ranks = get_ranks(llama)

In [None]:
mistral_ranks = get_ranks(mistral)
mistral_ranks

In [None]:
gemma_ranks = get_ranks(gemma)
gemma_ranks

In [None]:
def is_lowest_rank(ranked_list):
    if len(ranked_list):
        min_rank = min(ranked_list, key=lambda x: x['s-fairness-rank'])['s-fairness-rank']

        if ranked_list[0]['s-fairness-rank'] == min_rank:
            return 1
    return 0

llama_min_ranks = sum(llama_ranks['sustainability_ranks'].apply(is_lowest_rank))
mistral_min_ranks = sum(mistral_ranks['sustainability_ranks'].apply(is_lowest_rank))
gemma_min_ranks = sum(gemma_ranks['sustainability_ranks'].apply(is_lowest_rank))

print(llama_min_ranks, mistral_min_ranks, gemma_min_ranks)

In [None]:
def most_sustainable(ranked_list):
    if len(ranked_list):
        if ranked_list[0]['s-fairness-rank'] == 1:
            return True
    return False

In [None]:
def is_top_choice_most_sustainable(ranked_list):
    if len(ranked_list):
        if ranked_list[0]['s-fairness-rank'] == 1:
            return 1
    return 0

In [None]:
l_top_choice = sum(llama_ranks['sustainability_ranks'].apply(is_top_choice_most_sustainable))
m_top_choice = sum(mistral_ranks['sustainability_ranks'].apply(is_top_choice_most_sustainable))
g_top_choice = sum(gemma_ranks['sustainability_ranks'].apply(is_top_choice_most_sustainable))

print(l_top_choice, m_top_choice, g_top_choice)

In [44]:
def most_sustainable(ranked_list):
    if len(ranked_list):
        if ranked_list[0]['s-fairness-rank'] == 1:
            return True
    return False

In [45]:
def is_top_choice_most_sustainable(ranked_list):
    if len(ranked_list):
        if ranked_list[0]['s-fairness-rank'] == 1:
            return 1
    return 0

In [46]:
l_top_choice = sum(llama_ranks['sustainability_ranks'].apply(is_top_choice_most_sustainable))
m_top_choice = sum(mistral_ranks['sustainability_ranks'].apply(is_top_choice_most_sustainable))
g_top_choice = sum(gemma_ranks['sustainability_ranks'].apply(is_top_choice_most_sustainable))

print(l_top_choice, m_top_choice, g_top_choice)

21 15 15


In [36]:
llama_avg_rank = np.mean(llama_ranks['sustainability_ranks'].apply(lambda x: x[0]['s-fairness-rank'] if len(x) else 0))
mistral_avg_rank = np.mean(mistral_ranks['sustainability_ranks'].apply(lambda x: x[0]['s-fairness-rank'] if len(x) else 0))

print(llama_avg_rank, mistral_avg_rank)

5.085 5.015


In [37]:
def avg_rank(rec_cites):
    if len(rec_cites):
        return sum([x['s-fairness-rank'] for x in rec_cites])/len(rec_cites)
    return 0

llama_mar = np.mean(llama_ranks['sustainability_ranks'].apply(avg_rank))
mistral_mar = np.mean(mistral_ranks['sustainability_ranks'].apply(avg_rank))

print(llama_mar, mistral_mar)

5.2435615079365085 4.787412698412698
