In [31]:
import pandas as pd 
import numpy as np 
import os 
import re
import sys 
import ast 

In [32]:
results_dir = "../../../european-city-data/rag-sustainability/results/results-combined_prompts/"
folders = os.listdir(results_dir)
results_df = pd.read_csv(f"{results_dir}/recommended_cities.csv")

cities_df = pd.read_csv("../../../european-city-data/cities/worldcities.csv")
cities = list(cities_df['city'])

eucities_df = pd.read_csv("../../../european-city-data/archive/city_abstracts_embeddings.csv")
eu_cities = list(cities_df['city'])

In [33]:
results_df.head()

Unnamed: 0,model,prompt_id,rec_cities,response,rec_cities_sustainable,response_sustainable
0,llama3point1-instruct,prompt_17_gemini-ui,['Kaunas'],I recommend Kaunas in Lithuania for your expe...,['Vitoria-Gasteiz'],I recommend Vitoria-Gasteiz because it is a c...
1,llama3point1-instruct,prompt_27_gemini-1.5-pro-001,['Copenhagen'],"I recommend Copenhagen, Denmark because it is...",['Copenhagen'],I recommend Copenhagen and why I recommended ...
2,llama3point1-instruct,prompt_42_gpt-4o-mini,['Amsterdam'],"I recommend Amsterdam, Netherlands. Amsterdam...","['Nalchik', 'Varna', 'Trabzon']","I recommend Nalchik, Russia because of its be..."
3,llama3point1-instruct,prompt_0_gpt-4o-mini,"['Kaunas', 'Greece', 'Strasbourg']","I recommend Kaunas, Lithuania. I recommend Ka...","['Kaunas', 'Oradea', 'Strasbourg', 'Greece', '...","I recommend Kaunas, Lithuania because it is a..."
4,llama3point1-instruct,prompt_44_gemini-1.5-pro-001,['Kaunas'],I recommend Kaunas for your medieval history ...,['Kaunas'],"I recommend Kaunas, Lithuania because it has ..."


In [34]:
# Compute the average number of cities from our database that have been recommended by each model 

llama = results_df[results_df['model'] == 'llama3point1-instruct']
mistral = results_df[results_df['model'] == 'mistral-instruct']

### How many cities does each model recommend on average? 

For reference: the prompt asked each model to recommend 3 cities to the user 

In [35]:
# Average number of cities per prompt - Llama

llama_avg_cities = np.mean(llama['rec_cities'].apply(lambda x: len(ast.literal_eval(x))))
print(llama_avg_cities)

llama_avg_cities_sus = np.mean(llama['rec_cities_sustainable'].apply(lambda x: len(ast.literal_eval(x))))
print(llama_avg_cities_sus)



2.875
2.7


In [36]:
# Average number of cities per prompt - Mistral

mistral_avg_cities = np.mean(mistral['rec_cities'].apply(lambda x: len(ast.literal_eval(x))))
print(mistral_avg_cities)

mistral_avg_cities_sus = np.mean(mistral['rec_cities_sustainable'].apply(lambda x: len(ast.literal_eval(x))))
print(mistral_avg_cities_sus)

6.66
5.98


Here, we see that Llama-3.1, which recommends 2.9 cities on average and 2.7 with sustainability (could be rounded up to 3) does a better job at sticking to the prompt requirements than Mistral, which recommends 6-7 cities on average

### How many out-of-context cities are recommended to the user? 

For reference: we have 160 European cities in our database 

In [46]:
def avg_city_not_in_db(rec_cities, ref_cities = eu_cities):
    size = len(ast.literal_eval(rec_cities))
    if size == 0: 
        # return 0
        return []
    
    cities_in_db = [city for city in ast.literal_eval(rec_cities) if city not in ref_cities]
    # cities_in_db = sum(1 for city in ast.literal_eval(rec_cities) if city not in ref_cities)
    return cities_in_db



# avg_llama = llama['rec_cities'].apply(avg_city_not_in_db)
# avg_llama_sus = llama['rec_cities_sustainable'].apply(avg_city_not_in_db)

avg_mistral = mistral['rec_cities'].apply(avg_city_not_in_db)
avg_mistral_sus = mistral['rec_cities_sustainable'].apply(avg_city_not_in_db)

# avg_table = np.array([avg_llama, avg_llama_sus, avg_mistral, avg_mistral_sus]).reshape(2, 2)
# print(avg_table)

print(avg_mistral)



200    []
201    []
202    []
203    []
204    []
       ..
395    []
396    []
397    []
398    []
399    []
Name: rec_cities, Length: 200, dtype: object


In [27]:
def flatten_list(cities_list):
     return set([city for cities in cities_list for city in ast.literal_eval(cities)])

def cities_not_in_db(cities_list):
    total_rec_cities = flatten_list(cities_list)
    cities_not_in_db = total_rec_cities - set(eu_cities)
    return cities_not_in_db

llama_db = cities_not_in_db(list(llama['rec_cities']))
llama_db_sus = cities_not_in_db(list(llama['rec_cities_sustainable']))

mistral_db = cities_not_in_db(list(mistral['rec_cities']))
mistral_db_sus = cities_not_in_db(list(mistral['rec_cities_sustainable']))

In [29]:
mistral_db_sus

set()

### Do both models recommend the same city? In how many cases do they differ?

In [52]:
sort_cities = lambda x: sorted(ast.literal_eval(x))

df = results_df.copy(deep=True)
df['rec_cities'] = df['rec_cities'].apply(sort_cities)
df['rec_cities_sustainable'] = df['rec_cities_sustainable'].apply(sort_cities)

def count_same_response(df, column):
# Pivot the DataFrame to compare values of C for different values of A
    pivot_table = df.pivot_table(index='prompt_id', columns='model', values=column, aggfunc=lambda x: x)
    # Count the number of records where the lists are the same across different values of A, considering list length
    # count_same_values = pivot_table.apply(lambda row: sum(
    #     len(row.iloc[0]) == len(x) and sorted(row.iloc[0]) == sorted(x) for x in row if pd.notna(x)), axis=1)
    
    count_same_values = pivot_table.apply(
        lambda row: sum(
            [sorted(row.iloc[0]) == sorted(x) for x in row if pd.notna(x) and len(row.iloc[0]) == len(x)]
        ), axis=1
    )


    print("Count of records where values of C are the same across different values of A:\n", count_same_values)
    return count_same_values


# count_rec = count_same_response(df, 'rec_cities')
# count_rec

In [55]:
pivot_table = df.pivot_table(index='prompt_id', columns='model', values='rec_cities', aggfunc=lambda x: x)
pivot_table

model,llama3point1-instruct,mistral-instruct
prompt_id,Unnamed: 1_level_1,Unnamed: 2_level_1
prompt_0_gemini-1.5-pro-001,[Innsbruck],[Armenia]
prompt_0_gemini-ui,[Innsbruck],"[Bergen, Kars, Kaunas, Plovdiv]"
prompt_0_gpt-4o-mini,"[Greece, Kaunas, Strasbourg]",[Sibiu]
prompt_10_gemini-1.5-pro-001,[Sarajevo],"[Australia, Bolivia, Colombia, Greece, Luxembo..."
prompt_10_gemini-ui,"[Hyde Park, London]","[Antalya, Phuket, Sibiu]"
...,...,...
prompt_8_gemini-ui,[Milan],[Barcelona]
prompt_8_gpt-4o-mini,[Bergen],[]
prompt_9_gemini-1.5-pro-001,"[Budapest, Luxembourg, Mykolaiv, Nicosia, Orad...","[Amsterdam, Antwerp, Bucharest]"
prompt_9_gemini-ui,[Varna],"[Antalya, Burgas, Copenhagen, Greece, Mykolaiv]"


In [56]:
pivot_table_sus = df.pivot_table(index='prompt_id', columns='model', values='rec_cities_sustainable', aggfunc=lambda x: x)
pivot_table_sus

model,llama3point1-instruct,mistral-instruct
prompt_id,Unnamed: 1_level_1,Unnamed: 2_level_1
prompt_0_gemini-1.5-pro-001,[Innsbruck],"[Bergen, Kaunas, Plovdiv]"
prompt_0_gemini-ui,[],"[Kars, Kaunas, Nalchik, Oslo, Plovdiv]"
prompt_0_gpt-4o-mini,"[Craiova, Gaziantep, Greece, Kaunas, Mykolaiv,...",[Sibiu]
prompt_10_gemini-1.5-pro-001,[Erzurum],"[Aalborg, Amsterdam, Barcelona, Berlin, Bogotá..."
prompt_10_gemini-ui,"[Islington, London]","[Albania, Antalya, Banja Luka, Belgrade, Budap..."
...,...,...
prompt_8_gemini-ui,[Milan],"[Baia Mare, Barcelona, Bratislava, Milan, Tall..."
prompt_8_gpt-4o-mini,[Nalchik],"[Bergen, Craiova, Maastricht, Plovdiv, Sibiu, ..."
prompt_9_gemini-1.5-pro-001,"[Budapest, Turku]","[Ankara, Australia, Barcelona, Beijing, Belgra..."
prompt_9_gemini-ui,"[Antalya, Burgas, Copenhagen]","[Amsterdam, Barcelona, Burgas, Copenhagen, Mun..."
