In [30]:
import pandas as pd 
import numpy as np 
import os 
import re
import sys 
import ast 

In [31]:
# Location of the combined-prompts experiment outputs.
results_dir = "../../../european-city-data/rag-sustainability/results/results-combined_prompts/"
folders = os.listdir(results_dir)
# os.path.join avoids the doubled "/" the f-string produced (results_dir already ends with one).
results_df = pd.read_csv(os.path.join(results_dir, "recommended_cities.csv"))

# Full world-cities table and the plain list of its city names.
cities_df = pd.read_csv("../../../european-city-data/cities/worldcities.csv")
cities = list(cities_df['city'])

# City abstracts/embeddings table; its city names are the reference set used by
# the "not in db" helpers further down.
eucities_df = pd.read_csv("../../../european-city-data/archive/city_abstracts_embeddings.csv")
# Fix: this list was previously built from cities_df, which made eu_cities identical
# to `cities` and left eucities_df unused.
# NOTE(review): assumes city_abstracts_embeddings.csv exposes a 'city' column — confirm.
eu_cities = list(eucities_df['city'])

In [32]:
# Preview the first rows of the loaded results to check the available columns.
results_df.head()

Unnamed: 0,model,prompt_id,rec_cities,response,rec_cities_sustainable,response_sustainable
0,llama3point1-instruct,prompt_17_gemini-ui,['Kaunas'],I recommend Kaunas in Lithuania for your expe...,['Vitoria-Gasteiz'],I recommend Vitoria-Gasteiz because it is a c...
1,llama3point1-instruct,prompt_27_gemini-1.5-pro-001,['Copenhagen'],"I recommend Copenhagen, Denmark because it is...",['Copenhagen'],I recommend Copenhagen and why I recommended ...
2,llama3point1-instruct,prompt_42_gpt-4o-mini,['Amsterdam'],"I recommend Amsterdam, Netherlands. Amsterdam...","['Nalchik', 'Varna', 'Kahramanmaras']","I recommend Nalchik, Russia because of its be..."
3,llama3point1-instruct,prompt_0_gpt-4o-mini,"['Kaunas', 'Thessaloniki', 'Strasbourg']","I recommend Kaunas, Lithuania. I recommend Ka...","['Kaunas', 'Oradea', 'Strasbourg', 'Thessaloni...","I recommend Kaunas, Lithuania because it is a..."
4,llama3point1-instruct,prompt_44_gemini-1.5-pro-001,['Kaunas'],I recommend Kaunas for your medieval history ...,['Kaunas'],"I recommend Kaunas, Lithuania because it has ..."


In [33]:
# Split the results by model; the per-model statistics are computed in later cells.

llama = results_df.loc[results_df['model'].eq('llama3point1-instruct')]
mistral = results_df.loc[results_df['model'].eq('mistral-instruct')]

### How many cities does each model recommend on average? 

For reference: the prompt asked each model to recommend 3 cities to the user 

In [34]:
# Llama: mean length of the parsed recommendation lists per prompt,
# first for the base column and then for the sustainable column.

llama_avg_cities = llama['rec_cities'].map(lambda raw: len(ast.literal_eval(raw))).mean()
print(llama_avg_cities)

llama_avg_cities_sus = llama['rec_cities_sustainable'].map(lambda raw: len(ast.literal_eval(raw))).mean()
print(llama_avg_cities_sus)



2.825
2.74


In [35]:
# Mistral: mean length of the parsed recommendation lists per prompt,
# first for the base column and then for the sustainable column.

mistral_avg_cities = mistral['rec_cities'].map(lambda raw: len(ast.literal_eval(raw))).mean()
print(mistral_avg_cities)

mistral_avg_cities_sus = mistral['rec_cities_sustainable'].map(lambda raw: len(ast.literal_eval(raw))).mean()
print(mistral_avg_cities_sus)

2.58
3.04


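Llama 3.1 recommends slightly more cities per prompt than Mistral on the base prompts (2.83 vs. 2.58), whereas Mistral recommends more on the sustainability prompts (3.04 vs. 2.74). Apart from Mistral with sustainability, every average falls short of the expected 3 recommended cities per prompt.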

### How many out-of-context cities are recommended to the user? 

**TODO:** How can this be computed without ambiguity?

In [36]:
def avg_city_not_in_db(rec_cities, ref_cities = eu_cities):
    """Return the recommended cities that are absent from ``ref_cities``.

    Despite the ``avg_`` prefix, this returns a list of names, not an average.

    Parameters
    ----------
    rec_cities : str
        String form of a Python list literal (parsed with ``ast.literal_eval``).
    ref_cities : container of str, optional
        Reference names to test membership against; defaults to the
        module-level ``eu_cities`` captured when the function is defined.

    Returns
    -------
    list
        Entries of the parsed list not found in ``ref_cities``, in their
        original order; empty when the parsed list is empty.
    """
    recommended = ast.literal_eval(rec_cities)

    missing = []
    for city in recommended:
        if city not in ref_cities:
            missing.append(city)
    return missing



# # avg_llama = llama['rec_cities'].apply(avg_city_not_in_db)
# # avg_llama_sus = llama['rec_cities_sustainable'].apply(avg_city_not_in_db)

# avg_mistral = mistral['rec_cities'].apply(avg_city_not_in_db)
# avg_mistral_sus = mistral['rec_cities_sustainable'].apply(avg_city_not_in_db)

# # avg_table = np.array([avg_llama, avg_llama_sus, avg_mistral, avg_mistral_sus]).reshape(2, 2)
# # print(avg_table)

# print(avg_mistral)



In [37]:
def flatten_list(cities_list):
    """Parse every stringified list in ``cities_list`` and merge them into one set.

    Note that the result is a ``set`` (duplicates removed, no ordering),
    even though the function is called ``flatten_list``.
    """
    unique_cities = set()
    for raw in cities_list:
        unique_cities.update(ast.literal_eval(raw))
    return unique_cities

def cities_not_in_db(cities_list):
    """Return the set of distinct recommended cities that are not in ``eu_cities``.

    ``cities_list`` is an iterable of stringified lists; it is flattened with
    ``flatten_list`` and compared against the module-level ``eu_cities``.
    """
    return flatten_list(cities_list).difference(eu_cities)

# llama_db = cities_not_in_db(list(llama['rec_cities']))
# llama_db_sus = cities_not_in_db(list(llama['rec_cities_sustainable']))

# mistral_db = cities_not_in_db(list(mistral['rec_cities']))
# mistral_db_sus = cities_not_in_db(list(mistral['rec_cities_sustainable']))

### Count of empty lists for cities => none of the retrieved cities from the context were recommended => model has hallucinated

In [38]:
def count_empty(x):
    """Return 1 if the stringified list ``x`` parses to an empty list, otherwise 0."""
    return int(len(ast.literal_eval(x)) == 0)


# Number of prompts per model/variant whose recommendation list is empty.
llama_empty = sum(llama['rec_cities'].apply(count_empty))
llama_empty_sus = sum(llama['rec_cities_sustainable'].apply(count_empty))

mistral_empty = sum(mistral['rec_cities'].apply(count_empty))
mistral_empty_sus = sum(mistral['rec_cities_sustainable'].apply(count_empty))

In [39]:
# Rows: Llama, Mistral. Columns: without sustainability, with sustainability.
table_empty_count = np.array([
    [llama_empty, llama_empty_sus],
    [mistral_empty, mistral_empty_sus],
])
table_empty_count

array([[ 1,  2],
       [28, 18]])

Mistral is far worse than Llama in this regard: according to the table above, it hallucinates (recommends no city from the retrieved context) in 28/200 prompts without sustainability and 18/200 prompts with sustainability, whereas about 99% of Llama's responses include at least one city from the retrieved context (only 1 and 2 empty responses, respectively).

While hallucination decreases in Mistral when sustainability is added (28 → 18 prompts), it increases in Llama (1 → 2 prompts), although the Llama counts are very small. What could be a possible reason?