In [1]:
import pandas as pd 
import numpy as np 
import os 
import sys 
import re
import json 

In [2]:
gemini = pd.read_json("../../data/conv-trs/llm-results/Gemini1Point5Pro_generated_queries.json")
llama = pd.read_json("../../data/conv-trs/llm-results/Llama3Point2Vision90B_generated_parsed_queries.json")

In [3]:
sus_gemini = gemini[gemini['config_id'].str.contains("sustainable")]
sus_llama = llama[llama['config_id'].str.contains("sustainable")]

In [4]:
sus_gemini.head()

Unnamed: 0,config_id,config,context,city,query_v,query_p0,query_p1
3,c_p_0_pop_low_sustainable,"{'p_id': 'p_0', 'persona': 'A top-scoring play...","Craiova has low popularity , low budget, and g...","[Craiova, Gaziantep]",Low budget European city break in April with g...,"Off-season European city break, low budget, gr...",Best European cities for hockey fans with grea...
7,c_p_0_pop_medium_sustainable,"{'p_id': 'p_0', 'persona': 'A top-scoring play...","Aalborg has medium popularity , high budget, a...","[Aalborg, Astrakhan, Bari, Bremen, Cheboksary,...",Suggest a highly walkable European city for a ...,"European city break in July, walkable, high-en...",Best European cities for hockey fans with vibr...
11,c_p_0_pop_high_sustainable,"{'p_id': 'p_0', 'persona': 'A top-scoring play...","Zurich has high popularity , high budget, and ...","[Zurich, Warsaw, Vienna, Valencia, Toulouse, S...",Walkable European city with vibrant arts and c...,"European city break: high budget, walkable, vi...",Best European cities for hockey fans with grea...
15,c_p_1_pop_low_sustainable,"{'p_id': 'p_1', 'persona': 'A former DJ at WSU...","Adana has low popularity , low budget, and gre...","[Adana, Erzurum, Kayseri, Konya, Rivne, Sivas]",Suggest some off-the-beaten-path European citi...,"Best European cities for a budget-friendly, wa...","Best European cities for live music, especiall..."
19,c_p_1_pop_medium_sustainable,"{'p_id': 'p_1', 'persona': 'A former DJ at WSU...","Braga has medium popularity , medium budget, a...","[Braga, Brno, Coimbra]",Great walkable European city with medium budge...,Great walkable European city with a vibrant mu...,"Best European cities for live music, vibrant n..."


In [5]:
sus_features = {
        "seasonality": ("low", "low seasonality"),
        "walkability": ("great", "great walkability"),
        "aqi": ("great", "great air quality")
    }

In [6]:
def find_sus_feature(config):
    for key in config['filters'].keys():
        if key not in sus_features:
            continue
        return f"Looking for cities with {sus_features[key][1]}"
    
sus_gemini['sus_feature'] = sus_gemini['config'].apply(find_sus_feature)
sus_llama['sus_feature'] = sus_llama['config'].apply(find_sus_feature)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sus_gemini['sus_feature'] = sus_gemini['config'].apply(find_sus_feature)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sus_llama['sus_feature'] = sus_llama['config'].apply(find_sus_feature)


In [7]:
sus_llama.head()

Unnamed: 0,config_id,config,context,city,query_v,query_p0,query_p1,sus_feature
3,c_p_0_pop_low_sustainable,"{'p_id': 'p_0', 'persona': 'A top-scoring play...","Craiova has low popularity , low budget, and g...","[Craiova, Gaziantep]","""Low budget destinations in Europe with good a...","""Low budget European cities with great air qua...",Which European cities have ice hockey arenas a...,Looking for cities with great air quality
7,c_p_0_pop_medium_sustainable,"{'p_id': 'p_0', 'persona': 'A top-scoring play...","Aalborg has medium popularity , high budget, a...","[Aalborg, Astrakhan, Bari, Bremen, Cheboksary,...","""European cities with great walkability and hi...","""European cities with great walkability for a ...",Which European cities have ice hockey arenas a...,Looking for cities with great walkability
11,c_p_0_pop_high_sustainable,"{'p_id': 'p_0', 'persona': 'A top-scoring play...","Zurich has high popularity , high budget, and ...","[Zurich, Warsaw, Vienna, Valencia, Toulouse, S...","""Recommend European cities with great walkabil...","""European cities with great walkability and hi...",Which European cities have ice hockey arenas a...,Looking for cities with great walkability
15,c_p_1_pop_low_sustainable,"{'p_id': 'p_1', 'persona': 'A former DJ at WSU...","Adana has low popularity , low budget, and gre...","[Adana, Erzurum, Kayseri, Konya, Rivne, Sivas]","""Low budget European cities with great walkabi...","""Affordable European cities with great walkabi...",Which European cities offer a mix of music fes...,Looking for cities with great walkability
19,c_p_1_pop_medium_sustainable,"{'p_id': 'p_1', 'persona': 'A former DJ at WSU...","Braga has medium popularity , medium budget, a...","[Braga, Brno, Coimbra]","""Medium budget European cities with great walk...","""Medium budget, walkable European cities with ...",Which European cities offer a mix of music fes...,Looking for cities with great walkability


In [8]:
from sentence_transformers import SentenceTransformer, util

In [9]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device='cuda')

In [10]:
def compute_similarity(df, col, ref_col):
    col_embeddings = model.encode(df[col].tolist(), convert_to_tensor=True)
    ref_embeddings = model.encode(df[ref_col].tolist(), convert_to_tensor=True)
    similarities = util.cos_sim(col_embeddings, ref_embeddings)
    return [similarities[i, i].item() for i in range(len(df))]

In [11]:
for method in ['v', 'p0', 'p1']:
    sus_llama[f"sim_{method}"] = compute_similarity(sus_llama, f"query_{method}", "sus_feature")
    sus_gemini[f"sim_{method}"] = compute_similarity(sus_gemini, f"query_{method}", "sus_feature")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sus_llama[f"sim_{method}"] = compute_similarity(sus_llama, f"query_{method}", "sus_feature")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sus_gemini[f"sim_{method}"] = compute_similarity(sus_gemini, f"query_{method}", "sus_feature")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sus_llama[f"sim_{

In [12]:
for method in ['v', 'p0', 'p1']:
    print(f"[Llama] Average similarity score with the sustainability feature for query_{method}: {round(sus_llama[f'sim_{method}'].mean(), 3)}")
    print(f"[Gemini] Average similarity score with the sustainability feature for query_{method}: {round(sus_gemini[f'sim_{method}'].mean(), 3)}")

[Llama] Average similarity score with the sustainability feature for query_v: 0.52
[Gemini] Average similarity score with the sustainability feature for query_v: 0.524
[Llama] Average similarity score with the sustainability feature for query_p0: 0.465
[Gemini] Average similarity score with the sustainability feature for query_p0: 0.49
[Llama] Average similarity score with the sustainability feature for query_p1: 0.391
[Gemini] Average similarity score with the sustainability feature for query_p1: 0.386


[Llama] 
- Average similarity score with the sustainability feature for query_v: 0.52
- Average similarity score with the sustainability feature for query_p0: 0.465
- Average similarity score with the sustainability feature for query_p1: 0.391

[Gemini] 
- Average similarity score with the sustainability feature for query_v: 0.524
- Average similarity score with the sustainability feature for query_p0: 0.49
- Average similarity score with the sustainability feature for query_p1: 0.386

In [26]:
key_templates = {
            'popularity': "Looking for {value} popularity or {value} touristy destinations",
            'budget': "Looking for {value} budget options",
            'interests': "Interested in {value} activities",
            'month': "Suggest some cities to visit in {value}.",
            }

In [27]:
def calculate_similarity(query, preferences):
    pref_statements = [key_templates[pref].format(value=value) for pref,value in preferences.items()]
    query_embedding = model.encode(query, convert_to_tensor=True)
    pref_embeddings = model.encode(pref_statements, convert_to_tensor=True)
    
    similarities = util.cos_sim(query_embedding, pref_embeddings)[0]
    return {k: float(v) for k, v in zip(preferences.keys(), similarities)}

In [28]:
def get_non_sus_sim(df):
    non_sus_sim = {
        'v': [],
        'p0': [],
        'p1': [],
    }

    for i, row in df.iterrows():
        preferences = row['config']['filters']
        for key in sus_features.keys():
            if key in preferences: 
                del preferences[key]

        for method in ['v', 'p0', 'p1']:
            result = calculate_similarity(row[f'query_{method}'], preferences)
            avg = sum(result.values())/len(result)

            non_sus_sim[method].append(avg)
    
    for method in ['v', 'p0', 'p1']:
        df[f'nonsus_sim_{method}'] = non_sus_sim[method]
    
    return df

In [29]:
llama_sim = get_non_sus_sim(sus_llama)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'nonsus_sim_{method}'] = non_sus_sim[method]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'nonsus_sim_{method}'] = non_sus_sim[method]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'nonsus_sim_{method}'] = non_sus_sim[method]


In [30]:
gemini_sim  = get_non_sus_sim(sus_gemini)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'nonsus_sim_{method}'] = non_sus_sim[method]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'nonsus_sim_{method}'] = non_sus_sim[method]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'nonsus_sim_{method}'] = non_sus_sim[method]


In [36]:
from sklearn.metrics import mean_absolute_error

def print_mae(df):  
    for method in ['v', 'p0', 'p1']:
        print(f"Mean Absolute Error for Q_{method}: {round(mean_absolute_error(df[f'sim_{method}'], df[f'nonsus_sim_{method}']), 3)}")

In [37]:
print("MAE Scores for LLama")
print_mae(llama_sim)

MAE Scores for LLama
Mean Absolute Error for Q_v: 0.111
Mean Absolute Error for Q_p0: 0.107
Mean Absolute Error for Q_p1: 0.068


In [38]:
print("MAE Scores for Gemini")
print_mae(gemini_sim)

MAE Scores for Gemini
Mean Absolute Error for Q_v: 0.109
Mean Absolute Error for Q_p0: 0.111
Mean Absolute Error for Q_p1: 0.082


In [39]:
llama_sim.head()

Unnamed: 0,config_id,config,context,city,query_v,query_p0,query_p1,sus_feature,sim_v,sim_p0,sim_p1,nonsus_sim_v,nonsus_sim_p0,nonsus_sim_p1
3,c_p_0_pop_low_sustainable,"{'p_id': 'p_0', 'persona': 'A top-scoring play...","Craiova has low popularity , low budget, and g...","[Craiova, Gaziantep]","""Low budget destinations in Europe with good a...","""Low budget European cities with great air qua...",Which European cities have ice hockey arenas a...,Looking for cities with great air quality,0.583023,0.577222,0.458836,0.474281,0.38499,0.360006
7,c_p_0_pop_medium_sustainable,"{'p_id': 'p_0', 'persona': 'A top-scoring play...","Aalborg has medium popularity , high budget, a...","[Aalborg, Astrakhan, Bari, Bremen, Cheboksary,...","""European cities with great walkability and hi...","""European cities with great walkability for a ...",Which European cities have ice hockey arenas a...,Looking for cities with great walkability,0.592079,0.581419,0.4243,0.50335,0.352603,0.356358
11,c_p_0_pop_high_sustainable,"{'p_id': 'p_0', 'persona': 'A top-scoring play...","Zurich has high popularity , high budget, and ...","[Zurich, Warsaw, Vienna, Valencia, Toulouse, S...","""Recommend European cities with great walkabil...","""European cities with great walkability and hi...",Which European cities have ice hockey arenas a...,Looking for cities with great walkability,0.594798,0.490028,0.4243,0.450277,0.34509,0.332823
15,c_p_1_pop_low_sustainable,"{'p_id': 'p_1', 'persona': 'A former DJ at WSU...","Adana has low popularity , low budget, and gre...","[Adana, Erzurum, Kayseri, Konya, Rivne, Sivas]","""Low budget European cities with great walkabi...","""Affordable European cities with great walkabi...",Which European cities offer a mix of music fes...,Looking for cities with great walkability,0.628178,0.566541,0.363453,0.507183,0.425694,0.352346
19,c_p_1_pop_medium_sustainable,"{'p_id': 'p_1', 'persona': 'A former DJ at WSU...","Braga has medium popularity , medium budget, a...","[Braga, Brno, Coimbra]","""Medium budget European cities with great walk...","""Medium budget, walkable European cities with ...",Which European cities offer a mix of music fes...,Looking for cities with great walkability,0.635479,0.474711,0.363453,0.475159,0.453127,0.346198


In [40]:
gemini_sim.to_csv("../../data/conv-trs/eval/sustainability/gemini_similarity.csv", index=False)
llama_sim.to_csv("../../data/conv-trs/eval/sustainability/llama_similarity.csv", index=False)