In [1]:
import sklearn
import pandas as pd 
import numpy as np 
import os 
import re
import sys 
import ast 

In [16]:
# Directory holding the combined-prompts results; `folders` is its raw listing
# (path is relative to the notebook's working directory).
results_dir = "../../../european-city-data/rag-sustainability/results/results-combined_prompts/"
folders = os.listdir(results_dir)

# Directory holding the SAR results, plus the recommended-cities CSV stored in it.
sar_results_dir = "../../../european-city-data/rag-sustainability/results/results-combined_prompts_SAR/"
# os.path.join avoids the doubled "/" an f-string produces when the directory
# string already ends with a separator.
sar_results = pd.read_csv(os.path.join(sar_results_dir, "recommended_cities_sar.csv"))
sar_folders = os.listdir(sar_results_dir)

In [4]:
from nltk.corpus import stopwords
import nltk

In [5]:
# Run once (uncomment) if the NLTK stopword corpus is not installed locally:
# nltk.download('stopwords')

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string

# TODO: decide whether TF-IDF vectors are sufficient here, or whether to switch to embeddings - static (Word2Vec) or contextual (BERT).

def preprocess(text):
    """Normalise raw text into a space-separated string of content tokens.

    Steps, in order: lowercase the text, delete every character listed in
    ``string.punctuation``, delete runs of digits, split on whitespace, drop
    tokens that appear in NLTK's English stopword list, and re-join the
    survivors with single spaces.

    NOTE(review): punctuation is stripped *before* the stopword filter, so
    stopword entries that contain an apostrophe presumably can never match a
    token here - confirm whether that is intended.

    Args:
        text: The raw document as a string.

    Returns:
        The cleaned document as a single string.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = re.sub(r'\d+', '', text.lower().translate(punctuation_table))

    english_stopwords = set(stopwords.words('english'))
    kept = []
    for token in cleaned.split():
        if token in english_stopwords:
            continue
        kept.append(token)
    return ' '.join(kept)

def compute_similarity(paths):
    """Return the TF-IDF cosine similarity between the first two files in ``paths``.

    Every file in ``paths`` is read, preprocessed with ``preprocess`` and used
    to fit the TF-IDF vocabulary, but only the first document (row 0) is
    compared against the second (row 1).

    Args:
        paths: Sequence of text-file paths; callers in this notebook pass
            ``[response_path, context_path]``.

    Returns:
        The cosine similarity rounded to three decimal places.
    """
    documents = []
    for path in paths:
        # Context manager closes each handle promptly instead of leaving it
        # to the garbage collector.
        with open(path) as handle:
            documents.append(handle.read())

    preprocessed_docs = [preprocess(text) for text in documents]
    tfidf = TfidfVectorizer().fit_transform(preprocessed_docs)
    # Slicing with 0:1 / 1:2 keeps each operand as a 2-D single-row matrix,
    # which is the shape cosine_similarity expects.
    pairwise_similarity = cosine_similarity(tfidf[0:1], tfidf[1:2])

    return round(pairwise_similarity[0][0], 3)

# corpus = ["I'd like an apple", "An apple a day keeps the doctor away",]
# tfidf = TfidfVectorizer().fit_transform(corpus)
#     # no need to normalize, since Vectorizer will return normalized tf-idf
# pairwise_similarity = tfidf * tfidf.T
# print(round(pairwise_similarity[0,1], 3))

In [12]:
def get_sim_scores(folders, results_dir, sar=0):
    """Compute response/context similarity for every model and prompt folder.

    Walks ``results_dir/<model>/<prompt>/`` for each ``model`` in ``folders``
    and scores each response file against its context file with
    ``compute_similarity``.

    Args:
        folders: Entry names inside ``results_dir`` (e.g. from ``os.listdir``).
            Names containing ".csv" or "judge" are skipped, as is anything
            that is not a directory.
        results_dir: Directory that contains the model folders.
        sar: Falsy -> score both the plain (``response.txt`` / ``context.txt``)
            and the sustainable (``response_sustainable.txt`` /
            ``context_sustainable.txt``) pair. Truthy -> score only the
            sustainable pair and store it as ``context_response_sim_sar``.

    Returns:
        A DataFrame with one row per (model, prompt). Columns are ``model``,
        ``prompt_id`` and either ``context_response_sim`` +
        ``context_response_sim_sustainable`` or ``context_response_sim_sar``.
        The frame is empty (no columns) when nothing was scored.
    """
    sim_results = []
    for model in folders:
        if ".csv" in model or "judge" in model:
            continue

        model_dir = os.path.join(results_dir, model)
        # Stray files (.DS_Store, notes, ...) would make os.listdir raise
        # NotADirectoryError; skip them instead of aborting the whole run.
        if not os.path.isdir(model_dir):
            continue

        for prompt in os.listdir(model_dir):
            prompt_dir = os.path.join(model_dir, prompt)
            # Same guard one level down: only prompt folders hold result files.
            if not os.path.isdir(prompt_dir):
                continue

            sustainable_paths = [
                os.path.join(prompt_dir, "response_sustainable.txt"),
                os.path.join(prompt_dir, "context_sustainable.txt"),
            ]

            if sar:
                sim_results.append({
                    'model': model,
                    'prompt_id': prompt,
                    'context_response_sim_sar': compute_similarity(sustainable_paths),
                })
            else:
                plain_paths = [
                    os.path.join(prompt_dir, "response.txt"),
                    os.path.join(prompt_dir, "context.txt"),
                ]
                sim_score = compute_similarity(plain_paths)
                sim_score_sustainable = compute_similarity(sustainable_paths)

                sim_results.append({
                    'model': model,
                    'prompt_id': prompt,
                    'context_response_sim': sim_score,
                    'context_response_sim_sustainable': sim_score_sustainable,
                })

    return pd.DataFrame(sim_results)

In [13]:
# Score the response/context pairs under the SAR results directory (sar mode on).
sim_sar = get_sim_scores(folders=sar_folders, results_dir=sar_results_dir, sar=1)

In [14]:
# Preview the first five rows of the SAR similarity scores.
sim_sar.head(n=5)

Unnamed: 0,model,prompt_id,context_response_sim_sar
0,llama3point1-instruct,prompt_17_gemini-ui,0.153
1,llama3point1-instruct,prompt_27_gemini-1.5-pro-001,0.008
2,llama3point1-instruct,prompt_42_gpt-4o-mini,0.077
3,llama3point1-instruct,prompt_0_gpt-4o-mini,0.224
4,llama3point1-instruct,prompt_44_gemini-1.5-pro-001,0.105


In [15]:
# Write the scores into the SAR results directory; index=False keeps the row
# index out of the CSV. os.path.join avoids a doubled "/" in the path.
sim_sar.to_csv(os.path.join(sar_results_dir, "context_response_similarity_scores.csv"), index=False)