In [1]:
import pandas as pd 
import numpy as np 
import os 
import sys 
import re
import json 

In [2]:
METHODS = ['v', 'p0', 'p1']

In [3]:
gemini = pd.read_csv("../../data/conv-trs/eval/context_retrieval/gemini_retrieved_context.csv")
# llama = pd.read_csv("../../data/conv-trs/eval/context_retrieval/llama_retrieved_context.csv")

In [4]:
gemini.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2302 entries, 0 to 2301
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   config_id             2302 non-null   object
 1   original_context      2302 non-null   object
 2   cities                2302 non-null   object
 3   query_v               2302 non-null   object
 4   retrieved_cities_v    2302 non-null   object
 5   retrieved_context_v   2302 non-null   object
 6   query_p0              2302 non-null   object
 7   retrieved_cities_p0   2302 non-null   object
 8   retrieved_context_p0  2302 non-null   object
 9   query_p1              2302 non-null   object
 10  retrieved_cities_p1   2302 non-null   object
 11  retrieved_context_p1  2302 non-null   object
dtypes: object(12)
memory usage: 215.9+ KB


In [5]:
# llama.info()

In [6]:
def compute_recall(gt_cities, pred_cities):
    return len(set(gt_cities) & set(pred_cities))/len(set(gt_cities))

def compute_precision(gt_cities, pred_cities):
    return len(set(gt_cities) & set(pred_cities))/len(set(pred_cities))

In [7]:
def find_city_groundedness(df):
    recall = {
        'v': [],
        'p0': [],
        'p1': []
    }

    for index, row in df.iterrows():
        for method in METHODS: 
            recall[method].append(compute_recall(row['cities'], row[f'retrieved_cities_{method}']))

    for method in METHODS:
        print(f"Average recall for method {method}: {np.mean(recall[method])}")

# find_city_groundedness(llama)
find_city_groundedness(gemini)

Average recall for method v: 0.796691886431604
Average recall for method p0: 0.793619624594313
Average recall for method p1: 0.7862845148878322


In [8]:
from sentence_transformers import SentenceTransformer, util

In [9]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device='cuda')

In [10]:
def compute_similarity(df, col, ref_col):
    col_embeddings = model.encode(df[col].tolist(), convert_to_tensor=True)
    ref_embeddings = model.encode(df[ref_col].tolist(), convert_to_tensor=True)
    similarities = util.cos_sim(col_embeddings, ref_embeddings)
    return [similarities[i, i].item() for i in range(len(df))]

In [11]:
from evaluate import load
bertscore = load("bertscore")
# predictions = ["hello there. what's going on", "general kenobi"]
# references = ["hello there. my name is alice. what's up?", "general kenobi"]
# results = bertscore.compute(predictions=predictions, references=references, lang="en")

In [12]:
def compute_bert_score(df, ref, pred):
    references = df[ref]
    predictions = df[pred]  

    results = bertscore.compute(predictions=predictions, references=references, lang='en')
    return results 

In [13]:
def run(df):
    for method in METHODS:
        print(f"BERTScore Results for method {method}")
        res = compute_bert_score(df, 'original_context', f'retrieved_context_{method}')
        print(f"Average Precision: {np.mean(res['precision'])}")
        print(f"Average Recall: {np.mean(res['recall'])}")
        print(f"Average F1: {np.mean(res['f1'])}")
        print("-----------------------")

In [14]:
# run(llama)
run(gemini)

BERTScore Results for method v


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average Precision: 0.7881949932341157
Average Recall: 0.8068414525712293
Average F1: 0.7971131069363976
-----------------------
BERTScore Results for method p0
Average Precision: 0.7860391132993143
Average Recall: 0.8038690504719131
Average F1: 0.7945302904791464
-----------------------
BERTScore Results for method p1
Average Precision: 0.7778071371966295
Average Recall: 0.7936860843910329
Average F1: 0.7852899350361862
-----------------------


In [40]:
# gemini_sim.to_csv("../../data/conv-trs/eval/sustainability/gemini_similarity.csv", index=False)
# llama_sim.to_csv("../../data/conv-trs/eval/sustainability/llama_similarity.csv", index=False)