In [51]:
import pandas as pd
import numpy as np

from scipy.stats import zscore
from sklearn.metrics import ndcg_score

from src.config import substitution_rating_file, preference_rating_file
from src.config import preference_rating_scores, preference_score_file

import warnings
warnings.filterwarnings('ignore')

In [52]:
# Names of every similarity-metric column handled in this notebook, grouped by family.
all_metrics = [
    # BLEU
    'BLEU-1', 'BLEU-2',
    # ROUGE
    'ROUGE-1', 'ROUGE-2', 'ROUGE-L',
    # BERTScore
    'P_BERT', 'R_BERT', 'F1_BERT',
    # hP / hR
    'hP-1', 'hP-2', 'hP-freq',
    'hR-1', 'hR-2', 'hR-freq',
    # hP-Sim / hR-Sim
    'hP-Sim-1', 'hP-Sim-2', 'hP-Sim-freq',
    'hR-Sim-1', 'hR-Sim-2', 'hR-Sim-freq',
]

# Raw CSV column name -> column name used throughout the notebook.
rename_cols = {
    'user': 'UID',
    'qn': 'model',
    'item_1': 'gt_item',
    'choice': 'rec_item',
}

# Ground truth & predictions

In [53]:
# Ground truth: the distinct (user, item_1) pairs of the substitution ratings,
# renamed to (UID, gt_item) via rename_cols.
cols = ['user', 'item_1']
df_gt = (
    pd.read_csv(substitution_rating_file)[cols]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns=rename_cols)
)
df_gt.head(2)

Unnamed: 0,UID,gt_item
0,1,Bottle Water - Water
1,1,Costco - Pepperoni Pizza


In [54]:
# Rated recommendations; after renaming the preview shows UID, model, rec_item, rating.
df_pred = (
    pd.read_csv(preference_rating_file)
    .rename(columns=rename_cols)
)
df_pred.head(2)

Unnamed: 0,UID,model,rec_item,rating
0,1,1,Jasmine Rice - Rice,4
1,1,1,Avocado,3


# BLEU: score for each recommendation
(the pairwise similarity between a recommended item and a ground-truth item is not available, due to the brevity penalty in BLEU)
    
- df_bleu: user, model, rating, recommended item, BLEU-1, BLEU-2 

In [55]:
# BLEU-1 / BLEU-2 scores stored per rated recommendation.
df_bleu = (
    pd.read_csv(preference_rating_scores['BLEU'])
    .rename(columns=rename_cols)
)
df_bleu.head(2)

Unnamed: 0,UID,model,rating,rec_item,BLEU-1,BLEU-2
0,1,1,4,Jasmine Rice - Rice,0.0,0.0
1,1,1,3,Avocado,0.0,0.0


# ROUGE - pairwise similarity

In [56]:
# ROUGE scores for each (ground-truth item, recommended item) pair.
# NOTE: `cols` is assigned here but not used by the load below.
cols = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L']
df_rouge = (
    pd.read_csv(preference_rating_scores['ROUGE'])
    .rename(columns=rename_cols)
)
df_rouge.head(2)

Unnamed: 0,UID,model,rating,gt_item,rec_item,ROUGE-1,ROUGE-2,ROUGE-L
0,1,1,4,Bottle Water - Water,Jasmine Rice - Rice,0.0,0.0,0.0
1,1,1,3,Bottle Water - Water,Avocado,0.0,0.0,0.0


# BERTScore - pairwise similarity

In [57]:
# BERTScore precision / recall / F1 for each (ground-truth item, recommended item) pair.
# NOTE: `cols` is assigned here but not used by the load below.
cols = ['P_BERT', 'R_BERT', 'F1_BERT']
df_bertscores = (
    pd.read_csv(preference_rating_scores['BERTScores'])
    .rename(columns=rename_cols)
)
df_bertscores.head(2)

Unnamed: 0,UID,model,gt_item,rec_item,P_BERT,R_BERT,F1_BERT
0,1,1,Bottle Water - Water,Jasmine Rice - Rice,0.288776,0.288825,0.289956
1,1,1,Bottle Water - Water,Avocado,0.107143,-0.078615,0.014121


# hP, hP_sim: score for each recommendation

In [58]:
# hP-1 / hP-2 / hP-freq scores, one row per rated recommendation.
df_hp = (
    pd.read_csv(preference_rating_scores['hP'])
    .rename(columns=rename_cols)
)
df_hp.head(2)

Unnamed: 0,UID,model,rec_item,rating,hP-1,hP-2,hP-freq
0,1,1,Jasmine Rice - Rice,4,0.333333,0.333333,0.324324
1,1,1,Avocado,3,0.0,0.0,0.0


In [59]:
# hP-Sim-1 / hP-Sim-2 / hP-Sim-freq scores, one row per rated recommendation.
df_hpsim = (
    pd.read_csv(preference_rating_scores['hP-Sim'])
    .rename(columns=rename_cols)
)
df_hpsim.head(2)

Unnamed: 0,UID,model,rec_item,rating,hP-Sim-1,hP-Sim-2,hP-Sim-freq
0,1,1,Jasmine Rice - Rice,4,0.753388,0.753388,0.740395
1,1,1,Avocado,3,0.259877,0.259877,0.268447


# hR, hR_sim: score for each ground truth

In [60]:
# hR-1 / hR-2 / hR-freq scores, one row per (user, model, ground-truth item).
df_hr = (
    pd.read_csv(preference_rating_scores['hR'])
    .rename(columns=rename_cols)
)
df_hr.head(2)

Unnamed: 0,UID,model,gt_item,hR-1,hR-2,hR-freq
0,1,1,Bottle Water - Water,0.0,0.0,0.0
1,1,2,Bottle Water - Water,0.0,0.0,0.0


In [61]:
# hR-Sim-1 / hR-Sim-2 / hR-Sim-freq scores, one row per (user, model, ground-truth item).
df_hrsim = (
    pd.read_csv(preference_rating_scores['hR-Sim'])
    .rename(columns=rename_cols)
)
df_hrsim.head(2)

Unnamed: 0,UID,model,gt_item,hR-Sim-1,hR-Sim-2,hR-Sim-freq
0,1,1,Bottle Water - Water,0.0,0.0,0.0
1,1,2,Bottle Water - Water,0.0,0.0,0.0


# Debugging

# User-model rating (240 rows)

In [65]:
import time
start = time.time()

# @10 scores: collapse every metric to one value per (UID, model) recommendation list.
cols1 = ['UID', 'model', 'rec_item']   # one group per recommended item
cols2 = ['UID', 'model']               # one group per user/model list

# BLEU is stored per recommendation -> average over the list.
# Use the groupby reducers directly: `.apply(np.mean)` collapses each group to a
# single scalar on pandas >= 2.0 (DataFrame.mean(axis=None) reduces both axes).
cols = ['BLEU-1', 'BLEU-2']
bleu_score = df_bleu.groupby(cols2)[cols].mean()

# ROUGE / BERTScore are pairwise (gt_item x rec_item): keep the best-matching
# ground-truth item for each recommendation, then average over the list.
# `.max()` replaces `.apply(max)`, whose builtin-to-pandas aliasing is deprecated.
cols = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L']
rouge_score = df_rouge.groupby(cols1)[cols].max()
rouge_score2 = rouge_score.reset_index().groupby(cols2)[cols].mean()

cols = ['P_BERT', 'R_BERT', 'F1_BERT']
bert_score = df_bertscores.groupby(cols1)[cols].max()
bert_score2 = bert_score.reset_index().groupby(cols2)[cols].mean()

# Column subsets must be selected with a list; the tuple form
# (`gb['a', 'b']`) raises on pandas >= 2.0.
hP = df_hp.groupby(cols2)[['hP-1', 'hP-2', 'hP-freq']].mean()
hR = df_hr.groupby(cols2)[['hR-1', 'hR-2', 'hR-freq']].mean()
hPSim = df_hpsim.groupby(cols2)[['hP-Sim-1', 'hP-Sim-2', 'hP-Sim-freq']].mean()
hRSim = df_hrsim.groupby(cols2)[['hR-Sim-1', 'hR-Sim-2', 'hR-Sim-freq']].mean()

# normalize ratings: z-score each rating within its own user, without mutating
# df_pred or writing into group slices. A user whose ratings are all identical
# has zero variance, so zscore yields NaN for that user (same as before).
df_z = df_pred.assign(rating_z=df_pred.groupby('UID')['rating'].transform(zscore))
dr = df_z.groupby(cols2)[['rating_z', 'rating']].mean()

# Align everything on the (UID, model) index, tag metric columns with '@10',
# and turn the index back into regular columns.
h = pd.concat([dr, bleu_score, rouge_score2, bert_score2, hP, hR, hPSim, hRSim], axis=1)
h = h.rename(columns={c: c + '@10' for c in all_metrics})
h = h.reset_index()
end = time.time()
print(end - start)

1.8012669086456299


# With standard metrics

In [66]:
import time
# Wall-clock start for timing this cell; the elapsed time is printed at the end of the cell.
start = time.time()

def basket_ndcg(user, qn):
    """NDCG of one model's recommendation list for one user, using exact item matches.

    Relevance is binary: 1 when a recommended item is among the user's
    ground-truth items, else 0. Predicted scores simply decrease with row
    position, so the list is ranked in the order its rows appear in
    ``df_pred`` (assumes rows are stored in rank order -- TODO confirm).

    Parameters
    ----------
    user : value matched against the ``UID`` column of ``df_pred`` / ``df_gt``.
    qn : value matched against the ``model`` column of ``df_pred``.

    Returns
    -------
    float
        NDCG over the full list. Lists of any length are supported (the
        position scores are no longer hard-coded to 10 entries). An empty list
        scores 0.0; a single-item list scores 1.0 if that item is relevant and
        0.0 otherwise, without calling ``ndcg_score``, which rejects
        single-document input in recent scikit-learn releases.
    """
    recs = df_pred.loc[(df_pred['UID'] == user) & (df_pred['model'] == qn), 'rec_item'].to_numpy()
    truth = set(df_gt.loc[df_gt['UID'] == user, 'gt_item'].to_numpy())

    relevance = np.array([item in truth for item in recs], dtype=float)

    if len(recs) < 2:
        # Degenerate list: 0 items -> 0.0, 1 item -> its own relevance.
        return float(relevance.sum())

    # Strictly decreasing scores n, n-1, ..., 1 give the same ranking as the
    # original 1.0, 0.9, ..., 0.1 but work for any list length.
    position_scores = np.arange(len(recs), 0, -1, dtype=float)
    return ndcg_score(relevance[np.newaxis, :], position_scores[np.newaxis, :])

def basket_precision(user, qn):
    """Exact-match precision of one model's recommendation list for one user.

    Parameters
    ----------
    user : value matched against the ``UID`` column of ``df_pred`` / ``df_gt``.
    qn : value matched against the ``model`` column of ``df_pred``.

    Returns
    -------
    float
        Number of distinct recommended items that are also ground-truth items,
        divided by the number of recommendation rows. Returns 0.0 when the
        (user, model) pair has no recommendations, instead of raising
        ZeroDivisionError.
    """
    recs = df_pred.loc[(df_pred['UID'] == user) & (df_pred['model'] == qn), 'rec_item'].to_numpy()
    truth = df_gt.loc[df_gt['UID'] == user, 'gt_item'].to_numpy()

    if len(recs) == 0:
        return 0.0

    hits = set(recs) & set(truth)
    return len(hits) / len(recs)

def basket_recall(user, qn):
    """Exact-match recall of one model's recommendation list for one user.

    Parameters
    ----------
    user : value matched against the ``UID`` column of ``df_pred`` / ``df_gt``.
    qn : value matched against the ``model`` column of ``df_pred``.

    Returns
    -------
    float
        Number of distinct ground-truth items that were recommended, divided by
        the number of ground-truth rows for the user. Returns 0.0 when the user
        has no ground-truth items, instead of raising ZeroDivisionError.
    """
    recs = df_pred.loc[(df_pred['UID'] == user) & (df_pred['model'] == qn), 'rec_item'].to_numpy()
    truth = df_gt.loc[df_gt['UID'] == user, 'gt_item'].to_numpy()

    if len(truth) == 0:
        return 0.0

    hits = set(recs) & set(truth)
    return len(hits) / len(truth)

def find_basket_ndcg(line):
    """Row adapter: pass the row's 'UID' and 'model' values to basket_ndcg."""
    user, qn = line['UID'], line['model']
    return basket_ndcg(user, qn)
def find_basket_precision(line):
    """Row adapter: pass the row's 'UID' and 'model' values to basket_precision."""
    user, qn = line['UID'], line['model']
    return basket_precision(user, qn)
def find_basket_recall(line):
    """Row adapter: pass the row's 'UID' and 'model' values to basket_recall."""
    user, qn = line['UID'], line['model']
    return basket_recall(user, qn)

# Append the exact-match metrics to h, one row-wise pass per metric, in the
# same order as before (ndcg, precision, recall).
for col_name, row_fn in (
    ('ndcg@10', find_basket_ndcg),
    ('p@10', find_basket_precision),
    ('r@10', find_basket_recall),
):
    h[col_name] = h.apply(row_fn, axis=1)

# Report the wall-clock seconds spent since `start`.
end = time.time()
elapsed = end - start
print(elapsed)

0.6073765754699707


In [67]:
# Write the per-(UID, model) score table to preference_score_file; index=False omits the row index.
h.to_csv(preference_score_file, index=False)

In [68]:
# Preview the first two rows of the combined score table.
h.head(2)

Unnamed: 0,UID,model,rating_z,rating,BLEU-1@10,BLEU-2@10,ROUGE-1@10,ROUGE-2@10,ROUGE-L@10,P_BERT@10,...,hR-freq@10,hP-Sim-1@10,hP-Sim-2@10,hP-Sim-freq@10,hR-Sim-1@10,hR-Sim-2@10,hR-Sim-freq@10,ndcg@10,p@10,r@10
0,1,1,0.219071,3.2,0.0,0.0,0.0,0.0,0.0,0.203294,...,0.211301,0.523157,0.524652,0.506372,0.456033,0.457103,0.478065,0.0,0.0,0.0
1,1,2,-0.719805,2.0,0.0,0.0,0.0,0.0,0.0,0.115581,...,0.286506,0.491069,0.495886,0.433968,0.475617,0.478712,0.498861,0.0,0.0,0.0


In [69]:
# Correlation of every score column with the two rating columns.
# Select by label instead of position: `.iloc[2:, 2:4]` only yields this table
# while UID/model happen to be the first two columns and rating_z/rating the next two.
h.drop(columns=['UID', 'model']).corr()[['rating_z', 'rating']]

Unnamed: 0,rating_z,rating
rating_z,1.0,0.832509
rating,0.832509,1.0
BLEU-1@10,0.435927,0.456335
BLEU-2@10,0.447448,0.457125
ROUGE-1@10,0.428143,0.445913
ROUGE-2@10,0.445472,0.454499
ROUGE-L@10,0.431372,0.448129
P_BERT@10,0.5138,0.512898
R_BERT@10,0.469538,0.463246
F1_BERT@10,0.503527,0.501921
