# Loading data and necessary files

In [26]:
from bert_score import BERTScorer
import numpy as np
import os
import pandas as pd
from src.config import DATA_DIR, TEMP_DIR, FNAMES_BERT_SCORES_FILE
from src.io import load_pickle, save_pickle
import time

# Globals defined by this cell:
#   name_df - PID -> item name reference table
#   gt_df   - ground-truth rows with the item name attached as 'gt_item'
#   rec_df  - recommendation rows with the item name attached as 'rec_item'
#   models  - sorted list of the distinct values in rec_df['models']

name_df = pd.read_csv(os.path.join(TEMP_DIR, 'UserStudy1', 'full_name_ref.csv'))

# Ground truth: order by user/item, then attach the item name via a left join on PID
# (rows whose PID is absent from name_df keep a missing name).
gt_df = (
    pd.read_csv(os.path.join(TEMP_DIR, 'UserStudy1', 'user_ground_truth.csv'))
    .sort_values(by=['UID', 'PID'])
    .merge(name_df, how='left', on=['PID'])
    .rename(columns={"name": "gt_item"})
)

# Recommendations: 'item_idx' is renamed to 'PID' so it can be joined the same way.
rec_df = (
    pd.read_csv(os.path.join(TEMP_DIR, 'UserStudy1', 'user_recommendations.csv'))
    .rename(columns={"item_idx": "PID"})
    .sort_values(by=['UID', 'PID'])
    .merge(name_df, how='left', on=['PID'])
    .rename(columns={"name": "rec_item"})
)

models = sorted(rec_df['models'].unique())

In [2]:
# Global variables:
# sim_dict
# scorer

# Reuse previously computed pair scores when the cache file exists; otherwise start empty.
# NOTE(review): load_pickle presumably unpickles the file - only load cache files you trust.
if os.path.exists(FNAMES_BERT_SCORES_FILE):
    sim_dict = load_pickle(FNAMES_BERT_SCORES_FILE)
else:
    sim_dict = {}

# Size at load time; the save cell at the end of the notebook compares against it.
loaded_sim_dict_len = len(sim_dict)
print('Loaded pre-computed BERTScores for tags, %s total records' % loaded_sim_dict_len)

# Time the model construction. No device is forced here so the scorer can pick
# CUDA when it is available and fall back to CPU otherwise (the benchmarking
# notes in this notebook cover both GPU and non-GPU runs).
start = time.time()
scorer = BERTScorer(lang="en", rescale_with_baseline=True)
end = time.time()
print(end - start)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


19.566107749938965


In [3]:
# Show the distinct model names found in rec_df and how many pair scores are cached so far.
print(models)
print(len(sim_dict))

['adaloyal', 'bpr', 'fpmc', 'global', 'lda', 'mixture', 'mixture_decay', 'nmf', 'personal', 'sasrec', 'wrmf']
869540


In [4]:
# Column layout of the three working tables after the merges/renames above.
print(name_df.columns)
print(gt_df.columns)
print(rec_df.columns)

Index(['name', 'PID'], dtype='object')
Index(['UID', 'TS', 'PID', 'gt_item'], dtype='object')
Index(['UID', 'models', 'pred_rank', 'PID', 'is_positive', 'rec_item'], dtype='object')


In [5]:
# Number of distinct values per column of the item-name reference table.
name_df.nunique()

name    47789
PID     47789
dtype: int64

In [6]:
# Number of distinct values per column of the ground-truth table.
gt_df.nunique()

UID         6916
TS           143
PID        14532
gt_item    14532
dtype: int64

In [7]:
# Number of distinct values per column of the recommendations table.
rec_df.nunique()

UID             6916
models            11
pred_rank         10
PID            47789
is_positive        2
rec_item       47789
dtype: int64

# Data Cleaning
Some models (adaloyal, fpmc, and personal) predicted more than one item for certain ranks for certain users (N=19526). In those cases, we randomly select one item to represent each nth-rank item.

In [8]:
print(rec_df.shape)

# Count rows per (user, model, rank); any group with more than one row means the
# model produced several items for the same rank.
count_df = (
    rec_df.groupby(['UID', 'models', 'pred_rank'])
    .count()
    .sort_values(by=['rec_item'], ascending=False)
    .reset_index()
)
has_multiple_items = count_df['PID'] > 1
count_df[has_multiple_items]

(838841, 6)


Unnamed: 0,UID,models,pred_rank,PID,is_positive,rec_item
0,230,fpmc,1,47789,47789,47789
1,4945,personal,4,94,94,94
2,4707,personal,4,74,74,74
3,4105,personal,3,71,71,71
4,4166,personal,7,63,63,63
...,...,...,...,...,...,...
19521,5882,adaloyal,9,2,2,2
19522,50,personal,5,2,2,2
19523,3405,personal,9,2,2,2
19524,104,personal,5,2,2,2


In [9]:
start = time.time()
# Keep exactly one randomly chosen row per (UID, models, pred_rank) group.
# A fixed seed makes the selection - and every score derived from it -
# reproducible across kernel restarts; without it each run keeps different items.
rec_df = rec_df.groupby(['UID', 'models', 'pred_rank']).sample(n=1, random_state=100)
end = time.time()
print(rec_df.shape)
print('%.2f sec' % (end-start))

(730409, 6)
29.80 sec


# Scoring functions

In [10]:
def get_scores(single_cands, multi_refs):
    """Score one candidate against one reference with the global ``scorer``.

    Both arguments are wrapped in one-element lists before being handed to
    ``scorer.score``. Each of the three returned values is converted with
    ``.item()`` and the results are returned as a ``(P, R, F)`` tuple.
    """
    precision, recall, f_score = scorer.score([single_cands], [multi_refs])
    return tuple(value.item() for value in (precision, recall, f_score))

def scores(line, dict_lookup=True):
    """Return ``(P, R, F)`` for one ground-truth / recommendation row.

    Parameters
    ----------
    line : mapping (e.g. a DataFrame row)
        Must provide ``'gt_item'`` and ``'rec_item'``.
    dict_lookup : bool, default True
        When True, reuse a score already stored in the global ``sim_dict``
        instead of calling ``get_scores``.

    Returns
    -------
    tuple
        * ``(nan, nan, nan)`` when either item name is missing / not a string
          (e.g. NaN left behind by a left or outer merge);
        * ``(1, 1, 1)`` when the two names are equal ignoring case;
        * otherwise the cached or freshly computed scores. Fresh scores are
          always written to ``sim_dict`` under the sorted, lower-cased name pair.
    """
    gt_item, rec_item = line['gt_item'], line['rec_item']

    # Unmatched rows carry NaN instead of a name; calling .lower() on them would
    # raise, so report "no score" rather than aborting a whole batch.
    if not (isinstance(gt_item, str) and isinstance(rec_item, str)):
        nan = float('nan')
        return nan, nan, nan

    gt_name, rec_name = gt_item.lower(), rec_item.lower()
    if gt_name == rec_name:
        return 1, 1, 1

    # The cache key is order-independent: (a, b) and (b, a) share one entry.
    key = tuple(sorted([gt_name, rec_name]))

    if dict_lookup and key in sim_dict:
        cached = sim_dict[key]
        return cached['P'], cached['R'], cached['F']

    # NOTE(review): candidate/reference roles follow alphabetical order here, not
    # which name is the recommendation - confirm that P and R are only ever
    # consumed in a way where this does not matter.
    P, R, F = get_scores(*key)
    sim_dict[key] = {'P': P, 'R': R, 'F': F}
    return P, R, F

# Benchmarking

Computing BERTScores from scratch without a GPU takes about:
* 1 min for 580 pairs 
* 1 sec for 10 pairs
* 8 hours for 304,335 pairs (adaloyal)

With GPU:
* 2.5 min for about 9,000 pairs (200 users) or 1 hour for the whole data for one model (200K - 300K pairs)

# Main

Compute BERTScores for each model. Data will be split into batches so that results can be saved to CSV periodically.

In [201]:
start = time.time()
# 'adaloyal', 'bpr', 'fpmc', 'global', 'lda', 'mixture', 'mixture_decay', 'nmf', 'personal', 'sasrec', 'wrmf', 'random'
m = 'bpr'
start_uid = 0
batch_size = 200
max_uid = 7000

# Create the output directory up front so a missing folder cannot make to_csv
# fail after a batch has already been scored.
os.makedirs('data/temp/BERTScore', exist_ok=True)

# We select each batch based on UID and models. batch_size=200 normally results in ~9000 rows of data for computing per batch.
while start_uid < max_uid:
    istart = time.time()
    # Selecting subset of data: users with start_uid <= UID < end_uid
    end_uid = start_uid + batch_size
    left_df = gt_df[(gt_df['UID']>=start_uid) & (gt_df['UID']<end_uid)]
    rec_in_batch = (rec_df['UID']>=start_uid) & (rec_df['UID']<end_uid)

    if m == 'random':
        rand_state = 100 # random state. 100 in the main experiment.
        # Use the 'global' rows as a template (any model would do) and replace the
        # recommended item by a name sampled from all items stored in name_df.
        # assign() builds a new frame, so no chained-assignment warning has to be silenced.
        # NOTE(review): the same seed is reused for every batch, so the draws of
        # different batches are not independent of each other - confirm this is intended.
        right_df = rec_df[rec_in_batch & (rec_df['models']=='global')]
        right_df = right_df.assign(
            rec_item=name_df['name'].sample(n = right_df.shape[0], random_state = rand_state).tolist()
        )
    else:
        right_df = rec_df[rec_in_batch & (rec_df['models']==m)]

    if len(left_df) == 0 and len(right_df) == 0:
        # An empty UID window is only a gap, not the end of the data: skip it and
        # keep going until max_uid instead of silently dropping all later users.
        start_uid += batch_size
        continue

    # Pair every ground-truth item of a user with every recommended item of that user.
    # The outer join keeps users present on only one side; their other item is NaN.
    eval_df = pd.merge(left_df[['UID', 'gt_item']], right_df[['UID', 'rec_item',]], how='outer', on=['UID'])

    # Calculating BERTScores, will take quite some time if on a non-CUDA machine
    eval_df['P'], eval_df['R'], eval_df['F1'] = zip(*eval_df.apply(scores, axis=1))

    iend = time.time()
    # Persist each batch separately so partial progress survives an interruption.
    eval_df.to_csv('data/temp/BERTScore/%s_uid_%s.csv' % (m, start_uid), index=False)
    print('total rows=%s\tstart_uid=%s\tend_uid=%s\telapsed_time=%.2f min' % (len(eval_df), start_uid, end_uid, (iend - istart)/60))
    start_uid += batch_size
    time.sleep(1)

end = time.time()
print('total_time=%.2f min' % ((end - start)/60))

total rows=8400	start_uid=0	end_uid=200	elapsed_time=0.60 min
total rows=8920	start_uid=200	end_uid=400	elapsed_time=0.53 min
total rows=8920	start_uid=400	end_uid=600	elapsed_time=0.57 min
total rows=9180	start_uid=600	end_uid=800	elapsed_time=0.63 min
total rows=8240	start_uid=800	end_uid=1000	elapsed_time=0.53 min
total rows=8950	start_uid=1000	end_uid=1200	elapsed_time=0.61 min
total rows=8660	start_uid=1200	end_uid=1400	elapsed_time=0.59 min
total rows=8680	start_uid=1400	end_uid=1600	elapsed_time=0.53 min
total rows=8710	start_uid=1600	end_uid=1800	elapsed_time=0.62 min
total rows=9090	start_uid=1800	end_uid=2000	elapsed_time=0.63 min
total rows=9510	start_uid=2000	end_uid=2200	elapsed_time=0.63 min
total rows=8000	start_uid=2200	end_uid=2400	elapsed_time=0.00 min
total rows=8520	start_uid=2400	end_uid=2600	elapsed_time=0.00 min
total rows=7890	start_uid=2600	end_uid=2800	elapsed_time=0.00 min
total rows=8260	start_uid=2800	end_uid=3000	elapsed_time=0.00 min
total rows=7860	start

Join all results into one file

In [202]:
# Concatenate the per-batch result files of one model into a single CSV.
m = 'bpr'
start_uid = 0
batch_size = 200
max_uid = 7000

# Collect the batch frames first and concatenate once at the end; concatenating
# inside the loop re-copies everything accumulated so far on every iteration.
batch_frames = []

while start_uid < max_uid:
    try:
        batch_frames.append(pd.read_csv('data/temp/BERTScore/%s_uid_%s.csv' % (m, start_uid)))
    except FileNotFoundError:
        # A missing batch is reported and skipped. Nothing is appended for it, so
        # the previous batch can never be added a second time.
        print('data/temp/BERTScore/%s_uid_%s.csv does not exist.' % (m, start_uid))
    start_uid += batch_size

if batch_frames:
    big_df = pd.concat(batch_frames, ignore_index=True)
    big_df.to_csv('data/temp/BERTScore/%s.csv' % (m), index=False)
else:
    # No batch file was found at all: keep big_df defined (empty) but do not
    # write a misleading empty result file.
    big_df = pd.DataFrame()
    print('No batch files found for %s; nothing written.' % (m))

Computing aggregated top-k scores for each user and final scores for the model

In [203]:
start = time.time()
# Best score of each recommended item over all ground-truth items of the same user.
# The built-in groupby reduction is used instead of .apply(max): passing the Python
# builtin relies on pandas substituting it behind the scenes (deprecated) and runs
# a Python-level call per group.
rec_max_df = big_df.groupby(['UID', 'rec_item'])[['P', 'R', 'F1']].max()
# Average those per-recommendation maxima within each user.
user_mean_df = rec_max_df.reset_index().groupby(['UID'])[['P', 'R', 'F1']].mean()
end = time.time()
print('%.2f sec' % (end-start))
print(m)
# Final model-level scores: mean of the per-user averages.
print(user_mean_df.mean())

17.61 sec
bpr
P     0.243793
R     0.246838
F1    0.231384
dtype: float64


Persisting computed scores to a pickle file

In [204]:
# Number of pair scores now held in the in-memory cache (loaded plus newly computed).
print(len(sim_dict))
# Optional dump of every cached pair with its P/R/F values (left disabled):
# for k, v in sim_dict.items():
#     print('%s\t%s\t%.2f\t%.2f\t%.2f' % (k[0], k[1], v['P'], v['R'], v['F']))

1479308


In [205]:
# Write the cache back unless it is smaller than what was loaded at the start
# (presumably a guard against overwriting the file with a truncated dictionary).
if not len(sim_dict) < loaded_sim_dict_len:
    save_pickle(FNAMES_BERT_SCORES_FILE, sim_dict)
    print('Saved pre-computed scores, %s total records' % len(sim_dict))