The code in this notebook is based off the two notebooks: 
* 4-7-RQ4-BERTScores
* 4-4-UserStudy2_preference_hierarchical_sim

# Loading data and necessary files

In [1]:
from bert_score import BERTScorer
from collections import defaultdict
import itertools
import numpy as np
import os
import pandas as pd
import re
from src.config import DATA_DIR, TEMP_DIR
from src.config import label_file, label_file_full, weighing_scheme
from src.config import BERT_F1_dict_file
from src.io import load_pickle, save_pickle
import time

# Global variables:
# name_df, gt_df, rec_df
# models

name_df = pd.read_csv(os.path.join(TEMP_DIR, 'UserStudy1', 'full_name_ref.csv'))
gt_df = pd.read_csv(os.path.join(TEMP_DIR, 'UserStudy1', 'user_ground_truth.csv')).drop_duplicates().reset_index(drop=True)
rec_df = pd.read_csv(os.path.join(TEMP_DIR, 'UserStudy1', 'user_recommendations.csv'))
rec_df = rec_df.rename(columns={"item_idx": "PID"})

gt_df = gt_df.sort_values(by=['UID', 'PID'])
rec_df = rec_df.sort_values(by=['UID', 'PID'])

gt_df = pd.merge(gt_df, name_df, how='left', on=['PID'])
gt_df = gt_df.rename(columns={"name": "gt_item"})
rec_df = pd.merge(rec_df, name_df, how='left', on=['PID'])
rec_df = rec_df.rename(columns={"name": "rec_item"})

models = sorted(set(rec_df['models'].values))

In [2]:
# Global variables:
# sim_dict
# scorer

if os.path.exists(BERT_F1_dict_file):
    sim_dict = load_pickle(BERT_F1_dict_file)
else:
    sim_dict = {}

loaded_sim_dict_len = len(sim_dict)
print('Loaded pre-computed BERTScores for tags, %s total records' % loaded_sim_dict_len)

import time
start = time.time()
scorer = BERTScorer(lang="en", rescale_with_baseline=True, device="cuda")
end = time.time()
print(end - start)

Loaded pre-computed BERTScores for tags, 227864 total records


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


17.33212685585022


In [3]:
print(models)

['adaloyal', 'bpr', 'fpmc', 'global', 'lda', 'mixture', 'mixture_decay', 'nmf', 'personal', 'sasrec', 'wrmf']


In [4]:
print(name_df.columns)
print(gt_df.columns)
print(rec_df.columns)

Index(['name', 'PID'], dtype='object')
Index(['UID', 'TS', 'PID', 'gt_item'], dtype='object')
Index(['UID', 'models', 'pred_rank', 'PID', 'is_positive', 'rec_item'], dtype='object')


In [5]:
name_df.nunique()

name    47789
PID     47789
dtype: int64

In [6]:
gt_df.nunique()

UID         6916
TS           143
PID        14532
gt_item    14532
dtype: int64

In [7]:
rec_df.nunique()

UID             6916
models            11
pred_rank         10
PID            47789
is_positive        2
rec_item       47789
dtype: int64

# Data Cleaning
Some models, i.e., adaloyal, fpmc, and personal, predicted more than 1 item for certain ranks for certain users (N=19526). In that case, we will randomly select one item to represet each nth-rank item.

In [8]:
print(rec_df.shape)
count_df = rec_df.groupby(['UID','models','pred_rank']).count().sort_values(by=['rec_item'], ascending=False)
count_df = count_df.reset_index()
count_df[count_df['PID']>1]

(838841, 6)


Unnamed: 0,UID,models,pred_rank,PID,is_positive,rec_item
0,230,fpmc,1,47789,47789,47789
1,4945,personal,4,94,94,94
2,4707,personal,4,74,74,74
3,4105,personal,3,71,71,71
4,4166,personal,7,63,63,63
...,...,...,...,...,...,...
19521,5882,adaloyal,9,2,2,2
19522,50,personal,5,2,2,2
19523,3405,personal,9,2,2,2
19524,104,personal,5,2,2,2


In [9]:
start = time.time()
rec_df = rec_df.groupby(['UID', 'models', 'pred_rank']).sample()
end = time.time()
print(rec_df.shape)
print('%.2f sec' % (end-start))

(730409, 6)
28.62 sec


In [10]:
rand_state = 100 # random state. 100 in the main experiment.
rand_df = rec_df[rec_df['models']=='global'] # use global as a template. But any model will do.
rand_df['models'] = 'random'
rand_df['rand_item'] = list(name_df['name'].sample(n = rand_df.shape[0], replace=True, random_state = rand_state)) # sample from all possible food items stored in name_df
rand_df = rand_df.drop('rec_item', axis=1)
rand_df = rand_df.rename(columns={'rand_item': 'rec_item'})
rec_df = pd.concat([rec_df, rand_df], ignore_index = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rand_df['models'] = 'random'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rand_df['rand_item'] = list(name_df['name'].sample(n = rand_df.shape[0], replace=True, random_state = rand_state)) # sample from all possible food items stored in name_df


In [11]:
def process_food_name(s1):
    # separators: ", " + any of (integer, decimal & fraction) +" "
    exp = r", \d+\.\d+ |, \d+\,\d+ |, \d+ |, \d+\/\d+ "
    # remove content in parenthesis for finding the separator
    if s1.count('(') == s1.count(')'):
        s2 = re.sub(r'[(].*?[\)]', ' ', s1)
    else:
        s2 = s1
    try:
        split_by = re.findall(exp, s2)[0]
        return clean_name(s1.split(split_by)[0])
    except:
        return clean_name(s2)

def clean_name(name):
    name = name.replace("\t", " ").replace("\n", " ").replace("w/o", " no ").replace("w/", " ")
    return re.sub(' +', ' ', name.strip()).lower()

Process all food names by removing puctuations and numerals

Mapping between variable names (this notebook's, the reference notebook's):

For (gt_df, df2):
* (UID, user)
* (gt_item, item_1)
* (gt_item_cleaned, item_gt)

For (rec_df, df):
* (UID, user)
* (rec_item, choice)
* (rec_item_cleaned, item)
* (models, qn)

In [12]:
rec_df['rec_item_cleaned'] = rec_df['rec_item'].apply(process_food_name)
rec_df = rec_df.rename(columns={'UID': 'user', 'rec_item': 'choice', 'rec_item_cleaned': 'item', 'models': 'qn'})
gt_df['gt_item_cleaned'] = gt_df['gt_item'].apply(process_food_name)
gt_df = gt_df.rename(columns={'UID': 'user', 'gt_item': 'item_1', 'gt_item_cleaned': 'item_gt'})

In [13]:
labels = load_pickle(label_file)
concat_list = list(zip(labels['cat_info'], labels['label_summary']))
label_index = dict()
for i,j in concat_list:
    label_index.update(zip(i,j))  

index_label = {k:v for v,k in label_index.items()}

# '_' connected tokens for l0 tags
label_index_l0 = {i.split('__')[-1]:v for i,v in label_index.items() if len(i.split('__'))==3}
l0_tags = sorted(label_index_l0.keys())

label_with_food = defaultdict(list)
def group_food_name(line):
    s = line['food_name']
    for i in line['label_summary']:
        label_with_food[i].append(s)
labels.apply(group_food_name, axis=1) 

label_name_l0 = {}
for l,i in label_index_l0.items():
    label_name_l0[l] = label_with_food[i]

name_labels = labels.set_index('food_name').to_dict()['cat_info']
name_label_0 = {k:[s.split('__')[-1] for s in v if len(s.split('__'))==3] for k,v in name_labels.items()}

matched_label_l0 = []
unmatched_label_l0 = []

def token_transform(t):
    tokens = [t, t+'s', t+'es']
    if t[-1] == 'y':
        tokens.append(t[:-1]+'ies')
    return tokens  

for l, s_lst in label_name_l0.items():    
    l_primes = [token_transform(t) for t in l.split('_')]
    l2 = list(itertools.product(*l_primes))
    l_primes = [' '.join(i) for i in l2]  
 
    matched = False
    for s in s_lst:
        s = ' '.join(s.replace("'", '').replace('&', ' ').split())
        for l_prime in l_primes:
            if l_prime in s:            
                matched = True
            
    if matched:
        matched_label_l0.append(l)
    else:
        unmatched_label_l0.append(l)  

perc = len(unmatched_label_l0)/(len(matched_label_l0 )+ len(unmatched_label_l0)) 
if perc < 0.05:
    print('Less than 5% labels are not matched: {:.2%}'.format(perc))

# get all labels associated with item
def match_labels(s):
    if s in name_label_0.keys():
        found = name_label_0[s] 
    else:
        found = []
        s = ' '.join(s.replace("'", '').replace('&', ' ').split())
        for l in matched_label_l0:
            matched = False
            l_primes = [token_transform(t) for t in l.split('_')]
            for l_prime in [' '.join(i) for i in list(itertools.product(*l_primes))]:
                if l_prime in s:  
                    matched = True
            if matched:
                found.append(l) 
                        
    # full label names
    all_labels = []
    if len(found) > 0:
        full_label_l0 = [index_label[label_index_l0[l0_label]] for l0_label in found]
        for l0 in full_label_l0:
            l2 = l0.split('__')[0]
            l1 = '__'.join(l0.split('__')[:2])
            all_labels.extend([l2, l1, l0])
    return sorted(set(all_labels))

# Two variants: 1) using all tags along the branch; 2) using only the last tag
inv_label_index = {v: " ".join(k.replace("__", " ").split("_")) for k, v in label_index.items()} # use all tags along the branch
# inv_label_index = {v: " ".join(k.split("__")[-1].split("_")) for k, v in label_index.items()} # use only the last tag

Less than 5% labels are not matched: 1.33%


In [14]:
rec_df.loc[:, 'label']  = rec_df['item'].apply(match_labels)
rec_df['label_summary'] = rec_df['label'].apply(lambda s: [label_index[i] for i in s] if len(s)>0 else [])

gt_df.loc[:, 'label']  = gt_df['item_gt'].apply(match_labels)
gt_df['label_summary'] = gt_df['label'].apply(lambda s: [label_index[i] for i in s] if len(s)>0 else [])
gt_df = gt_df[['user', 'item_1', 'label', 'label_summary']]

gt_lst = gt_df.groupby('user')['label_summary'].apply(list).to_dict()
rec_df['l2'] = rec_df['user'].map(gt_lst)

df1 = rec_df.groupby(['user', 'qn'])['label_summary'].apply(list).reset_index().rename(columns={'label_summary': 'l2'})
df1 = gt_df.merge(df1, on='user', how='outer')

# Scoring functions

In [15]:
def get_scores(single_cands, multi_refs):
    P_mul, R_mul, F_mul = scorer.score([single_cands], [multi_refs])
    return F_mul[0]

def sim_score(l0, l1, sim_dict=sim_dict):
    l0, l1 = tuple(sorted([l0, l1]))
    
    if (l0, l1) in sim_dict.keys():
        return sim_dict[(l0, l1)]
    else:
        if l0 == l1: 
            val = 1
        else:
            val = get_scores(l0.lower(), l1.lower())
        sim_dict[(l0, l1)] = val
        return float(val)
    
def label_score(c_cap_j, c_cap_j_prime, weight, sim_dict=sim_dict):
    if len(c_cap_j)==0 or len(c_cap_j_prime)==0:
        return 0
        #return np.nan
    
    numerator = 0.0
    denominator = 0.0

    for c_t in c_cap_j:
        lambda_t = weight[c_t]
        denominator += lambda_t
        max_val = -1
       
        for c_s in c_cap_j_prime:
            val = sim_score(inv_label_index[c_t], inv_label_index[c_s], sim_dict)
            if val > max_val:
                max_val = val
        
        numerator += lambda_t * max_val
    
    return numerator / denominator

def pair_sim_scores(j, j_prime, weight, sim_dict):
    if(len(j)==0) or (len(j_prime)==0):
        return 0
    
    if j_prime == j:
        return 1
    else:
        curr_score = label_score(j, j_prime, weight, sim_dict)
    return float(curr_score)

In [61]:
x = 'Egg Salad Sandwich'
y = 'dreyers grand ice cream - chocolate ice cream'
x = 'Home Made - Garden Salad'
y = 'chips - chips'
y = 'coke - coke (330ml)'
c_cap_j = match_labels(x)
c_cap_j_prime = match_labels(y)
# print(c_cap_j)
# print(c_cap_j_prime)
score = None

if len(c_cap_j)==0 or len(c_cap_j_prime)==0:
    score = 0

# print(score)
# print(labels[labels['food_name']==x.lower()]['label_summary'])
# print(labels[labels['food_name']==y.lower()]['label_summary'])
print(name_label_0[y.lower()])

KeyError: 'coke - coke (330ml)'

# Benchmarking

To compute hMatch_sim from scratch, it takes about (without GPU):
* To be completed

With GPU:
* 1 min for about 32 rows (10 users)
* 2 min for 800 rows
* 30 min for 28,909 rows or one model (6915 users)

# Main

Compute hMatch_sim for each model. Data will be split into batches so that results can be saved to CSV periodically.

In [353]:
start = time.time()
# 'adaloyal', 'bpr', 'fpmc', 'global', 'lda', 'mixture', 'mixture_decay', 'nmf', 'personal', 'sasrec', 'wrmf', 'random'
m = 'random'
start_uid = 0
batch_size = 200
max_uid = 7000
weighing='freq'
col = 'hSim-freq'

def lst_sim_scores(line, weight=weighing_scheme[weighing], sim_dict=sim_dict):
    j = line['label_summary']
    j_primes = line['l2']
    pair_sim_score = 0
    for j_prime in j_primes:
        s = pair_sim_scores(j, j_prime, weight, sim_dict)
        if s > pair_sim_score:
            pair_sim_score = s
    return pair_sim_score

# We select each batch based on UID and models. batch_size=200 normally results in ~9000 rows of data for computing per batch.
while start_uid < max_uid:
    istart = time.time()
    end_uid = start_uid + batch_size  
    
    eval_df = df1[(df1['user']>=start_uid) & (df1['user']<end_uid) & (df1['qn']==m)] # Selecting subset of data
    # Calculating BERTScores, will take quite some time if on a non-CUDA machine
    eval_df[col] = eval_df.apply(lst_sim_scores, axis=1)
    
    iend = time.time()
    eval_df.to_csv('data/temp/hmatch_sim/%s_uid_%s.csv' % (m, start_uid), index=False)
    print('total rows=%s\tstart_uid=%s\tend_uid=%s\telapsed_time=%.2f min' % (len(eval_df), start_uid, end_uid, (iend - istart)/60))
    start_uid += batch_size
    time.sleep(1)

end = time.time()
print('total_time=%.2f min' % ((end - start)/60))

total rows=818	start_uid=0	end_uid=200	elapsed_time=0.56 min
total rows=858	start_uid=200	end_uid=400	elapsed_time=0.58 min
total rows=867	start_uid=400	end_uid=600	elapsed_time=0.60 min
total rows=883	start_uid=600	end_uid=800	elapsed_time=0.45 min
total rows=791	start_uid=800	end_uid=1000	elapsed_time=0.45 min
total rows=867	start_uid=1000	end_uid=1200	elapsed_time=0.51 min
total rows=828	start_uid=1200	end_uid=1400	elapsed_time=0.53 min
total rows=839	start_uid=1400	end_uid=1600	elapsed_time=0.48 min
total rows=833	start_uid=1600	end_uid=1800	elapsed_time=0.47 min
total rows=874	start_uid=1800	end_uid=2000	elapsed_time=0.55 min
total rows=916	start_uid=2000	end_uid=2200	elapsed_time=0.50 min
total rows=773	start_uid=2200	end_uid=2400	elapsed_time=0.42 min
total rows=831	start_uid=2400	end_uid=2600	elapsed_time=0.52 min
total rows=776	start_uid=2600	end_uid=2800	elapsed_time=0.46 min
total rows=801	start_uid=2800	end_uid=3000	elapsed_time=0.44 min
total rows=763	start_uid=3000	end_ui

Join all results into one file

In [354]:
# concatenating multiple results into one
m = 'random'
start_uid = 0
batch_size = 200
max_uid = 7000

big_df = pd.DataFrame()

while start_uid < max_uid:
    try:
        df = pd.read_csv('data/temp/hmatch_sim/%s_uid_%s.csv' % (m, start_uid))
    except FileNotFoundError:
        print('data/temp/hmatch_sim/%s_uid_%s.csv does not exist.' % (m, start_uid))
    
    big_df = pd.concat([big_df, df], ignore_index=True)
    start_uid += batch_size

big_df.to_csv('data/temp/hmatch_sim/%s.csv' % (m), index=False)

Computing aggregated top-k scores for each user and final scores for the model

In [355]:
start = time.time()
user_mean_df = big_df.reset_index().groupby(['user'])[['hSim-freq']].mean()
end = time.time()
print('%.2f sec' % (end-start))
print(m)
print(user_mean_df.mean())

0.01 sec
random
hSim-freq    0.668037
dtype: float64


Persisting computed scores to a pickle file

In [364]:
print(len(sim_dict))
# for k, v in sim_dict.items():
#     print('%s\t%s\t%.2f\t%.2f\t%.2f' % (k[0], k[1], v['P'], v['R'], v['F']))

227864


In [365]:
if len(sim_dict)>=loaded_sim_dict_len:
    save_pickle(BERT_F1_dict_file, sim_dict)
    print('Saved pre-computed scores, %s total records' % len(sim_dict))

Saved pre-computed scores, 227864 total records
