In [1]:
import re
import itertools
import pandas as pd
import numpy as np

from copy import deepcopy
from collections import defaultdict
from sklearn.metrics import recall_score, precision_score 

from src.config import substitution_rating_file, preference_rating_file, preference_rating_scores
from src.config import label_file, weighing_scheme, lst_ref
from src.io import load_pickle

import warnings
warnings.filterwarnings('ignore')

In [2]:
def process_food_name(s1):
    """Cut the quantity suffix off a raw food entry and normalise the name.

    The food name is taken to end at the first ", <number> " separator, where
    <number> is an integer, a decimal ("1.5" or "1,5") or a fraction ("1/2").
    The separator is searched in a copy of the text with parenthesised content
    blanked out (only when the parentheses are balanced), so numbers inside
    parentheses do not count as separators.

    Parameters
    ----------
    s1 : str
        Raw food description.

    Returns
    -------
    str
        The cleaned name (see ``clean_name``). When a separator is found the
        name is cut from the original text, so parenthesised content before the
        separator is kept; when none is found the parenthesis-free copy is
        cleaned and returned.
    """
    # separators: ", " + any of (integer, decimal & fraction) + " "
    exp = r", \d+\.\d+ |, \d+\,\d+ |, \d+ |, \d+\/\d+ "
    # remove content in parentheses for finding the separator
    if s1.count('(') == s1.count(')'):
        s2 = re.sub(r'[(].*?[\)]', ' ', s1)
    else:
        s2 = s1
    # Explicit "no separator" branch instead of a bare ``except`` around
    # ``findall(...)[0]``: unrelated errors are no longer silently swallowed.
    separator = re.search(exp, s2)
    if separator is None:
        return clean_name(s2)
    return clean_name(s1.split(separator.group(0))[0])

def clean_name(name):
    """Normalise a food name.

    Tabs and newlines become spaces, "w/o" becomes " no ", "w/" is dropped,
    surrounding whitespace is stripped, runs of spaces collapse to one and the
    result is lower-cased.
    """
    name = name.replace("\t", " ").replace("\n", " ").replace("w/o", " no ").replace("w/", " ")
    return re.sub(' +', ' ', name.strip()).lower()

In [3]:
# Ground truth: one row per distinct (user, item_1) pair of the substitution
# ratings, with the cleaned food name stored in 'item_gt'.
df2 = (
    pd.read_csv(substitution_rating_file)
    .loc[:, ['user', 'item_1']]
    .drop_duplicates()
    .reset_index(drop=True)
)
df2['item_gt'] = df2['item_1'].apply(process_food_name)

# Predictions: the cleaned 'choice' goes to 'item'; 'gt' holds the list of
# cleaned ground-truth names of the same user (NaN for users absent from df2).
df = pd.read_csv(preference_rating_file)
df['item'] = df['choice'].apply(process_food_name)
gt = df2.groupby('user')['item_gt'].apply(list).to_dict()
df['gt'] = df['user'].map(gt)

In [4]:
# Label table; it is used below through column access, .apply and .set_index,
# so presumably a DataFrame -- confirm against load_pickle / label_file.
labels = load_pickle(label_file)
concat_list = list(zip(labels['cat_info'], labels['label_summary']))

# label name -> label index, pairing each row's 'cat_info' entries with its
# 'label_summary' entries position by position
label_index = {}
for row_names, row_indices in concat_list:
    for label_name, label_idx in zip(row_names, row_indices):
        label_index[label_name] = label_idx

# inverse lookup: label index -> label name
index_label = {label_idx: label_name for label_name, label_idx in label_index.items()}

# '_' connected tokens for l0 tags: labels whose name has exactly three
# '__'-separated parts, keyed by the last part
label_index_l0 = {}
for label_name, label_idx in label_index.items():
    name_parts = label_name.split('__')
    if len(name_parts) == 3:
        label_index_l0[name_parts[-1]] = label_idx
l0_tags = sorted(label_index_l0)

# label index -> names of every food whose 'label_summary' contains that index
label_with_food = defaultdict(list)

def group_food_name(line):
    """Append ``line['food_name']`` to ``label_with_food`` under each of its label indices.

    ``line`` is one row of ``labels``; it must provide 'food_name' and an
    iterable 'label_summary'.  Works purely by side effect, returns None.
    """
    s = line['food_name']
    for i in line['label_summary']:
        label_with_food[i].append(s)

# Explicit loop instead of DataFrame.apply: the function is called only for its
# side effect, apply would build and discard a result, and pandas releases
# before 1.1 could invoke the function twice on the first row (duplicating its
# food name in the lists).
for _, _label_row in labels.iterrows():
    group_food_name(_label_row)

# l0 tag -> names of the foods registered under that tag's label index
label_name_l0 = {tag: label_with_food[tag_idx] for tag, tag_idx in label_index_l0.items()}

# food name -> its 'cat_info' entries, then reduced to the l0 tags only
# (last part of every entry that has exactly three '__'-separated parts)
name_labels = labels.set_index('food_name').to_dict()['cat_info']
name_label_0 = {}
for food, full_names in name_labels.items():
    split_names = (full_name.split('__') for full_name in full_names)
    name_label_0[food] = [parts[-1] for parts in split_names if len(parts) == 3]

# filled by the matching loop that follows
matched_label_l0, unmatched_label_l0 = [], []

def token_transform(t):
    """Return the spellings under which token ``t`` is searched in food names.

    Always ``[t, t + 's', t + 'es']``; a token ending in 'y' additionally gets
    the '-ies' form (``berry`` -> ``berries``).

    ``str.endswith`` is used instead of ``t[-1]`` so an empty token (e.g. from a
    tag with a leading or trailing underscore) no longer raises IndexError.
    """
    tokens = [t, t + 's', t + 'es']
    if t.endswith('y'):
        tokens.append(t[:-1] + 'ies')
    return tokens

# An l0 tag counts as "matched" when at least one spelling of it (every
# combination of the per-token variants, joined by spaces) occurs as a
# substring of at least one food name registered under that tag.
for tag, food_names in label_name_l0.items():
    spellings = [
        ' '.join(combo)
        for combo in itertools.product(*map(token_transform, tag.split('_')))
    ]
    # same normalisation as in match_labels: drop apostrophes, turn '&' into a
    # space, collapse whitespace
    cleaned_names = (
        ' '.join(food.replace("'", '').replace('&', ' ').split())
        for food in food_names
    )
    if any(spelling in food for food in cleaned_names for spelling in spellings):
        matched_label_l0.append(tag)
    else:
        unmatched_label_l0.append(tag)

# share of l0 tags that cannot be found in any of their own food names
perc = len(unmatched_label_l0) / (len(matched_label_l0) + len(unmatched_label_l0))
if perc < 0.05:
    print('Less than 5% labels are not matched: {:.2%}'.format(perc))

# get all labels associated with item
def match_labels(s):
    """Return the sorted, de-duplicated label names that apply to food name ``s``.

    If ``s`` is a key of ``name_label_0`` its l0 tags are taken from there.
    Otherwise ``s`` is normalised (apostrophes removed, '&' turned into a space,
    whitespace collapsed) and every tag of ``matched_label_l0`` is kept whose
    spelling variants (see ``token_transform``) occur in it.

    Each l0 tag found is expanded to its full '__'-joined name plus the two
    prefixes made of its first part and of its first two parts.  An empty list
    is returned when nothing matches.

    NOTE(review): matching is a plain substring test, so a spelling can also
    match inside a longer word -- confirm this is intended.
    """
    if s in name_label_0:
        found = name_label_0[s]
    else:
        cleaned = ' '.join(s.replace("'", '').replace('&', ' ').split())
        found = []
        for tag in matched_label_l0:
            spellings = itertools.product(*(token_transform(tok) for tok in tag.split('_')))
            if any(' '.join(words) in cleaned for words in spellings):
                found.append(tag)

    # full label names
    all_labels = set()
    for tag in found:
        full_name = index_label[label_index_l0[tag]]
        parts = full_name.split('__')
        all_labels.update((parts[0], '__'.join(parts[:2]), full_name))
    return sorted(all_labels)

# Predictions: label names for every cleaned item, then the same labels
# translated to their indices via label_index ([] when nothing matched).
df.loc[:, 'label']  = df['item'].apply(match_labels)
df['label_summary'] = df['label'].apply(lambda s: [label_index[i] for i in s] if len(s)>0 else [])

# Ground truth: same two steps on the cleaned ground-truth names.
df2.loc[:, 'label']  = df2['item_gt'].apply(match_labels)
df2['label_summary'] = df2['label'].apply(lambda s: [label_index[i] for i in s] if len(s)>0 else [])
# NOTE(review): this reselection drops 'item_gt', so the two lines above raise
# KeyError if this cell is re-run without first re-running the cell that builds df2.
df2 = df2[['user', 'item_1', 'label', 'label_summary']]

Less than 5% labels are not matched: 1.33%


In [5]:
def h_precision(gt_items, prediction):
    """Best unweighted precision of ``prediction`` against any ground-truth item.

    Parameters
    ----------
    gt_items : iterable of iterables
        Label collections, one per ground-truth item.
    prediction : iterable
        Labels of the predicted item.

    Returns
    -------
    float
        ``max(|prediction & gt_item| / |prediction|)`` over ``gt_items``, with
        duplicates ignored.  Returns 0.0 when ``prediction`` has no labels
        (previously a ZeroDivisionError) or when ``gt_items`` is empty
        (previously a ValueError); 0.0 is also what the sklearn-based
        ``h_precision_vectorize`` yields when nothing is predicted.
    """
    predicted = set(prediction)  # built once instead of twice per ground-truth item
    if not predicted:
        return 0.0
    return max(
        (len(predicted & set(gt_item)) / len(predicted) for gt_item in gt_items),
        default=0.0,
    )

def h_precision_vectorize(gt_items, prediction, weight):
    """Best weighted precision of ``prediction`` against any ground-truth item.

    Labels are encoded as 0/1 indicator vectors of length ``weight.shape[0]``
    (so the entries of ``prediction`` / ``gt_items`` must be valid positions in
    ``weight``), and ``precision_score`` is evaluated per ground-truth item with
    ``weight`` as ``sample_weight``.  The maximum over the items is returned.
    """
    n_labels = weight.shape[0]

    predicted = np.zeros(n_labels, dtype=int)
    predicted[prediction] = 1

    truth = np.zeros((len(gt_items), n_labels), dtype=int)
    for truth_row, gt_item in zip(truth, gt_items):
        # truth_row is a view, so this writes into ``truth``
        truth_row[gt_item] = 1

    return max(precision_score(truth_row, predicted, sample_weight=weight) for truth_row in truth)

def get_hp(qn, weight, df, df2):
    """Score every prediction of question ``qn`` with its hierarchical precision.

    Parameters
    ----------
    qn : question number, compared against ``df['qn']``.
    weight : a bool selects the unweighted ``h_precision``; anything else is
        passed on as the label weights of ``h_precision_vectorize``.
    df : predictions, needs the columns 'qn', 'user' and 'label_summary'.
    df2 : ground truth, needs the columns 'user' and 'label_summary'.

    Returns
    -------
    The rows of ``df`` for ``qn``, left-merged with the per-user list of
    ground-truth label lists ('gt_lst') and extended by a 'precision' column.
    """
    def h_precision_df(row, weight=weight):
        if not isinstance(weight, bool):
            return h_precision_vectorize(row['gt_lst'], row['label_summary'], weight)
        return h_precision(row['gt_lst'], row['label_summary'])

    # Get ground truth: one list of label lists per user
    gt_lst = (
        df2.groupby(['user'])['label_summary']
        .apply(list)
        .reset_index(name='gt_lst')
    )

    df1 = pd.merge(df[df['qn'] == qn], gt_lst, how='left', on=['user'])
    df1['precision'] = df1.apply(h_precision_df, axis=1)
    return df1

In [6]:
# Cache used by get_vector (bound there as a default argument):
# tuple(label indices) -> 0/1 indicator vector built for them.
item_vec_dic = {}

def h_recall_vectorize(gt_item, candidates, weight):
    """Best weighted recall of one ground-truth item over all candidates.

    ``gt_item`` holds the labels of the ground-truth item, ``candidates`` the
    label lists of the predicted items and ``weight`` the per-label weights.
    Each candidate is scored with ``_h_recall_vectorize`` and the maximum is
    returned.
    """
    return max(_h_recall_vectorize(gt_item, c_item, weight) for c_item in candidates)

def _h_recall_vectorize(gt_item, c_item, weight):
    """Weighted recall of candidate ``c_item`` with respect to ``gt_item``.

    Identical label lists short-circuit to 1.  Otherwise both lists are turned
    into indicator vectors (``get_vector``) and scored with
    ``recall_score(y_true, y_pred, sample_weight=weight)``, the ground truth
    being ``y_true``: the weighted share of ground-truth labels that the
    candidate also carries.
    """
    if c_item != gt_item:
        truth_vec = get_vector(gt_item)
        candidate_vec = get_vector(c_item)
        return recall_score(truth_vec, candidate_vec, sample_weight=weight)
    return 1

def get_vector(item, item_vec_dic=item_vec_dic, shape=len(lst_ref)):
    """Return the 0/1 indicator vector of the label indices in ``item``.

    Vectors are memoised in ``item_vec_dic`` under ``tuple(item)``; a cache hit
    returns the stored array itself, not a copy, so callers must not modify it.
    Both defaults are evaluated once, when the function is defined.

    NOTE(review): the cache key does not include ``shape``; a call with a
    different ``shape`` gets the vector cached for the first shape used.
    """
    key = tuple(item)
    if key not in item_vec_dic:
        vec = np.zeros(shape, dtype=int)
        vec[item] = 1
        item_vec_dic[key] = vec
    return item_vec_dic[key]

def h_recall(gt_item, candidates):
    """Best unweighted recall of one ground-truth item over all candidates.

    Parameters
    ----------
    gt_item : iterable
        Labels of the ground-truth item.
    candidates : iterable of iterables
        Label collections, one per predicted item.

    Returns
    -------
    float
        ``max(|candidate & gt_item| / |gt_item|)`` over ``candidates``, with
        duplicates ignored.  0.0 when ``candidates`` is empty (previously a
        ValueError).  When ``gt_item`` has no labels (previously a
        ZeroDivisionError) the result mirrors ``_h_recall_vectorize``: 1.0 if
        some candidate is empty as well (identical items), otherwise 0.0.
    """
    truth = set(gt_item)  # built once instead of twice per candidate
    if not truth:
        # Recall is undefined without ground-truth labels; only an equally
        # label-free candidate counts as a hit, as in the weighted variant.
        return max((0.0 if set(c_item) else 1.0 for c_item in candidates), default=0.0)
    return max(
        (len(truth & set(c_item)) / len(truth) for c_item in candidates),
        default=0.0,
    )

def get_hr(qn, weight, df=df, df2=df2):
    """Score every ground-truth item with its hierarchical recall for question ``qn``.

    Parameters
    ----------
    qn : question number, compared against ``df['qn']``.
    weight : a bool selects the unweighted ``h_recall``; anything else is passed
        on as the label weights of ``h_recall_vectorize``.
    df : predictions, needs the columns 'qn', 'user' and 'label_summary'.
    df2 : ground truth, needs the columns 'user' and 'label_summary'.
        The defaults of ``df`` / ``df2`` are the objects these names were bound
        to when the function was defined.

    Returns
    -------
    A copy of ``df2`` extended by 'candidate_lst' (label lists of the user's
    predictions for ``qn``), 'recall' and 'qn'.
    """
    def h_recall_df(row, weight=weight):
        if not isinstance(weight, bool):
            return h_recall_vectorize(row['label_summary'], row['candidate_lst'], weight)
        return h_recall(row['label_summary'], row['candidate_lst'])

    # user -> label lists of everything predicted for that user in this question
    candidates_by_user = (
        df[df['qn'] == qn]
        .groupby(['user'])['label_summary']
        .apply(list)
        .to_dict()
    )

    # df2 is ground truth, evaluate based on ground truth
    temp_df = deepcopy(df2)
    temp_df['candidate_lst'] = temp_df['user'].map(candidates_by_user)
    temp_df['recall'] = temp_df.apply(h_recall_df, axis=1)
    temp_df['qn'] = qn

    return temp_df

In [7]:
import time

start = time.time()

# metric name ('hP_<scheme>' / 'hR_<scheme>') -> {question number -> scored frame}
scores = defaultdict(lambda: defaultdict(int))
for k, weight in weighing_scheme.items():
    for qn in np.arange(1, 6, 1):
        scores['hP_' + k][qn] = get_hp(qn, weight, df=df, df2=df2)
        scores['hR_' + k][qn] = get_hr(qn, weight, df=df, df2=df2)


def _score_table(metrics, col, cols):
    """Stack the per-question frames of each metric and put the metrics side by side.

    ``col`` is the score column to pick from every frame (renamed to the metric
    name); ``cols`` are the key columns used as the common index.
    """
    per_metric = []
    for met in metrics:
        stacked = pd.concat(
            [scores[met][qn][cols + [col]].rename(columns={col: met})
             for qn in np.arange(1, 6, 1)]
        )
        per_metric.append(stacked.set_index(cols))
    return pd.concat(per_metric, axis=1).reset_index()


# precision
d = _score_table(['hP_equal', 'hP_124', 'hP_freq'], 'precision', ['user', 'qn', 'item'])
d.columns = ['user', 'qn', 'rec_item'] + ['hP-1', 'hP-2', 'hP-freq']
d.to_csv(preference_rating_scores['hP'], index=False)

# recall
d = _score_table(['hR_equal', 'hR_124', 'hR_freq'], 'recall', ['user', 'qn', 'item_1'])
d.columns = ['user', 'qn', 'gt_item'] + ['hR-1', 'hR-2', 'hR-freq']
d.to_csv(preference_rating_scores['hR'], index=False)

end = time.time()
print(end - start)

88.89433526992798
