In [1]:
import re
import itertools
import numpy as np
import pandas as pd

from collections import defaultdict
from sklearn.metrics import recall_score

from src.config import mturk_simjudgement_file, mturk_substitution_rating_scores
from src.config import label_file, weighing_scheme, lst_ref
from src.io import load_pickle

import warnings
warnings.filterwarnings('ignore')

In [2]:
def process_food_name(s1):
    """Cut a food name at its first quantity separator and normalise it.

    Some names are written in the users' own format, so a quantity may
    follow the name after a comma (e.g. ``"name, 2 cups"``).

    Parameters
    ----------
    s1 : str
        Raw food name.

    Returns
    -------
    str
        ``clean_name`` applied to the part of ``s1`` that precedes the first
        separator found.  When no separator is found, ``clean_name`` is
        applied to the whole name with parenthesised content removed
        (removal only happens when parentheses are balanced).
    """
    # separators: ", " + any of (decimal, comma-grouped number, integer, fraction) + " "
    exp = r", \d+\.\d+ |, \d+\,\d+ |, \d+ |, \d+\/\d+ "
    # remove content in parenthesis for finding the separator
    if s1.count('(') == s1.count(')'):
        s2 = re.sub(r'[(].*?[\)]', ' ', s1)
    else:
        s2 = s1
    # Explicit "no match" check instead of a bare ``except`` so that genuine
    # errors are no longer silently swallowed.
    separator = re.search(exp, s2)
    if separator is None:
        return clean_name(s2)
    # The split is done on the original string, so parenthesised text that
    # precedes the separator is kept in the result.
    return clean_name(s1.split(separator.group(0))[0])

def clean_name(name):
    """Normalise a food name: expand "w/" shorthands, squeeze spaces, lowercase.

    Tabs and newlines become spaces, "w/o" becomes " no " and "w/" becomes a
    space.  The result is stripped, runs of spaces are collapsed to a single
    space, and the text is lowercased.
    """
    # Order matters: "w/o" has to be rewritten before the shorter "w/".
    substitutions = (("\t", " "), ("\n", " "), ("w/o", " no "), ("w/", " "))
    for old, new in substitutions:
        name = name.replace(old, new)
    squeezed = re.sub(' +', ' ', name.strip())
    return squeezed.lower()

def token_transform(t):
    """Return a token together with naive plural spellings of it.

    Parameters
    ----------
    t : str
        A single token.

    Returns
    -------
    list of str
        ``[t, t + 's', t + 'es']``, plus ``t[:-1] + 'ies'`` when ``t`` ends
        in ``'y'``.
    """
    tokens = [t, t + 's', t + 'es']
    # endswith() is safe on an empty token, whereas t[-1] raises IndexError.
    if t.endswith('y'):
        tokens.append(t[:-1] + 'ies')
    return tokens


In [3]:
import time
start = time.time()
# Read the judgement CSV, then derive a cleaned-name column for each of the
# two raw item columns (gt_item -> item_10, rec_item -> item_20).
df = pd.read_csv(mturk_simjudgement_file)
for raw_col, clean_col in (('gt_item', 'item_10'), ('rec_item', 'item_20')):
    df[clean_col] = df[raw_col].apply(process_food_name)
end = time.time()
print(end-start)

0.07882094383239746


In [4]:
# index -> label
# NOTE(review): load_pickle presumably unpickles label_file; unpickling is only
# safe for trusted files -- confirm against src.io.
labels = load_pickle(label_file)
# label index -> list of food names carrying that label
label_with_food = defaultdict(list)
def group_food_name(line):
    """Append the row's 'food_name' under every index in its 'label_summary'."""
    s = line['food_name']
    for i in line['label_summary']:
        label_with_food[i].append(s)
# A plain loop instead of DataFrame.apply(axis=1): the function is only called
# for its side effect, apply would build a throwaway result, and older pandas
# releases may invoke the function twice on the first row (duplicating entries).
for record in labels[['food_name', 'label_summary']].to_dict('records'):
    group_food_name(record)

# every label index that occurs in any row's label_summary
l0 = {idx for summary in labels['label_summary'].tolist() for idx in summary}

# '_' connected tokens for l0 tags
concat_list = list(zip(labels['cat_info'], labels['label_summary']))
# label name -> label index (pairs cat_info entries with label_summary entries)
label_index = {}
for cat_names, cat_indices in concat_list:
    label_index.update(zip(cat_names, cat_indices))
# inverse map: label index -> label name
index_label = {idx: name for name, idx in label_index.items()}
# labels whose '__'-separated name has exactly three parts, keyed by the last part
label_index_l0 = {}
for full_name, idx in label_index.items():
    parts = full_name.split('__')
    if len(parts) == 3:
        label_index_l0[parts[-1]] = idx
l0_tags = sorted(label_index_l0.keys())


# last-part tag -> food names grouped under that label's index
label_name_l0 = {tag: label_with_food[idx] for tag, idx in label_index_l0.items()}

# food name -> its cat_info list, and the same reduced to three-part tags
name_labels = labels.set_index('food_name').to_dict()['cat_info']
name_label_0 = {}
for food, cats in name_labels.items():
    name_label_0[food] = [c.split('__')[-1] for c in cats if len(c.split('__')) == 3]

# Split the tags into those whose text (in any singular/plural spelling)
# occurs in at least one of their own food names, and those that never do.
matched_label_l0, unmatched_label_l0 = [], []
for l, s_lst in label_name_l0.items():
    # all combinations of per-token spellings, joined with spaces
    token_options = [token_transform(t) for t in l.split('_')]
    l_primes = [' '.join(combo) for combo in itertools.product(*token_options)]

    # drop apostrophes, turn '&' into a space, squeeze whitespace
    cleaned_names = [' '.join(s.replace("'", '').replace('&', ' ').split()) for s in s_lst]
    matched = any(l_prime in cleaned for cleaned in cleaned_names for l_prime in l_primes)

    target = matched_label_l0 if matched else unmatched_label_l0
    target.append(l)

def match_labels(s):
    """Return the sorted full label names (all hierarchy levels) for an item.

    If ``s`` is a key of ``name_label_0`` its stored tags are used.  Otherwise
    ``s`` is normalised (apostrophes removed, '&' replaced by a space,
    whitespace squeezed) and every tag in ``matched_label_l0`` whose
    singular/plural spelling occurs as a substring is collected.

    Each tag found is mapped to its full '__'-separated name, and that name
    plus its one-part and two-part prefixes are returned, de-duplicated and
    sorted.  An item with no tags yields an empty list.
    """
    if s in name_label_0.keys():
        found = name_label_0[s]
    else:
        text = ' '.join(s.replace("'", '').replace('&', ' ').split())
        found = []
        for tag in matched_label_l0:
            token_options = [token_transform(t) for t in tag.split('_')]
            spellings = itertools.product(*token_options)
            if any(' '.join(combo) in text for combo in spellings):
                found.append(tag)

    # full label names
    expanded = set()
    for tag in found:
        full_name = index_label[label_index_l0[tag]]
        parts = full_name.split('__')
        expanded.update((parts[0], '__'.join(parts[:2]), full_name))
    return sorted(expanded)

In [5]:
import time
start = time.time()
# label names for both cleaned item columns first ...
for name_col, tags_col in (('item_10', 'item_l1'), ('item_20', 'item_l2')):
    df[tags_col] = df[name_col].apply(match_labels)
# ... then the matching label indices (an empty tag list maps to an empty list)
for tags_col, index_col in (('item_l1', 'l1'), ('item_l2', 'l2')):
    df[index_col] = df[tags_col].apply(lambda tags: [label_index[t] for t in tags])
end = time.time()
print(end-start)

0.05189657211303711


In [6]:
# recall_score is imported from sklearn.metrics in the first cell.
# Cache shared with get_vector: maps tuple(item) -> the vector built for that item.
item_vec_dic = {}

def h_recall_vectorize(gt_item, c_item, weight):
    """Weighted recall of one candidate against one ground-truth item.

    Arguments are the ground-truth label indices, the candidate label indices
    and the per-label weights; the work is delegated unchanged to
    ``_h_recall_vectorize``.
    """
    score = _h_recall_vectorize(gt_item, c_item, weight)
    return score

def _h_recall_vectorize(gt_item, c_item, weight):
    if c_item == gt_item:
        return 1
    else:
        # recall score: out of the ground truth item, # of predictions
        # recall_score(y_true, y_pred, sample_weight=None)
        return recall_score(get_vector(gt_item), get_vector(c_item), sample_weight=weight)

def get_vector(item, item_vec_dic=item_vec_dic, shape=len(lst_ref)):
    """Return an integer vector of zeros with ones at the positions in ``item``.

    Vectors are memoised in ``item_vec_dic`` under ``tuple(item)``, and the
    cached array itself is returned (not a copy).  Note that the cache key
    does not include ``shape``, so a cached vector is returned as-is even if
    a different ``shape`` is requested later.
    """
    key = tuple(item)
    if key not in item_vec_dic:
        vec = np.zeros(shape, dtype=int)
        vec[item] = 1
        item_vec_dic[key] = vec
    return item_vec_dic[key]

def h_recall(gt_item, c_item):
    """Unweighted recall: share of distinct ground-truth labels found in the candidate.

    Parameters
    ----------
    gt_item : iterable
        Ground-truth labels.
    c_item : iterable
        Candidate labels.

    Returns
    -------
    float
        ``|set(c_item) & set(gt_item)| / |set(gt_item)|``.  When the ground
        truth has no labels the ratio is undefined; in that case 1.0 is
        returned if the candidate is empty too (identical items, matching the
        shortcut in ``_h_recall_vectorize``) and 0.0 otherwise, instead of
        raising ZeroDivisionError.
    """
    gt_labels = set(gt_item)
    c_labels = set(c_item)
    if not gt_labels:
        return 1.0 if not c_labels else 0.0
    return len(c_labels & gt_labels) / len(gt_labels)

def get_hr(weight, df=df, column_name='hR'):
    """Score every row's ('l1', 'l2') pair and store it in ``df[column_name]``.

    A boolean ``weight`` (either value) selects the unweighted ``h_recall``;
    anything else is passed as the weight to ``h_recall_vectorize``.  The
    frame is modified in place and also returned.
    """
    if isinstance(weight, bool):
        def score_row(row):
            return h_recall(row['l1'], row['l2'])
    else:
        def score_row(row):
            return h_recall_vectorize(row['l1'], row['l2'], weight)
    df[column_name] = df.apply(score_row, axis=1)
    return df

In [7]:
import time
start = time.time()

ks = ['124', 'freq']
# one unweighted run, then one run per weighing scheme key
runs = [(False, 'hMatch-1')] + [(weighing_scheme[k], 'hMatch_'+ k) for k in ks]
for weight, column_name in runs:
    df = get_hr(weight=weight, df=df, column_name=column_name)
end = time.time()
print(end-start)

12.184744119644165


In [8]:
# Keep only the columns that are exported, then give the two weighted score
# columns their output names.
cols = ['gt_item', 'rec_item', 'gt_item_id', 'rec_item_id', 'hint', 'SimJudgement',
        'hMatch-1', 'hMatch_124', 'hMatch_freq']
df = df[cols].rename(columns={'hMatch_124': 'hMatch-2', 'hMatch_freq': 'hMatch-freq'})

In [9]:
# Destination registered under the 'hMatch' key of the config mapping.
filename = mturk_substitution_rating_scores['hMatch']
# Write the frame as CSV without the index column.
df.to_csv(filename, index=False)