In [1]:
import itertools
import re
import time
import warnings
from collections import defaultdict

import numpy as np
import pandas as pd
from scipy.stats import zscore
from sklearn.metrics import precision_score 

from src.config import substitution_rating_file, label_file, substitution_rating_scores, weighing_scheme
from src.io import load_pickle

warnings.filterwarnings('ignore')

In [2]:
# Some names are written in the users' format for easy user reference.
def process_food_name(s1):
    """Return the cleaned food name that precedes the quantity part of ``s1``.

    The quantity separator is ", " followed by an integer, a decimal (written
    with "." or ","), or a fraction, followed by a space. The separator is
    searched for in a copy of ``s1`` whose parenthesised text has been blanked
    out (only when the parentheses are balanced); ``s1`` itself is then split
    on the first separator found.

    Parameters
    ----------
    s1 : str
        Raw item name.

    Returns
    -------
    str
        ``clean_name`` of the text before the first separator, or, when no
        separator is found, ``clean_name`` of the parenthesis-stripped name.
    """
    # separators: ", " + any of (integer, decimal & fraction) +" "
    exp = r", \d+\.\d+ |, \d+\,\d+ |, \d+ |, \d+\/\d+ "
    # remove content in parenthesis for finding the separator
    if s1.count('(') == s1.count(')'):
        s2 = re.sub(r'[(].*?[\)]', ' ', s1)
    else:
        s2 = s1
    # Explicit "no separator" branch instead of a bare except, so unrelated
    # errors are no longer silently swallowed.
    separator = re.search(exp, s2)
    if separator is None:
        return clean_name(s2)
    return clean_name(s1.split(separator.group(0))[0])

def clean_name(name):
    """Normalise a food name: expand "w/o" / "w/", squeeze spaces, lowercase.

    Tabs and newlines become spaces, "w/o" becomes " no " and "w/" becomes a
    space (in that order, so "w/o" is handled before its "w/" prefix). The
    result is stripped, runs of spaces are collapsed to one, and the string is
    lowercased.
    """
    substitutions = (("\t", " "), ("\n", " "), ("w/o", " no "), ("w/", " "))
    for old, new in substitutions:
        name = name.replace(old, new)
    single_spaced = re.sub(' +', ' ', name.strip())
    return single_spaced.lower()

def token_transform(t):
    """Return ``t`` together with naive plural spellings of it.

    Always yields ``[t, t + 's', t + 'es']``; when ``t`` ends in "y" the
    "-ies" form (``t`` without the final "y", plus "ies") is appended as well.
    An empty token is accepted and simply gets the first three forms.
    """
    tokens = [t, t + 's', t + 'es']
    # endswith() is safe on an empty string, unlike t[-1].
    if t.endswith('y'):
        tokens.append(t[:-1] + 'ies')
    return tokens


In [3]:
# Load the substitution ratings and derive a cleaned name for both items of
# each pair. perf_counter() is a monotonic clock intended for measuring
# elapsed intervals; the printed value is the elapsed time in seconds.
start = time.perf_counter()
df = pd.read_csv(substitution_rating_file)
df['item_10'] = df['item_1'].apply(process_food_name)
df['item_20'] = df['item_2'].apply(process_food_name)
end = time.perf_counter()
print(end - start)

0.060837507247924805


In [4]:
# index -> label
# Load the label table from `label_file`; the code below reads its `food_name`,
# `label_summary` and `cat_info` columns.
# NOTE(review): `load_pickle` presumably unpickles the file — only load files
# from a trusted source.
labels = load_pickle(label_file)
# Maps each `label_summary` entry to the list of food names that carry it
# (filled by `group_food_name`).
label_with_food = defaultdict(list)
def group_food_name(line):
    """Register a row's food name under each of its ``label_summary`` entries.

    Appends ``line['food_name']`` to the module-level ``label_with_food``
    mapping once per entry of ``line['label_summary']``. Returns ``None``.
    """
    food_name = line['food_name']
    for label_id in line['label_summary']:
        label_with_food[label_id].append(food_name)
# Row-wise apply is used only for its side effect of filling `label_with_food`.
labels.apply(group_food_name, axis=1)

l0 = set(i for j in labels['label_summary'].tolist() for i in j)
# labels['max'] = labels['label_summary'].apply(lambda s: max(s) if len(s)>0 else 0)

# '_' connected tokens for l0 tags
concat_list = list(zip(labels['cat_info'], labels['label_summary']))
# Full label name -> label index, pairing each row's `cat_info` entries with
# its `label_summary` entries positionally.
label_index = dict()
for cat_names, cat_ids in concat_list:
    label_index.update(zip(cat_names, cat_ids))
# Inverse mapping: label index -> full label name.
index_label = {idx: full_name for full_name, idx in label_index.items()}
# Last '__' component of every three-part label name -> label index.
label_index_l0 = {
    full_name.split('__')[-1]: idx
    for full_name, idx in label_index.items()
    if len(full_name.split('__')) == 3
}
l0_tags = sorted(label_index_l0.keys())

# l0 tag -> food names registered under that tag's index.
label_name_l0 = {tag: label_with_food[idx] for tag, idx in label_index_l0.items()}

name_labels = labels.set_index('food_name').to_dict()['cat_info']
name_label_0 = {
    food: [full_name.split('__')[-1] for full_name in cat_names if len(full_name.split('__')) == 3]
    for food, cat_names in name_labels.items()
}

# A tag counts as "matched" when any singular/plural spelling of it occurs as a
# substring of at least one of its own (normalised) food names.
matched_label_l0, unmatched_label_l0 = [], []
for tag, food_names in label_name_l0.items():
    token_forms = [token_transform(token) for token in tag.split('_')]
    variants = [' '.join(forms) for forms in itertools.product(*token_forms)]
    # Normalise each food name once instead of once per variant.
    normalised = [
        ' '.join(food.replace("'", '').replace('&', ' ').split())
        for food in food_names
    ]
    # any() stops at the first hit instead of scanning every name x variant.
    if any(variant in food for food in normalised for variant in variants):
        matched_label_l0.append(tag)
    else:
        unmatched_label_l0.append(tag)

# get all labels associated with item
# Cache of l0 label -> list of its space-joined surface forms, so the variants
# are generated once per label rather than once per label per row.
_L0_VARIANT_CACHE = {}


def _l0_variants(l0_label):
    """Return the space-joined singular/plural spellings of an '_'-joined label."""
    if l0_label not in _L0_VARIANT_CACHE:
        token_forms = [token_transform(token) for token in l0_label.split('_')]
        _L0_VARIANT_CACHE[l0_label] = [
            ' '.join(forms) for forms in itertools.product(*token_forms)
        ]
    return _L0_VARIANT_CACHE[l0_label]


def match_labels(s):
    """Return the sorted, de-duplicated label names associated with item ``s``.

    If ``s`` is a key of ``name_label_0`` its l0 labels are taken from there.
    Otherwise ``s`` is normalised (apostrophes removed, "&" turned into a space,
    whitespace squeezed) and every label in ``matched_label_l0`` with a
    spelling variant occurring as a substring of it is kept.

    Each l0 label found is expanded to its full '__'-joined name via
    ``label_index_l0`` and ``index_label``; the first component, the first two
    components joined by '__', and the full name are all included.

    Parameters
    ----------
    s : str
        Item name.

    Returns
    -------
    list of str
        Sorted unique label names; empty when nothing matched.
    """
    if s in name_label_0:
        found = name_label_0[s]
    else:
        text = ' '.join(s.replace("'", '').replace('&', ' ').split())
        # any() stops at the first matching variant of each label.
        found = [
            l0_label
            for l0_label in matched_label_l0
            if any(variant in text for variant in _l0_variants(l0_label))
        ]

    # full label names
    all_labels = set()
    for l0_label in found:
        full_name = index_label[label_index_l0[l0_label]]
        parts = full_name.split('__')
        all_labels.update((parts[0], '__'.join(parts[:2]), full_name))
    return sorted(all_labels)

In [5]:
# Attach the matched label names to both items of each pair, then translate
# those names into label indices through `label_index`.
start = time.perf_counter()
df['item_l1'] = df['item_10'].apply(match_labels)
df['item_l2'] = df['item_20'].apply(match_labels)
# An empty name list already yields an empty index list, so no guard is needed.
df['l1'] = df['item_l1'].apply(lambda names: [label_index[name] for name in names])
df['l2'] = df['item_l2'].apply(lambda names: [label_index[name] for name in names])
end = time.perf_counter()
print(end - start)

23.945932149887085


In [6]:
def get_hp(gt_lst, rec_lst, weight):
    """Average ``h_precision_vectorize`` over every entry of ``rec_lst``.

    Parameters
    ----------
    gt_lst : sequence
        Passed unchanged to ``h_precision_vectorize`` for each recommendation.
    rec_lst : sequence
        Entries scored one by one against ``gt_lst``.
    weight : array-like
        Passed unchanged to ``h_precision_vectorize``.

    Returns
    -------
    float
        Mean of the per-entry scores, or ``np.nan`` when ``rec_lst`` is empty.
    """
    # Return NaN explicitly rather than relying on np.mean([]), which only
    # yields NaN after emitting a RuntimeWarning.
    if len(rec_lst) == 0:
        return np.nan
    scores = [h_precision_vectorize(gt_lst, rec, weight) for rec in rec_lst]
    return np.mean(scores)

def h_precision_vectorize(gt_lst, rec, weight):
    """Best weighted precision of ``rec`` against any entry of ``gt_lst``.

    ``rec`` and each entry of ``gt_lst`` are turned into 0/1 indicator vectors
    of length ``weight.shape[0]`` (ones at the given positions). The result is
    the maximum over the ``gt_lst`` entries of
    ``precision_score(gt_indicator, rec_indicator, sample_weight=weight)``.

    Raises ``ValueError`` (from ``max``) when ``gt_lst`` is empty.
    """
    n_labels = weight.shape[0]

    rec_indicator = np.zeros(n_labels, dtype=int)
    rec_indicator[rec] = 1

    gt_indicators = np.zeros((len(gt_lst), n_labels), dtype=int)
    for row, gt_item in zip(gt_indicators, gt_lst):
        # `row` is a view into gt_indicators, so this writes through.
        row[gt_item] = 1

    per_gt_scores = [
        precision_score(gt_row, rec_indicator, sample_weight=weight)
        for gt_row in gt_indicators
    ]
    return max(per_gt_scores)

def get_hr(gt_lst, rec_lst, weight):
    """Return ``get_hp`` evaluated with ``gt_lst`` and ``rec_lst`` swapped."""
    swapped_gt, swapped_rec = rec_lst, gt_lst
    return get_hp(swapped_gt, swapped_rec, weight)

def _hr_for_scheme(line, scheme):
    """Score one row with ``get_hr`` using the weights ``weighing_scheme[scheme]``.

    ``line['l1']`` and ``line['l2']`` are each wrapped in a one-element list
    and passed as the first and second argument of ``get_hr``.
    """
    return get_hr([line['l1']], [line['l2']], weight=weighing_scheme[scheme])


def get_hr_1(line):
    """Row score using the 'equal' weighing scheme."""
    return _hr_for_scheme(line, 'equal')


def get_hr_2(line):
    """Row score using the '124' weighing scheme."""
    return _hr_for_scheme(line, '124')


def get_hr_freq(line):
    """Row score using the 'freq' weighing scheme."""
    return _hr_for_scheme(line, 'freq')

# Score every rating row under the three weighing schemes; the printed value
# is the elapsed time in seconds, measured with the monotonic perf_counter().
start = time.perf_counter()
df['hMatch-1'] = df.apply(get_hr_1, axis=1)
df['hMatch-2'] = df.apply(get_hr_2, axis=1)
df['hMatch-freq'] = df.apply(get_hr_freq, axis=1)
end = time.perf_counter()
print(end - start)

14.289854526519775


In [7]:
# Keep the identifying columns, the rating and the three hMatch scores, then
# write them to the CSV path configured under substitution_rating_scores['hMatch'].
cols = ['user', 'item_1', 'item_2', 'rating'] + ['hMatch-1', 'hMatch-2', 'hMatch-freq']
filename = substitution_rating_scores['hMatch']
df = df.loc[:, cols]
df.to_csv(filename, index=False)

# Normalize

In [None]:
# cols = [ 'rating_z', 'hMatch-1', 'hMatch-2', 'hMatch-freq']

# from scipy.stats import zscore
# dfs = []
# for u, df_temp in d1.groupby('user'):
#     df_temp['rating_z'] = zscore(df_temp['rating'])
#     dfs.append(df_temp)
# d1_z = pd.concat(dfs)
# r2 = d1_z[cols].corr().head(1).round(3)
# print(' & '.join([str(s) for s in r2.values[0][1:]]), '\n')
# r2