In [1]:
import re
import pandas as pd
from scipy.stats import zscore
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.translate.bleu_score import sentence_bleu

from src.config import substitution_rating_file, substitution_rating_scores

In [2]:
# Read the rating table from the path configured in src.config.
df = pd.read_csv(substitution_rating_file)
# identical: 1 when item_1 and item_2 hold the same value, 0 otherwise.
df = df.assign(identical=df['item_1'].eq(df['item_2']).astype(int))
df.head(2)

Unnamed: 0,user,item_1,item_2,rating,identical
0,1,Bottle Water - Water,amy's - asian noodle stir-fry,1,0
1,1,Bottle Water - Water,apple,1,0


# BLEU
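With multiple reference items (i.e. ground-truth items), the modified precision for a recommended item is equivalent to the maximum precision over those references. In this notebook each candidate (`item_1`) is scored against a single reference (`item_2`).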

In [3]:
# Tokenizer that extracts runs of word characters (regex \w+); used by clean_text.
tokenizer = RegexpTokenizer(r'\w+')
def clean_text(s):
    """Normalize an item name to lowercase, space-separated alphabetic tokens.

    Every character other than ASCII letters and spaces is removed, the
    remainder is lowercased, and the tokens produced by the module-level
    ``tokenizer`` are re-joined with single spaces.

    Args:
        s: Item name; expected to be a ``str``.

    Returns:
        The cleaned string. For a non-string input (for example the float NaN
        that pandas uses for a missing cell) the value is printed so it can be
        inspected and an empty string is returned.
    """
    # Explicit type guard instead of a bare ``except:``. The bare handler also
    # swallowed unrelated failures (an undefined ``tokenizer`` on a fresh
    # kernel, KeyboardInterrupt, tokenizer bugs) and silently turned them
    # into empty strings; those now surface as real errors.
    if not isinstance(s, str):
        print(s)
        return ''
    letters_only = re.sub(r'[^a-zA-Z ]', r'', s).lower()
    return ' '.join(tokenizer.tokenize(letters_only))

def get_weighted_scores(single_cands, multi_refs, weights=(0.25, 0.25, 0.25, 0.25)):
    """Compute a sentence-level BLEU score for one candidate against references.

    Args:
        single_cands: Candidate text (a single string).
        multi_refs: Iterable of reference strings.
        weights: N-gram weights passed through to ``sentence_bleu``.

    Returns:
        The value returned by ``sentence_bleu``.
    """
    # Candidate and references go through the same clean_text -> word_tokenize
    # pipeline so they are compared on identical normalization.
    hypothesis_tokens = word_tokenize(clean_text(single_cands))
    reference_tokens = []
    for ref_text in multi_refs:
        reference_tokens.append(word_tokenize(clean_text(ref_text)))
    return sentence_bleu(reference_tokens, hypothesis_tokens, weights=weights)

# Output column name -> n-gram weights handed to sentence_bleu
# (BLEU-1: unigrams only; BLEU-2: unigrams and bigrams weighted equally).
weight_scheme = {'BLEU-1':(1,0,0,0), 'BLEU-2':(0.5, 0.5, 0, 0)}


def weighted_scores(line, weights):
    """Score one row: item_1 is the candidate, item_2 the single reference."""
    return get_weighted_scores(line['item_1'], [line['item_2']], weights=weights)


# One new column per scheme entry, computed row by row.
for metric_name, ngram_weights in weight_scheme.items():
    df[metric_name] = df.apply(weighted_scores, axis=1, args=(ngram_weights,))

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [4]:
# Write the table to the path configured under the 'BLEU' key; the row index is omitted.
df.to_csv(substitution_rating_scores['BLEU'], index=False)

# Normalization

In [5]:
# Columns to correlate; 'rating_z' comes first so that row 0 of the correlation
# matrix holds its correlation with every other column.
cols = [ 'rating_z', 'BLEU-1', 'BLEU-2', 'identical']
# z-score the ratings separately for each user. ``assign`` builds a new frame
# per group instead of writing a column into the group object yielded by
# ``groupby`` (in-place assignment there is the chained-assignment pattern
# that can raise SettingWithCopyWarning). Values and row order are unchanged.
dfs = [
    user_ratings.assign(rating_z=zscore(user_ratings['rating']))
    for _, user_ratings in df.groupby('user')
]
d1_z = pd.concat(dfs)
# Keep only the rating_z row. DataFrame.corr defaults to Pearson, so despite
# its name ``r2`` holds correlation coefficients r, not r-squared.
r2 = d1_z[cols].corr().head(1)
# Print the rounded coefficients, skipping rating_z vs. itself, joined by ' & '
# (presumably for pasting into a LaTeX table row).
print(' & '.join([str(s) for s in r2.round(3).values[0][1:]]), '\n')
r2

0.305 & 0.242 & 0.041 



Unnamed: 0,rating_z,BLEU-1,BLEU-2,identical
rating_z,1.0,0.304869,0.242221,0.040699
