In [1]:
import re
import pandas as pd

from scipy.stats import zscore
from nltk.tokenize import RegexpTokenizer
from rouge import Rouge 

from src.config import substitution_rating_file, substitution_rating_scores

In [2]:
# Shared word tokenizer: splits text into runs of word characters.
tokenizer = RegexpTokenizer(pattern=r'\w+')

# Shared ROUGE scorer, created with the library defaults.
rouge = Rouge()

def clean_text(s):
    """Normalize a string before it is handed to the ROUGE scorer.

    Every character that is not an ASCII letter or a space is deleted, the
    remainder is lower-cased, split into word tokens with the module-level
    ``tokenizer`` and re-joined with single spaces.

    Args:
        s: The raw text. Expected to be a ``str``.

    Returns:
        The normalized text, or ``''`` when ``s`` is not a string. In that
        case the offending value is printed so it can be inspected.
    """
    try:
        letters_only = re.sub(r'[^a-zA-Z ]', r'', s)
    except TypeError:
        # re.sub raises TypeError for non-string input. Only that failure is
        # treated as "no text"; anything else (including KeyboardInterrupt)
        # is allowed to propagate instead of being silently swallowed.
        print(s)
        return ''
    return ' '.join(tokenizer.tokenize(letters_only.lower()))

def get_rouge_scores(single_cands, reference):
    """Score one candidate text against one reference text with ROUGE.

    Both texts are normalized with ``clean_text`` first.

    Args:
        single_cands: The candidate (hypothesis) text.
        reference: The reference text.

    Returns:
        A flat list of nine floats ordered as
        ``[rouge-1 f, p, r, rouge-2 f, p, r, rouge-l f, p, r]``.
        When either text is empty after cleaning, all nine values are 0.0.
    """
    metrics = ('rouge-1', 'rouge-2', 'rouge-l')
    stats = ('f', 'p', 'r')

    hypothesis = clean_text(single_cands)
    target = clean_text(reference)

    # clean_text yields '' for unusable input, and the rouge package rejects
    # an empty hypothesis/reference with an exception. There is no overlap to
    # measure in that case, so report zeros rather than aborting the caller.
    if not hypothesis or not target:
        return [0.0] * (len(metrics) * len(stats))

    s = rouge.get_scores(hypothesis, target)[0]
    return [s[metric][stat] for metric in metrics for stat in stats]
                
def rouge_scores(line):
    """Row adapter for ``DataFrame.apply(axis=1)``.

    Reads the ``'item_1'`` (candidate) and ``'item_2'`` (reference) fields of
    ``line`` and returns the list produced by ``get_rouge_scores`` for them.
    """
    return get_rouge_scores(line['item_1'], line['item_2'])

In [3]:
# Load the substitution ratings.
df = pd.read_csv(substitution_rating_file)

# Column names in the same order that rouge_scores emits its values:
# for each metric, the f-score, precision and recall.
score_cols = [
    '{}_{}'.format(metric, stat)
    for metric in ('ROUGE-1', 'ROUGE-2', 'ROUGE-L')
    for stat in ('f', 'p', 'r')
]

# Transpose the per-row score lists into one sequence per score column.
for score_col, values in zip(score_cols, zip(*df.apply(rouge_scores, axis=1))):
    df[score_col] = values

# Keep only the recall variants and drop their '_r' suffix.
cols = ['user', 'item_1', 'item_2', 'rating', 'ROUGE-1_r', 'ROUGE-2_r', 'ROUGE-L_r']
recall_names = {
    'ROUGE-1_r': 'ROUGE-1',
    'ROUGE-2_r': 'ROUGE-2',
    'ROUGE-L_r': 'ROUGE-L',
}
df = df[cols].rename(columns=recall_names)

In [4]:
# Persist the ratings together with their ROUGE recall scores; the target
# path comes from the project configuration.
filename = substitution_rating_scores['ROUGE']
df.to_csv(path_or_buf=filename, index=False)

# Normalization: z-score ratings per user, then correlate with ROUGE recall

In [5]:
# Columns entering the correlation: the per-user standardized rating first,
# followed by the three ROUGE recall scores.
cols = ['rating_z', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L']

# Standardize ratings within each user. `assign` builds a new frame per group
# instead of writing a column onto the slice yielded by groupby, which avoids
# pandas' SettingWithCopyWarning and leaves `df` itself untouched.
dfs = [
    user_ratings.assign(rating_z=zscore(user_ratings['rating']))
    for _, user_ratings in df.groupby('user')
]
d1_z = pd.concat(dfs)

# First row of the correlation matrix = correlation of rating_z with every
# column in `cols` (the leading entry is rating_z with itself).
r2 = d1_z[cols].corr().head(1)

# Print the three ROUGE correlations as ' & '-separated values, then display
# the row itself as the cell output.
print(' & '.join([str(s) for s in r2.round(3).values[0][1:]]), '\n')
r2

0.318 & 0.242 & 0.318 



Unnamed: 0,rating_z,ROUGE-1,ROUGE-2,ROUGE-L
rating_z,1.0,0.31829,0.241879,0.317582
