In [2]:
import pandas as pd
import numpy as np
from data_reader import read_df

# Load the dataset selected by the 'test' argument through the project-local
# read_df helper (what 'test' selects is defined in data_reader, not here).
df = read_df('test')
# Preview the first rows to see which columns came back.
df.head()

Unnamed: 0,user_id,item_id,rate,review_text
0,A35HOUWHAYZZN6,B00JF3RYPM,1,This lotion gently moisturizes without irritat...
1,A1DGB6HY5C7LXU,B00KLA4INE,1,I've been doing some research into non-surgica...
2,A3EPHBMU07LZ50,B00CBD0M8Y,1,I could not detectably notice much difference ...
3,A1WSQ0QRWQC7VI,B001FSK8SA,1,arived just in time for xmas so i used it as a...
4,A2SZLNSI5KOQJT,B00D18ZDM8,1,"In my opinion, the Bic Soleil Shave & Trim Sha..."


In [3]:
# Every distinct user id that appears in the frame.
users = set(df['user_id'])

# Group the frame once instead of re-filtering it with a boolean mask for every
# user (which scans all rows per user). Rows keep their original relative order
# inside each group, so reversing each group yields the same lists as filtering.
_items_by_user = {
    user: items.tolist()[::-1]
    for user, items in df.groupby('user_id', sort=False)['item_id']
}

# user -> that user's item ids in reverse row order. Any id that groupby skips
# (e.g. a missing value) still gets an entry, with an empty list.
fav = {u: _items_by_user.get(u, []) for u in users}
# user -> [(position, item_id), ...] over the same reversed list.
golds = {u: list(enumerate(fav[u])) for u in users}

# Spot-check one user.
u = 'A3EPHBMU07LZ50'
fav[u]

['B001OMI93S', 'B00AE07GX0', 'B00AE07FQI', 'B00CBD0M8Y']

In [12]:
def user_favorites(df):
    """Map each user id to that user's item ids, in reverse row order.

    Rows are grouped on the 'user_id' column; for each group the 'item_id'
    values are taken in the order they appear in ``df`` and then reversed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'user_id' and 'item_id' columns.

    Returns
    -------
    dict
        ``{user_id: [item_id, ...]}`` with keys in groupby iteration order.
    """
    items_by_user = df.groupby('user_id')['item_id']
    return {user: items.tolist()[::-1] for user, items in items_by_user}

# Rebuild `fav` (user id -> reversed item-id list) with the groupby-based
# helper; this rebinds the `fav` created by the earlier per-user filtering cell.
fav = user_favorites(df)

['B001OMI93S', 'B00AE07GX0', 'B00AE07FQI', 'B00CBD0M8Y']

In [9]:
def recall(gold, sorting, k=20):
    """Fraction of the gold items that appear in the top ``k`` of a ranking.

    Parameters
    ----------
    gold : sequence
        Relevant (ground-truth) items.
    sorting : sequence
        Ranked candidates, best first; only the first ``k`` are considered.
    k : int, optional
        Cut-off rank. Defaults to 20, the value that used to be hard-coded.

    Returns
    -------
    float
        Number of distinct top-``k`` items found in ``gold`` divided by
        ``len(gold)``; 0.0 when ``gold`` is empty.
    """
    if len(gold) == 0:
        # Recall is undefined without relevant items; report 0.0 instead of
        # failing with ZeroDivisionError.
        return 0.0

    top_k = sorting[:k]
    return len(set(top_k).intersection(gold)) / len(gold)

def test_sorting():
    """Build a small shuffled candidate list for one hard-coded user.

    Pools the favourites of user 'A2G04D4QZAXL15' with those of user
    'A3JT29L4YFEIMJ' (both read from the global ``fav``), shuffles the pool
    in place with NumPy's global RNG and keeps its first five entries.

    Returns
    -------
    tuple
        ``(gold_items, sample)``: the first user's full favourites list and
        the (at most five-item) shuffled sample.
    """
    target_user = 'A2G04D4QZAXL15'

    # `+` builds a new list, so the shuffle below does not reorder the lists
    # stored in `fav`.
    pool = fav[target_user] + fav['A3JT29L4YFEIMJ']
    np.random.shuffle(pool)

    sample = pool[:5]
    return fav[target_user], sample

def rc_test():
    """Smoke-test ``recall`` on the sample produced by ``test_sorting``.

    Returns
    -------
    tuple
        ``(gold, sample, recall(gold, sample))``.
    """
    # test_sorting() hands back the gold item list itself, not a user id, so
    # it must not be used as a key into `fav` (a list is unhashable).
    gold, sample = test_sorting()
    return gold, sample, recall(gold, sample)

In [10]:
def dcg(gold, sorting, return_values=False, p=20):
    """Discounted cumulative gain of a ranking, using binary relevance.

    A ranked item is worth 1 if it is contained in ``gold`` and 0 otherwise.
    The gain at 1-based position 1 is undiscounted; the gain at position
    ``i >= 2`` is divided by ``log2(i)``.

    Parameters
    ----------
    gold : container
        Relevant items (membership is tested with ``in``).
    sorting : sequence
        Ranked candidates, best first.
    return_values : bool, optional
        When true, also return the running (cumulative) DCG per position.
    p : int, optional
        Only the first ``p`` ranked items are scored.

    Returns
    -------
    float or (float, numpy.ndarray)
        The DCG over the top ``p`` items, optionally with its cumulative sum.
    """
    top = sorting[:p]
    hits = np.array([int(candidate in gold) for candidate in top])

    # Divisors: 1 for the first position, then log2(2), log2(3), ... for the rest.
    tail = np.log(2 + np.arange(len(hits) - 1)) / np.log(2)
    gains = hits / np.hstack((1, tail))

    total = gains.sum()
    if return_values:
        return total, gains.cumsum()
    return total

def ndcg(gold, sorting, return_values=False):
    """Normalised DCG of a ranking: its DCG divided by the ideal DCG.

    Both DCG curves come from ``dcg`` (binary relevance, default cut-off).
    The ideal curve is the one obtained by ranking the gold items first; it
    is aligned to the number of scored positions so that ``gold`` and
    ``sorting`` may have different lengths.

    Parameters
    ----------
    gold : sequence
        Relevant items.
    sorting : sequence
        Ranked candidates, best first.
    return_values : bool, optional
        When true, also return the NDCG value at every position.

    Returns
    -------
    float or (float, numpy.ndarray)
        NDCG at the last scored position (1.0 for a perfect ranking, 0.0 when
        nothing is ranked or ``gold`` is empty), optionally followed by the
        per-position NDCG curve.
    """
    _, running = dcg(gold, sorting, return_values=True)
    _, ideal = dcg(gold, gold, return_values=True)

    n = len(running)
    if n == 0 or len(ideal) == 0:
        # Nothing was ranked, or there is nothing relevant to find: no gain
        # can be earned, and dividing by an empty/zero ideal is meaningless.
        flat = np.zeros(n)
        return (0.0, flat) if return_values else 0.0

    if len(ideal) < n:
        # Fewer gold items than ranked positions: once every gold item has
        # been placed the ideal cumulative gain stops growing, so hold its
        # last value for the remaining positions.
        ideal = np.concatenate((ideal, np.full(n - len(ideal), ideal[-1])))
    else:
        ideal = ideal[:n]

    ratios = running / ideal

    # The score is DCG_p / IDCG_p, i.e. the last point of the curve; summing
    # the whole curve would give up to p for a perfect ranking.
    score = ratios[-1]
    return (score, ratios) if return_values else score

def ndcg_test():
    """Print DCG/NDCG curves for a fixed example, then score a random sample.

    The fixed list is passed to ``dcg``/``ndcg`` as the ranking and its
    descending sort as the gold list; the per-position values are printed.
    Afterwards ``dcg`` is evaluated on the pair returned by ``test_sorting``.

    Returns
    -------
    tuple
        ``(score, cumulative_values)`` from ``dcg`` on the ``test_sorting``
        pair.
    """
    scores = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    ordered = sorted(scores, reverse=True)

    _, dcg_curve = dcg(ordered, scores, return_values=True)
    print('DCG', dcg_curve)

    _, ndcg_curve = ndcg(ordered, scores, True)
    print('NDCG', ndcg_curve)

    gold_items, sample = test_sorting()
    return dcg(gold_items, sample, True)

In [61]:
def rank_correlation_fn(gold, rank_fn):
    """Spearman rank correlation between the gold order and ``rank_fn`` ranks.

    The i-th gold item (1-based) has expected rank ``i``; its observed rank is
    ``rank_fn(item)``.

    Parameters
    ----------
    gold : sequence
        Items in their reference order.
    rank_fn : callable
        Maps an item to its observed 1-based rank.

    Returns
    -------
    float
        ``1 - 6 * sum(d_i**2) / (n * (n**2 - 1))``; ``nan`` when fewer than
        two items are given, because the coefficient is undefined there.
    """
    n = len(gold)
    if n < 2:
        # n**2 - 1 would be zero (n == 1) or the sum empty (n == 0).
        return float('nan')

    expected = np.arange(n) + 1
    observed = np.array([rank_fn(item) for item in gold])

    # spearman: the per-item terms must be summed to obtain the single
    # coefficient, matching rank_correlation.
    return 1 - (6*(observed - expected)**2 / n / (n**2 - 1)).sum()

# def test_rank_correlation

def rank_correlation(gold, ranked):
    """Spearman rank correlation between the gold order and observed ranks.

    Parameters
    ----------
    gold : sequence
        Items in their reference order; the i-th item (1-based) has expected
        rank ``i``. Only its length is used.
    ranked : sequence of int
        Observed 1-based rank of each gold item, in gold order.

    Returns
    -------
    float
        ``1 - 6 * sum(d_i**2) / (n * (n**2 - 1))``; ``nan`` when fewer than
        two items are given, because the coefficient is undefined there.

    Raises
    ------
    ValueError
        If ``ranked`` does not hold exactly one rank per gold item.
    """
    n = len(gold)
    observed = np.asarray(ranked)
    if observed.shape != (n,):
        # Without this check a length-1 `ranked` would silently broadcast.
        raise ValueError(
            'ranked must contain one rank per gold item: expected {} values, '
            'got shape {}'.format(n, observed.shape)
        )
    if n < 2:
        # n**2 - 1 would be zero (n == 1) or the sum empty (n == 0).
        return float('nan')

    expected = np.arange(n) + 1

    # spearman
    return 1 - (6*(observed - expected)**2 / n / (n**2 - 1)).sum()

def extract_rank(gold, retrieved):
    """Return the 1-based position of every gold item inside ``retrieved``.

    Parameters
    ----------
    gold : sequence
        Items to look up, in reference order.
    retrieved : sequence
        Ranked items to search; a list or a NumPy array. When an item occurs
        more than once its first position is used.

    Returns
    -------
    list
        One 1-based rank per gold item, in gold order.

    Raises
    ------
    IndexError
        If a gold item does not occur in ``retrieved``.
    """
    # Elementwise `==` needs an array; with a plain list the comparison would
    # collapse to a single False and every lookup would fail.
    retrieved = np.asarray(retrieved)

    ranks = []
    for item in gold:
        hits = np.flatnonzero(retrieved == item)
        if hits.size == 0:
            raise IndexError(
                'item {!r} is not present in retrieved'.format(item)
            )
        ranks.append(hits[0] + 1)
    return ranks



In [12]:
# Fixed list of ten small integers used as a worked example.
ex = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
# The same values arranged from largest to smallest, on a separate list.
gold = list(ex)
gold.sort(reverse=True)

In [27]:
def calc_gamma():
    """Return a callable that scores how well two matrices agree.

    The returned ``gamma(A, B)`` is the sum of the elementwise product of
    ``A`` and ``B`` divided by ``sqrt(sum(A*A) * sum(B*B))``.

    NOTE(review): the difference matrices of the globals ``ex`` and ``gold``
    are built here but never passed to ``gamma``; the function hands back the
    callable itself. Confirm whether ``gamma(A, B)`` was meant to be returned.
    """
    import numpy as np

    def dm(a):
        """Pairwise-difference matrix: entry (i, j) is ``a[j] - a[i]``."""
        values = np.array(a)
        diffs = values[np.newaxis, :] - values[:, np.newaxis]
        return diffs.astype(float)

    # Built from the notebook-level `ex` and `gold`; unused below (see NOTE).
    A, B = dm(ex), dm(gold)

    def gamma(A, B):
        numerator = (A*B).sum()
        return numerator / np.sqrt((A*A).sum() * (B*B).sum())

    return gamma

In [35]:
# Keep only the second element returned by test_sorting(): the shuffled
# sample of at most five item ids (the first element is the gold list).
ts = test_sorting()[1]


In [38]:
# Take an independent copy before shuffling. A slice (`ts[:]`) only copies
# while `ts` is a list; once `ts` has been rebound to an ndarray it is a view,
# and the in-place shuffle would then reorder `ts` as well.
st = list(ts)
np.random.shuffle(st)

In [45]:
# Rebind both sequences as NumPy arrays: extract_rank compares one item
# against the whole `retrieved` sequence with `==`, which is elementwise
# on arrays.
st = np.array(st)
ts = np.array(ts)

In [64]:
# Spearman correlation between the order of `ts` and the 1-based positions of
# the same items inside the shuffled copy `st`.
rank_correlation(ts, extract_rank(ts, st))

0.29999999999999993

In [59]:
# Display the sampled item ids currently held in `ts`.
ts

array(['B0098JU8T2', 'B005HIHD2I', 'B00AE0790U', 'B005TI7NQW',
       'B00HB2JQNM'], dtype='<U10')