In [236]:
import pandas as pd
import numpy as np
from data_reader import read_df, interaction_matrix

def user_gold(df, max_k=20):
    """
    Build the per-user gold item lists together with the interaction matrix.

    Parameters
    ----------
    df : DataFrame with at least the columns 'user_id' and 'item_id'.
    max_k : upper bound applied to the returned K.

    Returns
    -------
    R : the matrix returned by interaction_matrix(df)
    gold : dict mapping a user's index to the list of that user's item
        indices, in the reverse of the order in which they appear in df
    K : min(largest per-user count of positive entries in R, max_k)
    u_len : per-user count of positive entries in R
    """
    # user -> index, item -> index, and the interaction matrix itself
    u2i, i2i, R = interaction_matrix(df)

    # how many entries of each row of R are strictly positive
    u_len = (R > 0).sum(axis=1)
    K = min(u_len.max(), max_k)

    gold = {}
    for user, group in df.groupby('user_id')['item_id']:
        # reverse the order in which this user's items occur in df
        items = group.tolist()[::-1]
        gold[u2i[user]] = [i2i[item] for item in items]

    # G = np.ones((len(u2i), K)) * np.nan

    # for i in range(G.shape[0]):
    #     g = gold[i]
    #     G[i, :u_len[i]] = g[:max_k]

    return R, gold, K, u_len


def topk(R, k):
    '''
    Return, for every row of R, the column indices of its k largest
    values, ordered from the largest value to the smallest.

    R : 2-D array-like of scores (rows are ranked independently)
    k : number of indices to keep per row; values larger than the number
        of columns are clamped, and k <= 0 yields an empty (n_rows, 0) array

    returns an integer array of shape (n_rows, min(k, n_cols))
    '''
    R = np.asarray(R)
    n_items = R.shape[1]
    k = min(int(k), n_items)

    if k <= 0:
        # Slicing with [:, -0:] would return every column, so answer
        # the empty request explicitly.
        return np.empty((R.shape[0], 0), dtype=np.intp)

    # argpartition only supports the default 'introselect' kind; passing
    # kind='stable' raises ValueError.  A non-negative kth keeps the slice
    # below unambiguous.
    split = n_items - k
    A = np.argpartition(R, kth=split, axis=1)[:, split:]

    # The partition is unordered inside the top-k slice: sort those k
    # values and flip so the best item comes first.
    S = np.take_along_axis(R, A, axis=1)
    return np.take_along_axis(A, S.argsort(axis=1), axis=1)[:, ::-1]


def eval_score(user, fn, guess, gold, u_len):
    '''
    Apply a metric to a single user.

    Both the user's row of `guess` and the user's entry in `gold` are cut
    to the first u_len[user] elements, then handed to `fn` as
    fn(gold_items, retrieved_items); whatever `fn` returns is returned.
    (Originally written as the helper for recall and ndcg.)
    '''
    cutoff = u_len[user]

    top_items = guess[user, :cutoff]
    wanted = gold[user][:cutoff]

    return fn(wanted, top_items)


def recall_fn(gold, sorting, k=20):
    """
    Recall@k: the fraction of `gold` items found among the first k
    entries of `sorting`.

    gold : collection of relevant items
    sorting : ranked sequence of retrieved items, best first
    k : how many leading entries of `sorting` to consider (default 20,
        the cut-off that used to be hard-coded)

    Returns a float in [0, 1]; 0.0 when `gold` is empty, since there is
    nothing to recall and the ratio would otherwise divide by zero.
    """
    if len(gold) == 0:
        return 0.0

    best = sorting[:k]
    return len(set(best).intersection(gold)) / len(gold)


def dcg(gold, guess, return_values=False, p=20):
    """
    Discounted cumulative gain with binary relevance.

    A guessed item is worth 1 when it occurs in `gold` and 0 otherwise.
    The gain at position 1 is undiscounted; the gain at position i >= 2
    is divided by log2(i).  Only the first min(p, len(gold), len(guess))
    guesses are scored.

    Returns the total gain, or (total, running cumulative gain per
    position) when return_values is true.
    """
    depth = min(p, len(gold), len(guess))
    ranked = guess[:depth]

    # binary relevance of each ranked item
    hits = np.array([int(candidate in gold) for candidate in ranked])

    # discounts: 1, log2(2), log2(3), ...
    discount = np.ones(len(hits))
    discount[1:] = np.log(np.arange(2, len(hits) + 1)) / np.log(2)

    gains = hits / discount
    total = gains.sum()

    if return_values:
        return total, gains.cumsum()
    return total


def ndcg_fn(gold, guess, return_values=False, p=20):
    """
    Normalised DCG of `guess` against `gold`, cut off at p.

    The ideal DCG is that of `gold` ranked against itself, so it covers
    min(p, len(gold)) positions even when `guess` is shorter.

    gold : relevant items (the ideal ranking)
    guess : retrieved items, best first
    return_values : when true, also return the nDCG at every position
    p : cut-off, forwarded to dcg() for both the ideal and the achieved
        ranking

    Returns the nDCG at the cut-off, or (nDCG at the cut-off, array of
    nDCG per position) when return_values is true.  If the ideal DCG is
    zero (e.g. `gold` is empty) the score is 0.0 and the per-position
    array is empty.
    """
    ideal_score, ideal_cum = dcg(gold, gold, return_values=True, p=p)
    score, cum = dcg(gold, guess, return_values=True, p=p)

    if ideal_score == 0:
        # nothing could have been gained; avoid 0/0
        return (0.0, np.zeros(0)) if return_values else 0.0

    if not return_values:
        return score / ideal_score

    # A guess list shorter than the ideal one gains nothing past its
    # end, so its cumulative DCG stays flat at the final value.
    missing = len(ideal_cum) - len(cum)
    if missing > 0:
        cum = np.concatenate((cum, np.full(missing, score)))

    ndcg = cum / ideal_cum

    # the headline score is the value at the cut-off, matching the
    # scalar branch above, not the value at position 1
    return ndcg[-1], ndcg


# Evaluation data: read_df comes from data_reader and is called with the
# name 'test' (presumably the held-out split -- confirm in data_reader).
df = read_df('test')
# distinct user ids; later cells only use len(users)
users = set(df['user_id'])

# R: interaction matrix, gold: user index -> item indices,
# K: top-k size (at most max_k=20), u_len: positive entries per user in R
R, gold, K, u_len = user_gold(df)

In [139]:
def rank_correlation_fn(gold, scores):
    '''
    Spearman rank correlation between the order of `gold` and the order
    induced by `scores` on the same items.

    gold: sequence of item indices
    scores: user'th row of scoring matrix S (approximation for R)

    The score of every gold item is looked up and the items are sorted by
    ascending score; the result is 1.0 when the scores increase along
    `gold` and -1.0 when they decrease.  Tied scores keep their gold
    order (stable sort), so the result is deterministic.

    Returns nan when fewer than two items are given: the coefficient is
    undefined there and the formula would divide by zero.
    '''
    n = len(gold)
    if n < 2:
        return np.nan

    a = np.arange(n)
    b = np.asarray(scores)[gold].argsort(kind='stable')

    # spearman: 1 - 6 * sum(d^2) / (n * (n^2 - 1))
    d2 = (b - a) ** 2
    return 1 - 6 * d2.sum() / (n * (n**2 - 1))


def rank_correlation(gold, S):
    """
    Rank correlation for every user.

    gold : mapping (or sequence) indexed by 0 .. len(gold)-1 that gives
        each user's gold item list
    S : scoring matrix; row u is scored against gold[u]

    Returns an array with one rank_correlation_fn value per user.
    """
    per_user = []
    for u in range(len(gold)):
        per_user.append(rank_correlation_fn(gold[u], S[u]))

    return np.array(per_user)

In [261]:
# Score every user's gold list against the rows of R itself, i.e. R is
# passed in the place of the scoring matrix S.
rank_correlation(gold, R)

array([1., 1., 1., ..., 1., 1., 1.])

In [156]:
# Top-K item indices per user, best first.
guess = topk(R, K)

# eval_score calls fn(gold_items, retrieved_items), which is the
# signature of recall_fn and ndcg_fn.  rank_correlation_fn needs a full
# row of scores rather than a top-K index list, so it is evaluated
# through rank_correlation(gold, S) instead.
fn = recall_fn
# fn = ndcg_fn

# eval_score takes u_len as its fifth argument; omitting it raises TypeError.
scores = [eval_score(user, fn, guess, gold, u_len) for user in range(len(users))]

In [200]:
# archived code

def test_sorting():
    """
    Build a small retrieval example from two hard-coded users.

    Reads the global `fav` (user id -> list of items), which must already
    exist in the kernel: it is not defined at module level in this
    notebook.  The two users' lists are concatenated into a new list,
    shuffled with numpy's global RNG, and the first five items kept.

    Returns (favourites of the first user, the five sampled items).
    """
    user = 'A2G04D4QZAXL15'
    # list concatenation makes a fresh list, so shuffling leaves fav untouched
    pool = fav[user] + fav['A3JT29L4YFEIMJ']
    np.random.shuffle(pool)
    return fav[user], pool[0:5]

def rc_test():
    """
    Recall check on the example produced by test_sorting().

    Returns (favourites, sampled items, recall of the sample against the
    favourites).
    """
    # test_sorting() already returns the favourites list, not a user id,
    # so it must not be used as a key into fav again.
    liked, sample = test_sorting()
    return liked, sample, recall_fn(liked, sample)

def ndcg_test():
    """
    Print the cumulative DCG and nDCG of a fixed example, then return
    dcg(..., return_values=True) for the example from test_sorting().

    Note: dcg() scores by membership, and every value of the example list
    also occurs in its sorted copy, so each position counts as relevant.
    """
    relevances = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    ideal_order = sorted(relevances, reverse=True)

    _, dcg_curve = dcg(ideal_order, relevances, return_values=True)
    print('DCG', dcg_curve)

    _, ndcg_curve = ndcg_fn(ideal_order, relevances, True)
    print('NDCG', ndcg_curve)

    # (score, cumulative gains) for the shuffled two-user sample
    return dcg(*test_sorting(), True)


def calc_gamma(gold, guess):
    """
    Correlation coefficient between two equally long numeric sequences,
    computed from their pairwise-difference matrices.

    For a sequence a, D[i, j] = a[j] - a[i].  With A built from `guess`
    and B from `gold`, the result is

        sum(A * B) / sqrt(sum(A * A) * sum(B * B))

    Returns the coefficient as a float, or nan when either sequence is
    constant (all pairwise differences are zero, so the ratio is 0/0).
    """
    def dm(a):
        # pairwise differences via broadcasting: row i holds a - a[i]
        a = np.asarray(a, dtype=float)
        return a[np.newaxis, :] - a[:, np.newaxis]

    A, B = dm(guess), dm(gold)

    denom = np.sqrt((A * A).sum() * (B * B).sum())
    if denom == 0:
        return np.nan

    # evaluate the coefficient instead of handing back an unapplied function
    return (A * B).sum() / denom


def rank_correlation_fn(gold, rank_fn):
    """
    Spearman rank correlation between the gold order and predicted ranks.

    gold : items in gold order; the i-th item has gold rank i + 1
    rank_fn : callable mapping an item to its predicted 1-based rank

    Returns the scalar coefficient, or nan for fewer than two items,
    where it is undefined.

    NOTE: a later definition with the same name in this cell replaces
    this one once the cell has run.
    """
    n = len(gold)
    if n < 2:
        return np.nan

    a = np.arange(n) + 1
    b = np.array([rank_fn(item) for item in gold])

    # spearman: the squared rank differences must be summed over all
    # items before subtracting from 1
    return 1 - (6*(b - a)**2 / n / (n**2 - 1)).sum()

# def test_rank_correlation

def rank_correlation_fn(gold, ranked):
    """
    Spearman rank correlation between the gold order and given ranks.

    gold : items in gold order; the i-th item has gold rank i + 1
    ranked : predicted 1-based rank of each gold item, aligned with gold

    Returns the scalar coefficient, or nan for fewer than two items:
    the coefficient is undefined there and n * (n^2 - 1) would be zero.
    """
    n = len(gold)
    if n < 2:
        return np.nan

    a = np.arange(n) + 1
    b = np.array(ranked)

    # spearman
    return 1 - (6*(b - a)**2 / n / (n**2 - 1)).sum()

def extract_rank(gold, retrieved):
    """
    For every item of `gold`, give the 1-based position of its first
    occurrence in `retrieved`.

    `retrieved` has to support element-wise comparison (a numpy array).
    An IndexError is raised when a gold item does not occur in it.
    """
    ranks = []
    for item in gold:
        positions = np.argwhere(item == retrieved)
        ranks.append(positions[0][0] + 1)
    return ranks


ex = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
# Kept under its own name: binding this list to `gold` would overwrite
# the per-user gold mapping returned by user_gold(), which the
# evaluation cells index by user.
gold_ex = sorted(ex, reverse=True)

# five sampled items, plus a shuffled copy of them to act as a retrieval
ts = test_sorting()[1]
st = ts[:]
np.random.shuffle(st)

# extract_rank compares element-wise, so both must be arrays
st = np.array(st)
ts = np.array(ts)

rank_correlation_fn(ts, extract_rank(ts, st))