In [1]:
import os 
import math
import random
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from numpy import count_nonzero

# Folder the pickled prediction / test / popularity tables are read from.
dir_ = '../../../../data/'
# Folder holding one '<group size>.csv' file of group definitions per group size.
group_dir_ = '../../../../data/groups/random'

In [2]:
# Load the SVD prediction table, the test table and the popularity table.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling; only load
# files produced by trusted steps of this project.
file_name = 'normalized_log_filter_track_5_user_100.csv'
svd = pd.read_pickle(os.path.join(dir_, 'prediction_svd_top_N_' + file_name[:-3] + 'pkl'))
test = pd.read_pickle(os.path.join(dir_, 'test_' + file_name[:-3] + 'pkl'))
# sort_values returns a new frame; the result has to be assigned, otherwise
# the call has no effect on `test`.
test = test.sort_values(by=['uid', 'tid'])

file_name = 'normalized_popularity_filter_track_5_user_100.pkl'
pop_count = pd.read_pickle(os.path.join(dir_, file_name))

# Number of distinct users that have SVD predictions.
num_user = len(svd['uid'].unique())
num_user

953

In [3]:
# One array of groups per group size, read from '<size>.csv' in group_dir_.
# groups_n[i] corresponds to group_sizes[i].
group_sizes = ['2', '3', '4', '5', '6', '7', '8', '9', '10']
groups_n = [
    np.loadtxt(os.path.join(group_dir_, size + '.csv'), delimiter=',')
    for size in group_sizes
]

In [4]:
# Tag every prediction row with count == 1. When the per-member tables are
# added together later, this column sums to the number of members that have
# a prediction for the track.
svd = svd.assign(count=1)
svd.head(5)

Unnamed: 0,uid,tid,rating,count
0,0,4,0.601343,1
1,0,14,0.63656,1
2,0,19,0.608919,1
3,0,21,0.581061,1
4,0,27,0.590191,1


In [5]:
# Order tracks from most to least popular (highest 'count' first).
pop_count = pop_count.sort_values('count', ascending=False)
pop_count.head(10)

Unnamed: 0,tid,count,rating
70,70,17557,2980.75
1102,1102,6926,1484.25
83,83,6571,1796.75
13496,13496,6299,1669.75
952,952,6234,1527.5
2519,2519,6001,1772.25
4008,4008,5676,1008.25
5521,5521,5658,1727.25
3900,3900,5627,1475.5
2241,2241,5571,1183.75


In [6]:
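# Linear (disabled alternative to the scaling in the next cell): split the
# popularity-sorted tracks into 5 equal-size tiers and rate them 5 (top tier) down to 1.
# NOTE(review): `pop_rating` is not defined anywhere in this notebook; `pop_count`
# is presumably what was meant -- confirm before re-enabling this cell.
# size = int(len(pop_rating) // 5)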
# pop_normalized = pd.DataFrame()
# for i in range(5):
#     if i == 0:
#         pop = pop_count[:size].copy()
#     elif i == 4:
#         pop = pop_count[size*4:].copy()
#     else:
#         pop = pop_count[size*i:size*(i+1)].copy()
#     pop['rating'] = (5 - i)
#     pop_normalized = pop_normalized.append(pop)
# pop_normalized = pop_normalized[['tid', 'rating']]

In [7]:
# min_max
max_rating = pop_count.iloc[0]['count']
pop_normalized = pop_count.copy()
pop_normalized = pop_normalized[['tid', 'count']]
pop_normalized['rating'] = pop_normalized['count']
pop_normalized['rating'] /= max_rating
pop_normalized['rating'] *= 5
pop_normalized[:10]

Unnamed: 0,tid,count,rating
70,70,17557,5.0
1102,1102,6926,1.972433
83,83,6571,1.871333
13496,13496,6299,1.793871
952,952,6234,1.77536
2519,2519,6001,1.709005
4008,4008,5676,1.616449
5521,5521,5658,1.611323
3900,3900,5627,1.602495
2241,2241,5571,1.586547


In [8]:
def dcg_at_k(r, k, method=0):
    """Discounted cumulative gain over the first ``k`` relevance scores.

    Every score ``rel`` is converted to the exponential gain ``2**rel - 1``
    before discounting. The input is never modified.

    Args:
        r: Relevance scores in rank order (first element is the top-ranked
            item). Any sequence or array convertible to a float array.
        k: Number of leading positions to take into account.
        method: Discount scheme.
            0 -> weights 1, 1, 1/log2(3), 1/log2(4), ...
            1 -> weights 1, 1/log2(3), 1/log2(4), ...

    Returns:
        The DCG as a float; ``0.`` when there are no scores to evaluate.

    Raises:
        ValueError: If ``method`` is neither 0 nor 1 (only checked when there
            is at least one score, matching the previous behaviour).
    """
    # np.asfarray was removed in NumPy 2.0; np.asarray(..., dtype=float) is
    # the supported equivalent.
    scores = np.asarray(r, dtype=float)[:k]
    # Vectorised gain. np.power allocates a new array, so a float ndarray
    # passed by the caller is no longer overwritten through the slice view.
    gains = np.power(2.0, scores) - 1

    if gains.size:
        if method == 0:
            return gains[0] + np.sum(gains[1:] / np.log2(np.arange(2, gains.size + 1)))
        elif method == 1:
            return np.sum(gains / np.log2(np.arange(2, gains.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.


def ndcg_at_k(r, r_max, k, method=0):
    """Normalised DCG: DCG of ``r`` divided by the DCG of ``r_max``.

    Args:
        r: Relevance scores of the evaluated ranking, in rank order.
        r_max: Relevance scores of the reference (best achievable) ranking.
        k: Number of leading positions to evaluate.
        method: Discount scheme forwarded to ``dcg_at_k`` (0 or 1).

    Returns:
        ``dcg_at_k(r) / dcg_at_k(r_max)``, or ``0.`` when the reference DCG
        is zero.
    """
    ideal_dcg = dcg_at_k(r_max, k, method)
    # Guard the division: a zero reference DCG yields a score of 0.
    return dcg_at_k(r, k, method) / ideal_dcg if ideal_dcg else 0.

In [9]:
# Blend weights to sweep: 0.0, 0.1, ..., 1.0
lambdas = [step / 10 for step in range(11)]
lambdas

[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

In [10]:
# Sweep the blend weight lambda_ over `lambdas`. For every lambda and group:
#   score(track) = lambda_ * (sum of members' SVD ratings / group size)
#                  + (1 - lambda_) * normalized popularity rating
# The top_n_size best-scoring tracks form the group's recommendation list,
# which is then scored against each member's rows in `test`
# (precision, recall and nDCG at top_n_size).
# The three result lists are indexed [lambda index][group-size index].
top_n_size = 50

precision_list = []
recall_list = []
nDCG_list = []
for lambda_ in tqdm(lambdas):
    # The popularity term only depends on lambda_, so build it once per
    # lambda instead of once per group. set_index returns a new frame, so
    # pop_normalized itself is left untouched.
    pop_weighted = pop_normalized.set_index('tid')
    pop_weighted['rating'] = pop_weighted['rating'] * (1 - lambda_)

    # --- Build the recommendation list of every group -----------------------
    top_n_lists_g = []
    for groups in groups_n:

        group_size = len(groups[0])
        top_n_lists = []

        for group in groups:
            # Sum the members' predictions aligned on tid; a track missing
            # for a member contributes 0 to the sum.
            rating_table = pd.DataFrame()
            for member in group:
                prediction = svd[svd['uid'] == member].copy()
                if len(rating_table) == 0:
                    rating_table = prediction
                else:
                    rating_table = rating_table.set_index('tid').add(prediction.set_index('tid'), fill_value=0).reset_index()
            rating_table['rating'] /= group_size
            rating_table['rating'] *= lambda_
            rating_table = rating_table.set_index('tid').add(pop_weighted, fill_value=0).reset_index()
    #         rating_table = rating_table[rating_table['count'] == group_size]
            rating_table = rating_table.sort_values(by=['rating'], ascending=False)
            # Read the track ids by column name; positional `row[0]` on a
            # label-indexed row is deprecated in recent pandas.
            top_n_lists.append(rating_table['tid'].head(top_n_size).tolist())
        top_n_lists_g.append(top_n_lists)

    # --- Score every list against each member's test rows -------------------
    avg_precisions = []
    avg_recalls = []
    avg_nDCGs = []
    for groups, top_n_lists in zip(groups_n, top_n_lists_g):

        precisions = []
        recalls = []
        nDCGs = []
        for group, top_n_list in zip(groups, top_n_lists):

            for uid in group:
                # Filter the test table once per user rather than once per
                # recommended track.
                user_test = test[test['uid'] == uid]

                hits = 0
                truth_rating = []  # For nDCG: test rating per recommended track, 0 if absent
                # Iterating the list itself also copes with lists that hold
                # fewer than top_n_size tracks.
                for tid in top_n_list:
                    matched = user_test[user_test['tid'] == tid]
                    if len(matched) > 0:
                        hits += 1
                        truth_rating.append(matched.iloc[0]['rating'])
                    else:
                        truth_rating.append(0)

                precision = hits / top_n_size
                # A user without any test rows gets recall 0 instead of a
                # ZeroDivisionError.
                recall = hits / len(user_test) if len(user_test) else 0.0

                # Best achievable ranking: the user's own test ratings, highest
                # first. Named differently from the scalar `max_rating` used
                # for popularity scaling so that value is not overwritten.
                ideal_rating = user_test.sort_values(by=['rating'], ascending=False)['rating'].values[:top_n_size]
                nDCG = ndcg_at_k(truth_rating, ideal_rating, top_n_size, method=1)

                precisions.append(precision)
                recalls.append(recall)
                nDCGs.append(nDCG)

        # Average over every (group, member) pair of this group size.
        avg_precisions.append(sum(precisions) / len(precisions))
        avg_recalls.append(sum(recalls) / len(recalls))
        avg_nDCGs.append(sum(nDCGs) / len(nDCGs))

    precision_list.append(avg_precisions)
    recall_list.append(avg_recalls)
    nDCG_list.append(avg_nDCGs)

for p in precision_list:
    print(p)
print()
for r in recall_list:
    print(r)
print()
for n in nDCG_list:
    print(n)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=11.0), HTML(value='')))


[0.09285714285714287, 0.09295478443743424, 0.09292016806722703, 0.09311578947368412, 0.09320675105485239, 0.09285714285714289, 0.09292016806722692, 0.09329100529100541, 0.09286315789473679]
[0.09657563025210092, 0.09566771819137755, 0.0950000000000003, 0.09454736842105267, 0.09443037974683553, 0.09428571428571442, 0.09363445378151264, 0.09403174603174616, 0.09343157894736842]
[0.1004621848739497, 0.09863301787592017, 0.09697478991596674, 0.09551578947368422, 0.09548523206751074, 0.09512605042016829, 0.0946218487394958, 0.09449735449735455, 0.09370526315789472]
[0.10439075630252109, 0.10113564668769717, 0.09871848739495834, 0.09656842105263162, 0.09626582278481025, 0.09554621848739513, 0.0945588235294118, 0.09473015873015893, 0.09412631578947367]
[0.11165966386554634, 0.10462670872765521, 0.10113445378151284, 0.09892631578947367, 0.09719409282700435, 0.09649159663865561, 0.09512605042016808, 0.09498412698412713, 0.09399999999999996]
[0.11649159663865563, 0.1091272344900107, 0.104432773