In [1]:
import os 
import math
import random
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from numpy import count_nonzero

# Directory that holds the pickled data files read by the loading cell.
dir_ = '../../../data/'
# Directory that holds the group definition CSVs (one '<size>.csv' file per group size).
group_dir_ = '../../../data/groups/high'

In [2]:
# Base dataset name; every pickled artefact below reuses its stem with a '.pkl' extension.
file_name = 'normalized_to_rating_filter_track_5_user_100.csv'
pkl_name = os.path.splitext(file_name)[0] + '.pkl'

# NOTE: unpickling can execute arbitrary code -- only load files produced by this project.
df = pd.read_pickle(os.path.join(dir_, pkl_name))  # presumably the full interaction table
svd = pd.read_pickle(os.path.join(dir_, 'prediction_svd_top_N_' + pkl_name))  # SVD predictions (uid, tid, rating)
test = pd.read_pickle(os.path.join(dir_, 'test_' + pkl_name))  # held-out rows (uid, tid, rating)
# sort_values returns a new frame, so the result must be assigned for the sort to take effect.
test = test.sort_values(by=['uid', 'tid'])

# Number of distinct users that have SVD predictions.
num_user = len(svd['uid'].unique())
num_user

953

In [3]:
# One 2-D array per group size: each row is a group, each column a member's user id.
group_sizes = ['2', '3', '4', '5', '6', '7', '8', '9', '10']
groups_n = [
    # ndmin=2 keeps a file that contains a single group as a (1, size) array
    # instead of collapsing it to 1-D, so row-wise iteration keeps working.
    np.loadtxt(os.path.join(group_dir_, size + '.csv'), delimiter=',', ndmin=2)
    for size in group_sizes
]

In [4]:
# Tag every prediction row with a constant count of 1.
svd = svd.assign(count=1)
svd.head(5)

Unnamed: 0,uid,tid,rating,count
0,0,9,2.243909,1
1,0,15,2.16061,1
2,0,22,2.145235,1
3,0,28,2.204445,1
4,0,33,2.166677,1


In [5]:
# Popularity score per track: (rows of `df` with that tid / number of users) * 5.
# The factor 5 presumably maps the share onto the rating scale -- TODO confirm.
# A single grouped count replaces re-filtering the whole frame once per track;
# sort=False keeps the tracks in order of first appearance, like `unique()` did.
tid_counts = df.groupby('tid', sort=False).size()
tid_list = tid_counts.index.tolist()
pop_list = (tid_counts / num_user * 5).tolist()

In [6]:
# Popularity table, most popular track first.
d = dict(tid=tid_list, rating=pop_list)
df_pop = pd.DataFrame(data=d).sort_values(by=['rating'], ascending=False)
df_pop.head(10)

Unnamed: 0,tid,rating
70,70,4.034627
5521,5521,2.911857
390,390,2.822665
2519,2519,2.801679
83,83,2.796432
13496,13496,2.785939
210,210,2.686254
5716,5716,2.681007
1036,1036,2.544596
1464,1464,2.528856


In [None]:
# Blending weights 0.0, 0.1, ..., 1.0 tried by the evaluation loop.
lambdas = [step / 10 for step in range(11)]
lambdas

In [7]:
def dcg_at_k(r, k, method=0):
    """Discounted cumulative gain of the first ``k`` relevance scores.

    Each relevance value ``rel`` contributes a gain of ``2**rel - 1``.

    Args:
        r: Relevance scores in rank order (first element is the top item).
            The input is never modified.
        k: Number of leading results to take into account.
        method: 0 leaves the first two ranks undiscounted (discounts
            1, 1, log2(3), ...); 1 discounts every rank
            (log2(2), log2(3), log2(4), ...).

    Returns:
        The DCG value, or 0. when no scores are left after truncating to ``k``.

    Raises:
        ValueError: If ``method`` is neither 0 nor 1 (only checked when there
            is at least one score).
    """
    # np.asfarray no longer exists in NumPy 2.0; np.asarray(dtype=float) is its
    # replacement.  np.power allocates a new array, so a float ndarray passed
    # in by the caller is not overwritten.
    gains = np.power(2.0, np.asarray(r, dtype=float)[:k]) - 1.0

    if not gains.size:
        return 0.
    if method == 0:
        return gains[0] + np.sum(gains[1:] / np.log2(np.arange(2, gains.size + 1)))
    if method == 1:
        return np.sum(gains / np.log2(np.arange(2, gains.size + 2)))
    raise ValueError('method must be 0 or 1.')


def ndcg_at_k(r, r_max, k, method=0):
    """Normalised DCG: DCG of ``r`` divided by the DCG of ``r_max``.

    Args:
        r: Relevance scores of the evaluated ranking, in rank order.
        r_max: Relevance scores of the reference (ideal) ranking.
        k: Number of leading results to take into account.
        method: Discount variant forwarded to ``dcg_at_k``.

    Returns:
        The ratio of the two DCG values, or 0. when the reference DCG is zero.
    """
    ideal = dcg_at_k(r_max, k, method)
    return dcg_at_k(r, k, method) / ideal if ideal else 0.

In [9]:
top_n_size = 50  # number of tracks recommended to every group

# Popularity score per track id; identical for every group and every lambda.
pop_rating = df_pop.set_index('tid')['rating']

# uid -> (rating_by_tid, ideal_ratings, n_test_rows); filled lazily so the
# `test` frame is filtered once per user instead of once per recommended track.
user_truth_cache = {}


def _top_n_for_group(group, lambda_):
    """Return the ids of the `top_n_size` highest scoring tracks for one group.

    A track's score is
    ``lambda_ * (sum of the members' SVD predictions / group size)
    + (1 - lambda_) * popularity``; a prediction or popularity value that is
    missing for a track counts as 0.
    """
    summed = None
    for member in group:
        prediction = svd.loc[svd['uid'] == member].set_index('tid')['rating']
        summed = prediction if summed is None else summed.add(prediction, fill_value=0)
    personalised = summed / len(group) * lambda_
    blended = personalised.add(pop_rating * (1 - lambda_), fill_value=0)
    ranked = blended.sort_values(ascending=False)
    # Track ids are read from the 'tid' index by label, not by column position.
    return ranked.index[:top_n_size].tolist()


def _user_truth(uid):
    """Return (rating_by_tid, ideal_ratings, n_test_rows) for one user, cached."""
    if uid not in user_truth_cache:
        user_test = test[test['uid'] == uid]
        # If a track occurs more than once for the user, the first row wins.
        first_rows = user_test.drop_duplicates(subset='tid', keep='first')
        rating_by_tid = dict(zip(first_rows['tid'], first_rows['rating']))
        ideal_ratings = user_test['rating'].sort_values(ascending=False).to_numpy()[:top_n_size]
        user_truth_cache[uid] = (rating_by_tid, ideal_ratings, len(user_test))
    return user_truth_cache[uid]


def _evaluate_user(uid, top_n_list):
    """Return (precision, recall, nDCG) of a recommendation list for one user.

    A recommended track counts as a hit when the user has a row for it in `test`.
    """
    rating_by_tid, ideal_ratings, n_test_rows = _user_truth(uid)
    recommended = top_n_list[:top_n_size]
    truth_rating = [rating_by_tid.get(tid, 0) for tid in recommended]
    hits = sum(tid in rating_by_tid for tid in recommended)

    precision = hits / top_n_size
    # A user without any test rows gets recall 0 instead of a ZeroDivisionError.
    recall = hits / n_test_rows if n_test_rows else 0.0
    # Defensive copy so the cached array can never be altered by the metric code.
    nDCG = ndcg_at_k(truth_rating, ideal_ratings.copy(), top_n_size, method=1)
    return precision, recall, nDCG


# One entry per lambda; each entry holds one averaged value per group size.
precision_list = []
recall_list = []
nDCG_list = []
for lambda_ in tqdm(lambdas):
    # Recommendation lists, indexed [group-size index][group index].
    top_n_lists_g = [
        [_top_n_for_group(group, lambda_) for group in groups]
        for groups in groups_n
    ]

    avg_precisions = []
    avg_recalls = []
    avg_nDCGs = []
    for groups, top_n_lists in zip(groups_n, top_n_lists_g):
        # Every member is scored against the list of the group they belong to;
        # the averages are taken over all members of all groups of this size.
        scores = [
            _evaluate_user(uid, top_n_list)
            for group, top_n_list in zip(groups, top_n_lists)
            for uid in group
        ]
        avg_precisions.append(float(np.mean([score[0] for score in scores])))
        avg_recalls.append(float(np.mean([score[1] for score in scores])))
        avg_nDCGs.append(float(np.mean([score[2] for score in scores])))

    precision_list.append(avg_precisions)
    recall_list.append(avg_recalls)
    nDCG_list.append(avg_nDCGs)

# Print precision, recall and nDCG in turn: one line per lambda, then a blank line.
for metric_rows in (precision_list, recall_list, nDCG_list):
    for per_size_values in metric_rows:
        print(per_size_values)
    print()

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


[0.1255773955773955, 0.10798874824191318, 0.09216666666666672, 0.08022222222222217, 0.06219512195121919, 0.05545454545454518, 0.04971698113207528, 0.041580246913580095, 0.039179487179487056]

[0.008923029249108093, 0.00713516507185865, 0.005828097585416064, 0.0047412686236399295, 0.003675951329076705, 0.0032813529456451724, 0.002836625211417923, 0.002205734783444314, 0.002170881971036246]

[0.06696365670669385, 0.053203369571313906, 0.04243476728667007, 0.03548869089235441, 0.026470446101604452, 0.022097767566098516, 0.019579529430171636, 0.017851788459656064, 0.016155284669107182]

