In [35]:
import os 
import math
import random
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from scipy.stats import kendalltau
from scipy.stats import rankdata
from sklearn.model_selection import train_test_split
from numpy import count_nonzero

dir_ = '../data/'
group_dir_ = '../data/groups/random'

In [36]:
file_name = 'normalized_to_rating_filter_track_5_user_100.csv'
svd = pd.read_pickle(os.path.join(dir_, 'prediction_svd_top_N_' + file_name[:-3] + 'pkl'))
test = pd.read_pickle(os.path.join(dir_, 'test_' + file_name[:-3] + 'pkl'))
test.sort_values(by=['uid','tid'])

file_name = 'normalized_popularity_filter_track_5_user_100.csv'
pop_count = pd.read_pickle(os.path.join(dir_, file_name[:-3] + 'pkl'))

num_user = len(svd['uid'].unique())
num_user

953

In [37]:
groups_n_train = []
groups_n_test = []
group_sizes = ['2', '3', '4', '5']
for i in range(len(group_sizes)):
    groups = np.loadtxt(os.path.join(group_dir_, group_sizes[i] + '.csv'), delimiter=',')
    groups_train, groups_test = train_test_split(groups, test_size=0.2, random_state=np.random)
    groups_n_train.append(groups_train)
    groups_n_test.append(groups_test)

In [38]:
for i in groups_n_train:
    print(len(i[0]))

2
3
4
5


In [39]:
svd['count'] = 1
svd

Unnamed: 0,uid,tid,rating,count
0,0,9,2.243909,1
1,0,15,2.160610,1
2,0,22,2.145235,1
3,0,28,2.204445,1
4,0,33,2.166677,1
...,...,...,...,...
147697636,952,157562,2.095077,1
147697637,952,157563,2.111105,1
147697638,952,157564,2.025293,1
147697639,952,157565,2.122036,1


In [40]:
pop_count = pop_count.sort_values(by=['count'],  ascending=False)
pop_count

Unnamed: 0,tid,count,rating
70,70,17557,2980.75
1102,1102,6926,1484.25
83,83,6571,1796.75
13496,13496,6299,1669.75
952,952,6234,1527.50
...,...,...,...
27645,27645,4,1.00
157522,157522,4,1.00
156498,156498,4,1.00
83068,83068,4,1.00


In [41]:
# min_max
max_rating = pop_count.iloc[0]['count']
pop_normalized = pop_count.copy()
pop_normalized = pop_normalized[['tid', 'count']]
pop_normalized['rating'] = pop_normalized['count']
pop_normalized['rating'] /= max_rating
pop_normalized['rating'] *= 5
pop_normalized

Unnamed: 0,tid,count,rating
70,70,17557,5.000000
1102,1102,6926,1.972433
83,83,6571,1.871333
13496,13496,6299,1.793871
952,952,6234,1.775360
...,...,...,...
27645,27645,4,0.001139
157522,157522,4,0.001139
156498,156498,4,0.001139
83068,83068,4,0.001139


In [42]:
lambdas = []
for i in range(11):
    lambdas.append(i/10)
lambdas

[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

In [43]:
top_n_size = 100
optimal_lambdas = []
results_precision = []
for i in range(len(groups_n_train)):
    results_precision.append([])
truth_rank = []
for i in range(top_n_size):
    truth_rank.append(i+1)

In [44]:
for lambda_ in tqdm(lambdas):
    top_n_lists_g = []
    for groups in groups_n_train:

        group_size = len(groups[0])
        top_n_lists = []

        for group in groups:
            rating_table = pd.DataFrame() 
            for member in group:
                prediction = svd[svd['uid'] == member].copy()
                if len(rating_table) == 0:
                    rating_table = prediction
                else:
                    rating_table = rating_table.set_index('tid').add(prediction.set_index('tid'), fill_value=0).reset_index()
            rating_table['rating'] /= group_size 
            rating_table['rating'] *= lambda_
            pop = pop_normalized.copy()
            pop['rating'] *= (1 - lambda_)
            rating_table = rating_table.set_index('tid').add(pop.set_index('tid'), fill_value=0).reset_index()
    #         rating_table = rating_table[rating_table['count'] == group_size]
            rating_table = rating_table.sort_values(by=['rating'],  ascending=False)
            rating_table = rating_table[:top_n_size]
            top_n_list = []
            for _, row in rating_table.iterrows():
                top_n_list.append(row[0])
            top_n_lists.append(top_n_list)
        top_n_lists_g.append(top_n_lists)   
        
    threshold = 0.0
    for i in range(len(groups_n_train)):
        groups = groups_n_train[i]
        top_n_lists = top_n_lists_g[i]

        precisions = []
        for j in (range(len(groups))):

            group = groups[j]
            top_n_list = top_n_lists[j]

            high_rating = 0

            for k in range(top_n_size):
                for l in range(len(group)):
                    uid = group[l]
                    tid = top_n_list[k]
                    t = test[test['uid'] == uid]
                    t = t[t['tid'] == tid]
                    if len(t) > 0 and t.iloc[0]['rating'] > threshold:
                        high_rating += 1
                        break

            precision = high_rating / top_n_size
            precisions.append(precision)
        
        avg_precision = 0
        for precision in precisions:
            avg_precision += precision
        avg_precision /= len(precisions)
        
        
        results_precision[i].append(avg_precision)

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))




In [45]:
for i in results_precision:
    print(i)

[0.15039473684210525, 0.17307894736842108, 0.19152631578947366, 0.1987631578947369, 0.20510526315789482, 0.2064473684210527, 0.20797368421052626, 0.20900000000000005, 0.20952631578947378, 0.204236842105263, 0.18271052631578946]
[0.2191304347826085, 0.2405533596837945, 0.2578260869565217, 0.2692094861660079, 0.2746245059288536, 0.27553359683794454, 0.2750592885375496, 0.27229249011857704, 0.27106719367588933, 0.2635573122529644, 0.23889328063241108]
[0.2771578947368422, 0.29626315789473684, 0.31315789473684197, 0.32415789473684214, 0.325421052631579, 0.32399999999999995, 0.3193157894736844, 0.3157894736842107, 0.3116315789473685, 0.3042631578947368, 0.2743684210526316]
[0.332763157894737, 0.3515789473684208, 0.3663157894736841, 0.37375, 0.3758552631578947, 0.3729605263157895, 0.3613815789473687, 0.35243421052631574, 0.34125000000000005, 0.33085526315789443, 0.2917763157894739]


In [46]:
avg = []
for i in range(len(results_precision[0])):
    avg.append((results_precision[0][i] + results_precision[1][i] + results_precision[2][i] + results_precision[3][i])/4)
avg

[0.24486155606407323,
 0.2653686030788433,
 0.28220652173913036,
 0.29147013469939675,
 0.2952515212190555,
 0.29473537289369667,
 0.2909325852922822,
 0.28737904358227584,
 0.2833687721031829,
 0.27572814385271466,
 0.2469371359475765]

In [47]:
final_results_precision = []
for i in range(len(groups_n_test)):
    final_results_precision.append([])

In [48]:
print(lambdas)

[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]


In [49]:
for lambda_ in tqdm(lambdas):
    top_n_lists_g = []
    for groups in groups_n_test:

        group_size = len(groups[0])
        top_n_lists = []

        for group in groups:
            rating_table = pd.DataFrame() 
            for member in group:
                prediction = svd[svd['uid'] == member].copy()
                if len(rating_table) == 0:
                    rating_table = prediction
                else:
                    rating_table = rating_table.set_index('tid').add(prediction.set_index('tid'), fill_value=0).reset_index()
            rating_table['rating'] /= group_size 
            rating_table['rating'] *= lambda_
            pop = pop_normalized.copy()
            pop['rating'] *= (1 - lambda_)
            rating_table = rating_table.set_index('tid').add(pop.set_index('tid'), fill_value=0).reset_index()
    #         rating_table = rating_table[rating_table['count'] == group_size]
            rating_table = rating_table.sort_values(by=['rating'],  ascending=False)
            rating_table = rating_table[:top_n_size]
            top_n_list = []
            for _, row in rating_table.iterrows():
                top_n_list.append(row[0])
            top_n_lists.append(top_n_list)
        top_n_lists_g.append(top_n_lists)   
        
    threshold = 0.0
    for i in range(len(groups_n_test)):
        groups = groups_n_test[i]
        top_n_lists = top_n_lists_g[i]

        precisions = []
        for j in (range(len(groups))):

            group = groups[j]
            top_n_list = top_n_lists[j]

            high_rating = 0

            for k in range(top_n_size):
                for l in range(len(group)):
                    uid = group[l]
                    tid = top_n_list[k]
                    t = test[test['uid'] == uid]
                    t = t[t['tid'] == tid]
                    if len(t) > 0 and t.iloc[0]['rating'] > threshold:
                        high_rating += 1
                        break

            precision = high_rating / top_n_size
            precisions.append(precision)
    
        avg_precision = 0
        for precision in precisions:
            avg_precision += precision
        avg_precision /= len(precisions)

        final_results_precision[i].append(avg_precision)

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))




In [50]:
for i in final_results_precision:
    print(i)

[0.14770833333333336, 0.16552083333333334, 0.18302083333333333, 0.1930208333333334, 0.1970833333333333, 0.19895833333333338, 0.2003125000000001, 0.2007291666666667, 0.20072916666666663, 0.1976041666666667, 0.17489583333333333]
[0.20093750000000007, 0.21640625000000005, 0.22890625000000006, 0.23812500000000003, 0.24218750000000006, 0.24171875000000004, 0.23921874999999998, 0.23718749999999997, 0.23687499999999995, 0.23265625000000004, 0.20484374999999994]
[0.26333333333333336, 0.279375, 0.29187500000000005, 0.2925, 0.29854166666666676, 0.2964583333333333, 0.28645833333333337, 0.27854166666666663, 0.2733333333333334, 0.2666666666666667, 0.2427083333333334]
[0.32368421052631585, 0.33473684210526305, 0.3468421052631579, 0.3539473684210526, 0.3513157894736842, 0.34526315789473677, 0.33421052631578946, 0.3263157894736842, 0.3163157894736843, 0.3115789473684211, 0.28236842105263155]


In [51]:
avg = []
for i in range(len(final_results_precision[0])):
    avg.append((final_results_precision[0][i] + final_results_precision[1][i] + final_results_precision[2][i] + final_results_precision[3][i])/4)
avg

[0.23391584429824563,
 0.2490097313596491,
 0.2626610471491228,
 0.2693983004385965,
 0.2722820723684211,
 0.2705996436403509,
 0.26505002741228073,
 0.2606935307017544,
 0.2568133223684211,
 0.25212650767543865,
 0.22620408442982454]