In [19]:
import os 
import math
import random
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from scipy.stats import kendalltau
from numpy import count_nonzero

# Base directory from which the pickled prediction and test frames are read.
dir_ = '../data/'
# Directory containing the group definition files (group_<size>.csv).
group_dir_ = '../data/groups/low'

In [20]:
# Load the SVD predictions and the `test_` frame for the rating-normalised dataset.
# NOTE(review): pd.read_pickle can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted pipeline.
file_name = 'normalized_to_rating_filter_track_5_user_100.csv'
svd = pd.read_pickle(os.path.join(dir_, 'prediction_svd_top_N_' + file_name[:-3] + 'pkl'))
test = pd.read_pickle(os.path.join(dir_, 'test_' + file_name[:-3] + 'pkl'))
# sort_values returns a new frame (it is not in-place), so the result has to be
# assigned back; the bare call previously left `test` unsorted.
test = test.sort_values(by=['uid','tid'])

# Load the two popularity-based prediction frames for the popularity-normalised dataset.
file_name = 'normalized_popularity_filter_track_5_user_100.csv'
pop_count = pd.read_pickle(os.path.join(dir_, 'prediction_popularity_count_top_N_' + file_name[:-3] + 'pkl'))
pop_rating = pd.read_pickle(os.path.join(dir_, 'prediction_popularity_rating_top_N_' + file_name[:-3] + 'pkl'))

# Number of distinct user ids that have SVD predictions.
num_user = len(svd['uid'].unique())
num_user

220

In [21]:
# Read one group file per group size; each file becomes one array in groups_n,
# in the same order as group_sizes.
group_sizes = ['2', '3', '4', '5']
groups_n = []
for size in group_sizes:
    group_path = os.path.join(group_dir_, 'group_' + size + '.csv')
    groups_n.append(np.loadtxt(group_path, delimiter=','))

In [22]:
# Sanity check: print how many members the first group of each loaded file has.
for size_groups in groups_n:
    first_group = size_groups[0]
    print(len(first_group))

2
3
4
5


In [23]:
# Give every SVD prediction row a constant `count` of 1.
svd = svd.assign(count=1)
svd

Unnamed: 0,uid,tid,rating,count
0,0.0,0.0,2.493650,1
1,0.0,6.0,2.333071,1
2,0.0,8.0,2.713340,1
3,0.0,9.0,1.949588,1
4,0.0,12.0,2.477737,1
...,...,...,...,...
11741994,219.0,54964.0,1.816855,1
11741995,219.0,54965.0,2.315346,1
11741996,219.0,54966.0,1.954432,1
11741997,219.0,54967.0,2.071366,1


In [24]:
# Order the popularity frame from the highest to the lowest count.
pop_count = pop_count.sort_values('count', ascending=False)
pop_count

Unnamed: 0,tid,count,rating
166,166,1882,333.50
6310,6338,1589,92.25
17316,17377,1555,43.50
6254,6282,1396,17.50
13055,13104,1340,33.50
...,...,...,...
25819,25941,1,0.25
43440,43743,1,0.25
54130,54607,1,0.25
27486,27620,1,0.25


In [25]:
# Max-scaling (not min-max): rating = count / max(count) * 5, so the track with
# the largest count gets a rating of 5.
# The maximum is computed explicitly rather than read from row 0, which was only
# the maximum while pop_count happened to be sorted by count in descending order.
max_rating = pop_count['count'].max()
pop_normalized = pop_count[['tid', 'count']].copy()
pop_normalized['rating'] = pop_normalized['count'] / max_rating * 5
pop_normalized

Unnamed: 0,tid,count,rating
166,166,1882,5.000000
6310,6338,1589,4.221573
17316,17377,1555,4.131243
6254,6282,1396,3.708820
13055,13104,1340,3.560043
...,...,...,...
25819,25941,1,0.002657
43440,43743,1,0.002657
54130,54607,1,0.002657
27486,27620,1,0.002657


In [26]:
# Weights 0.0, 0.1, ..., 1.0 (eleven values in steps of one tenth).
lambdas = [step / 10 for step in range(11)]
lambdas

[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

In [27]:
# For every lambda, blend the group-averaged SVD predictions (weight lambda)
# with the normalised popularity ratings (weight 1 - lambda), take the top-N
# tracks per group, and score that list against the top-N of the group-averaged
# `test` ratings with Kendall's tau. One averaged tau is printed per group size.
top_n_size = 100
# Reference ranking 1..N: position i of a recommendation list is compared with rank i.
truth_rank = list(range(1, top_n_size + 1))
print(truth_rank)


def _mean_group_ratings(ratings, group):
    """Sum the members' per-track rows of `ratings` and divide `rating` by the group size.

    Members are added one at a time, aligned on `tid`. A track missing for a
    member contributes 0 for that member (fill_value=0), so the divisor is
    always len(group).
    """
    table = pd.DataFrame()
    for member in group:
        member_rows = ratings[ratings['uid'] == member].copy()
        if len(table) == 0:
            table = member_rows
        else:
            table = table.set_index('tid').add(member_rows.set_index('tid'), fill_value=0).reset_index()
    table['rating'] /= len(group)
    return table


def _top_tracks(table, n):
    """Return the `tid` values of the n highest-rated rows, best first.

    The column is selected by name: a positional row[0] lookup is deprecated on
    label-indexed Series and reads the wrong column when `tid` is not first.
    """
    return table.sort_values(by=['rating'], ascending=False)['tid'][:n].tolist()


# The ground truth does not depend on lambda, so it is ranked once per group
# here instead of once per (lambda, group).
ground_truth_ranks_g = []
for groups in groups_n:
    ground_truth_ranks = []
    for group in groups:
        ground_truth = _top_tracks(_mean_group_ratings(test, group), top_n_size)
        rank_of = {}
        for rank, tid in enumerate(ground_truth, start=1):
            # setdefault keeps the first (best) rank, matching list.index semantics.
            rank_of.setdefault(tid, rank)
        ground_truth_ranks.append(rank_of)
    ground_truth_ranks_g.append(ground_truth_ranks)

for lambda_ in tqdm(lambdas):
    # Build the top-N recommendation list of every group, grouped by group size.
    # NOTE(review): the group-averaged SVD table is also lambda-independent; it
    # is still rebuilt per lambda here to avoid holding one table per group in memory.
    top_n_lists_g = []
    for groups in groups_n:
        top_n_lists = []
        for group in groups:
            rating_table = _mean_group_ratings(svd, group)
            rating_table['rating'] *= lambda_
            pop = pop_normalized.copy()
            pop['rating'] *= (1 - lambda_)
            rating_table = rating_table.set_index('tid').add(pop.set_index('tid'), fill_value=0).reset_index()
            top_n_lists.append(_top_tracks(rating_table, top_n_size))
        top_n_lists_g.append(top_n_lists)

    print('lambda = ' + str(lambda_))
    for top_n_lists, ground_truth_ranks in zip(top_n_lists_g, ground_truth_ranks_g):
        taus = []
        for top_n_list, rank_of in zip(top_n_lists, ground_truth_ranks):
            # Recommended tracks absent from the ground-truth top-N all share rank N + 1.
            pred_rank = [rank_of.get(tid, top_n_size + 1) for tid in top_n_list]
            tau, p_value = kendalltau(truth_rank, pred_rank)
            taus.append(tau)

        # kendalltau yields NaN when pred_rank is constant; average the defined
        # values only, and report NaN instead of dividing by zero when none exist.
        valid_taus = [tau for tau in taus if not np.isnan(tau)]
        avg_tau = sum(valid_taus) / len(valid_taus) if valid_taus else float('nan')
        print(avg_tau)
    print('----------------------------------------')

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100]


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))

lambda = 0.0
0.04408285630235977
0.05756128381213318
0.062376704794729676
0.09712605628045089
----------------------------------------
lambda = 0.1
0.035577656127907996
0.048822329145669134
0.046705480226685435
0.09470513445202959
----------------------------------------
lambda = 0.2
0.036990034555785
0.049954702364634986
0.050278430405822985
0.09219133206623747
----------------------------------------
lambda = 0.3
0.04167985371554875
0.046783131050247256
0.05542797714126317
0.08603033095987603
----------------------------------------
lambda = 0.4
0.042141891747457624
0.03770698124174269
0.05755171469275545
0.0825448324428534
----------------------------------------
lambda = 0.5
0.062041249480133245
0.048635356562937465
0.07726933165342498
0.08712269808745547
----------------------------------------
lambda = 0.6
0.07609492938488062
0.06675399105136062
0.09921710262033934
0.06099919051870902
----------------------------------------
lambda = 0.7
0.08734038018514595
0.08305011967441472
0.