In [36]:
import os 
import math
import random
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from scipy.stats import kendalltau
from scipy.stats import rankdata
from sklearn.model_selection import train_test_split
from numpy import count_nonzero

dir_ = '../data/'
group_dir_ = '../data/groups/low'

In [37]:
file_name = 'normalized_to_rating_filter_track_5_user_100.csv'
svd = pd.read_pickle(os.path.join(dir_, 'prediction_svd_top_N_' + file_name[:-3] + 'pkl'))
test = pd.read_pickle(os.path.join(dir_, 'test_' + file_name[:-3] + 'pkl'))
test.sort_values(by=['uid','tid'])

file_name = 'normalized_popularity_filter_track_5_user_100.csv'
pop_count = pd.read_pickle(os.path.join(dir_, file_name[:-3] + 'pkl'))

num_user = len(svd['uid'].unique())
num_user

953

In [38]:
groups_n_train = []
groups_n_test = []
group_sizes = ['2', '3', '4', '5']
for i in range(len(group_sizes)):
    groups = np.loadtxt(os.path.join(group_dir_, group_sizes[i] + '.csv'), delimiter=',')
    groups_train, groups_test = train_test_split(groups, test_size=0.2, random_state=np.random)
    groups_n_train.append(groups_train)
    groups_n_test.append(groups_test)

In [39]:
for i in groups_n_train:
    print(len(i[0]))

2
3
4
5


In [40]:
svd['count'] = 1
svd

Unnamed: 0,uid,tid,rating,count
0,0,9,2.243909,1
1,0,15,2.160610,1
2,0,22,2.145235,1
3,0,28,2.204445,1
4,0,33,2.166677,1
...,...,...,...,...
147697636,952,157562,2.095077,1
147697637,952,157563,2.111105,1
147697638,952,157564,2.025293,1
147697639,952,157565,2.122036,1


In [41]:
pop_count = pop_count.sort_values(by=['count'],  ascending=False)
pop_count

Unnamed: 0,tid,count,rating
70,70,17557,2980.75
1102,1102,6926,1484.25
83,83,6571,1796.75
13496,13496,6299,1669.75
952,952,6234,1527.50
...,...,...,...
27645,27645,4,1.00
157522,157522,4,1.00
156498,156498,4,1.00
83068,83068,4,1.00


In [42]:
# min_max
max_rating = pop_count.iloc[0]['count']
pop_normalized = pop_count.copy()
pop_normalized = pop_normalized[['tid', 'count']]
pop_normalized['rating'] = pop_normalized['count']
pop_normalized['rating'] /= max_rating
pop_normalized['rating'] *= 5
pop_normalized

Unnamed: 0,tid,count,rating
70,70,17557,5.000000
1102,1102,6926,1.972433
83,83,6571,1.871333
13496,13496,6299,1.793871
952,952,6234,1.775360
...,...,...,...
27645,27645,4,0.001139
157522,157522,4,0.001139
156498,156498,4,0.001139
83068,83068,4,0.001139


In [43]:
lambdas = []
for i in range(11):
    lambdas.append(i/10)
lambdas

[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

In [44]:
top_n_size = 100
optimal_lambdas = []
results_precision = []
for i in range(len(groups_n_train)):
    results_precision.append([])
truth_rank = []
for i in range(top_n_size):
    truth_rank.append(i+1)

In [45]:
for lambda_ in tqdm(lambdas):
    top_n_lists_g = []
    for groups in groups_n_train:

        group_size = len(groups[0])
        top_n_lists = []

        for group in groups:
            rating_table = pd.DataFrame() 
            for member in group:
                prediction = svd[svd['uid'] == member].copy()
                if len(rating_table) == 0:
                    rating_table = prediction
                else:
                    rating_table = rating_table.set_index('tid').add(prediction.set_index('tid'), fill_value=0).reset_index()
            rating_table['rating'] /= group_size 
            rating_table['rating'] *= lambda_
            pop = pop_normalized.copy()
            pop['rating'] *= (1 - lambda_)
            rating_table = rating_table.set_index('tid').add(pop.set_index('tid'), fill_value=0).reset_index()
    #         rating_table = rating_table[rating_table['count'] == group_size]
            rating_table = rating_table.sort_values(by=['rating'],  ascending=False)
            rating_table = rating_table[:top_n_size]
            top_n_list = []
            for _, row in rating_table.iterrows():
                top_n_list.append(row[0])
            top_n_lists.append(top_n_list)
        top_n_lists_g.append(top_n_lists)   
        
    threshold = 0.0
    for i in range(len(groups_n_train)):
        groups = groups_n_train[i]
        top_n_lists = top_n_lists_g[i]

        precisions = []
        for j in (range(len(groups))):

            group = groups[j]
            top_n_list = top_n_lists[j]

            high_rating = 0

            for k in range(top_n_size):
                for l in range(len(group)):
                    uid = group[l]
                    tid = top_n_list[k]
                    t = test[test['uid'] == uid]
                    t = t[t['tid'] == tid]
                    if len(t) > 0 and t.iloc[0]['rating'] > threshold:
                        high_rating += 1
                        break

            precision = high_rating / top_n_size
            precisions.append(precision)
        
        avg_precision = 0
        for precision in precisions:
            avg_precision += precision
        avg_precision /= len(precisions)
        
        
        results_precision[i].append(avg_precision)

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))




In [46]:
for i in results_precision:
    print(i)

[0.13541666666666674, 0.15687499999999985, 0.1760416666666667, 0.19031250000000008, 0.19791666666666666, 0.20361111111111116, 0.20670138888888887, 0.2085069444444444, 0.20920138888888876, 0.20666666666666664, 0.183576388888889]
[0.1554330708661417, 0.1738582677165355, 0.18921259842519694, 0.20251968503937012, 0.21464566929133866, 0.22220472440944877, 0.22677165354330714, 0.2290551181102363, 0.22818897637795277, 0.22566929133858274, 0.2027559055118112]
[0.18103896103896108, 0.1932467532467532, 0.20935064935064931, 0.22077922077922088, 0.22714285714285717, 0.23883116883116892, 0.24298701298701314, 0.24584415584415584, 0.24597402597402604, 0.2446753246753247, 0.21935064935064935]
[0.19615384615384615, 0.20788461538461533, 0.2194230769230769, 0.23307692307692313, 0.24269230769230765, 0.2519230769230769, 0.25961538461538464, 0.2621153846153847, 0.26057692307692315, 0.25538461538461543, 0.23519230769230767]


In [47]:
avg = []
for i in range(len(results_precision[0])):
    avg.append((results_precision[0][i] + results_precision[1][i] + results_precision[2][i] + results_precision[3][i])/4)
avg

[0.1670106361814039,
 0.18296615908697597,
 0.19850699784139747,
 0.21167208222387857,
 0.22059937519829254,
 0.22914252031870144,
 0.2340188600086484,
 0.23638040075355532,
 0.23598532857944768,
 0.23309897451629738,
 0.21021881286091432]

In [48]:
final_results_precision = []
for i in range(len(groups_n_test)):
    final_results_precision.append([])

In [49]:
print(lambdas)

[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]


In [50]:
for lambda_ in tqdm(lambdas):
    top_n_lists_g = []
    for groups in groups_n_test:

        group_size = len(groups[0])
        top_n_lists = []

        for group in groups:
            rating_table = pd.DataFrame() 
            for member in group:
                prediction = svd[svd['uid'] == member].copy()
                if len(rating_table) == 0:
                    rating_table = prediction
                else:
                    rating_table = rating_table.set_index('tid').add(prediction.set_index('tid'), fill_value=0).reset_index()
            rating_table['rating'] /= group_size 
            rating_table['rating'] *= lambda_
            pop = pop_normalized.copy()
            pop['rating'] *= (1 - lambda_)
            rating_table = rating_table.set_index('tid').add(pop.set_index('tid'), fill_value=0).reset_index()
    #         rating_table = rating_table[rating_table['count'] == group_size]
            rating_table = rating_table.sort_values(by=['rating'],  ascending=False)
            rating_table = rating_table[:top_n_size]
            top_n_list = []
            for _, row in rating_table.iterrows():
                top_n_list.append(row[0])
            top_n_lists.append(top_n_list)
        top_n_lists_g.append(top_n_lists)   
        
    threshold = 0.0
    for i in range(len(groups_n_test)):
        groups = groups_n_test[i]
        top_n_lists = top_n_lists_g[i]

        precisions = []
        for j in (range(len(groups))):

            group = groups[j]
            top_n_list = top_n_lists[j]

            high_rating = 0

            for k in range(top_n_size):
                for l in range(len(group)):
                    uid = group[l]
                    tid = top_n_list[k]
                    t = test[test['uid'] == uid]
                    t = t[t['tid'] == tid]
                    if len(t) > 0 and t.iloc[0]['rating'] > threshold:
                        high_rating += 1
                        break

            precision = high_rating / top_n_size
            precisions.append(precision)
    
        avg_precision = 0
        for precision in precisions:
            avg_precision += precision
        avg_precision /= len(precisions)

        final_results_precision[i].append(avg_precision)

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))




In [51]:
for i in final_results_precision:
    print(i)

[0.13520547945205483, 0.1556164383561644, 0.16986301369863016, 0.18219178082191775, 0.1878082191780822, 0.19205479452054802, 0.19383561643835617, 0.19410958904109588, 0.19575342465753426, 0.19095890410958904, 0.16931506849315067]
[0.16343749999999999, 0.17906250000000007, 0.1971875, 0.20874999999999996, 0.22562500000000008, 0.23812500000000003, 0.24437500000000006, 0.24968750000000006, 0.24968749999999998, 0.2440625000000001, 0.22375]
[0.17850000000000002, 0.1855, 0.19649999999999998, 0.2145, 0.225, 0.23350000000000004, 0.23399999999999999, 0.23599999999999993, 0.2315000000000001, 0.2305, 0.20800000000000002]
[0.22071428571428567, 0.22571428571428573, 0.2335714285714286, 0.2442857142857143, 0.24785714285714283, 0.2542857142857143, 0.25499999999999995, 0.2578571428571429, 0.2621428571428571, 0.26, 0.23642857142857146]


In [52]:
avg = []
for i in range(len(final_results_precision[0])):
    avg.append((final_results_precision[0][i] + final_results_precision[1][i] + final_results_precision[2][i] + final_results_precision[3][i])/4)
avg

[0.17446431629158513,
 0.18647330601761256,
 0.1992804855675147,
 0.212431873776908,
 0.2215725905088063,
 0.2294913772015656,
 0.23180265410958906,
 0.23441355797455968,
 0.23477094545009788,
 0.2313803510273973,
 0.20937340998043052]