In [1]:
import os 
import math
import random
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from scipy.stats import kendalltau
from scipy.stats import rankdata
from sklearn.model_selection import train_test_split
from numpy import count_nonzero

dir_ = '../data/'
group_dir_ = '../data/groups/high'

In [2]:
file_name = 'normalized_to_rating_filter_track_5_user_100.csv'
svd = pd.read_pickle(os.path.join(dir_, 'prediction_svd_top_N_' + file_name[:-3] + 'pkl'))
test = pd.read_pickle(os.path.join(dir_, 'test_' + file_name[:-3] + 'pkl'))
test.sort_values(by=['uid','tid'])

file_name = 'normalized_popularity_filter_track_5_user_100.csv'
pop_count = pd.read_pickle(os.path.join(dir_, file_name[:-3] + 'pkl'))

num_user = len(svd['uid'].unique())
num_user

220

In [3]:
groups_n_train = []
groups_n_test = []
group_sizes = ['2', '3', '4', '5']
for i in range(len(group_sizes)):
    groups = np.loadtxt(os.path.join(group_dir_, group_sizes[i] + '.csv'), delimiter=',')
    groups_train, groups_test = train_test_split(groups, test_size=0.2, random_state=np.random)
    groups_n_train.append(groups_train)
    groups_n_test.append(groups_test)

In [4]:
for i in groups_n_train:
    print(len(i[0]))

2
3
4
5


In [5]:
svd['count'] = 1
svd

Unnamed: 0,uid,tid,rating,count
0,0,3,2.544071,1
1,0,7,2.545822,1
2,0,18,2.152490,1
3,0,20,1.882396,1
4,0,31,2.410806,1
...,...,...,...,...
11531286,219,54964,2.556438,1
11531287,219,54965,2.513008,1
11531288,219,54966,2.185796,1
11531289,219,54967,2.193284,1


In [6]:
pop_count = pop_count.sort_values(by=['count'],  ascending=False)
pop_count

Unnamed: 0,tid,count,rating
166,166,3666,573.25
457,457,2042,162.00
6338,6338,1896,153.75
80,80,1835,412.25
1364,1364,1792,327.50
...,...,...,...
37553,37554,1,0.25
42281,42283,1,0.25
11800,11800,1,0.25
45731,45733,1,0.25


In [7]:
# min_max
max_rating = pop_count.iloc[0]['count']
pop_normalized = pop_count.copy()
pop_normalized = pop_normalized[['tid', 'count']]
pop_normalized['rating'] = pop_normalized['count']
pop_normalized['rating'] /= max_rating
pop_normalized['rating'] *= 5
pop_normalized

Unnamed: 0,tid,count,rating
166,166,3666,5.000000
457,457,2042,2.785052
6338,6338,1896,2.585925
80,80,1835,2.502728
1364,1364,1792,2.444081
...,...,...,...
37553,37554,1,0.001364
42281,42283,1,0.001364
11800,11800,1,0.001364
45731,45733,1,0.001364


In [8]:
lambdas = []
for i in range(11):
    lambdas.append(i/10)
lambdas

[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

In [9]:
top_n_size = 4
optimal_lambdas = []
results_precision = []
for i in range(len(groups_n_train)):
    results_precision.append([])
truth_rank = []
for i in range(top_n_size):
    truth_rank.append(i+1)

In [10]:
for lambda_ in tqdm(lambdas):
    top_n_lists_g = []
    for groups in groups_n_train:

        group_size = len(groups[0])
        top_n_lists = []

        for group in groups:
            rating_table = pd.DataFrame() 
            for member in group:
                prediction = svd[svd['uid'] == member].copy()
                if len(rating_table) == 0:
                    rating_table = prediction
                else:
                    rating_table = rating_table.set_index('tid').add(prediction.set_index('tid'), fill_value=0).reset_index()
            rating_table['rating'] /= group_size 
            rating_table['rating'] *= lambda_
            pop = pop_normalized.copy()
            pop['rating'] *= (1 - lambda_)
            rating_table = rating_table.set_index('tid').add(pop.set_index('tid'), fill_value=0).reset_index()
    #         rating_table = rating_table[rating_table['count'] == group_size]
            rating_table = rating_table.sort_values(by=['rating'],  ascending=False)
            rating_table = rating_table[:top_n_size]
            top_n_list = []
            for _, row in rating_table.iterrows():
                top_n_list.append(row[0])
            top_n_lists.append(top_n_list)
        top_n_lists_g.append(top_n_lists)   
        
    threshold = 0.0
    for i in range(len(groups_n_train)):
        groups = groups_n_train[i]
        top_n_lists = top_n_lists_g[i]

        precisions = []
        for j in (range(len(groups))):

            group = groups[j]
            top_n_list = top_n_lists[j]

            high_rating = 0

            for k in range(top_n_size):
                for l in range(len(group)):
                    uid = group[l]
                    tid = top_n_list[k]
                    t = test[test['uid'] == uid]
                    t = t[t['tid'] == tid]
                    if len(t) > 0 and t.iloc[0]['rating'] > threshold:
                        high_rating += 1
                        break

            precision = high_rating / top_n_size
            precisions.append(precision)
        
        avg_precision = 0
        for precision in precisions:
            avg_precision += precision
        avg_precision /= len(precisions)
        
        
        results_precision[i].append(avg_precision)

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))




In [11]:
for i in results_precision:
    print(i)

[0.16785714285714284, 0.2, 0.22142857142857142, 0.26785714285714285, 0.29642857142857143, 0.3464285714285714, 0.4, 0.42142857142857143, 0.46785714285714286, 0.4357142857142857, 0.38571428571428573]
[0.2625, 0.26875, 0.2875, 0.3125, 0.35625, 0.40625, 0.49375, 0.54375, 0.525, 0.53125, 0.45625]
[0.3958333333333333, 0.46875, 0.46875, 0.5104166666666666, 0.4895833333333333, 0.5208333333333334, 0.6458333333333334, 0.65625, 0.59375, 0.5729166666666666, 0.5520833333333334]
[0.453125, 0.46875, 0.46875, 0.484375, 0.5, 0.484375, 0.5, 0.640625, 0.625, 0.59375, 0.53125]


In [22]:
avg = []
for i in range(len(results_precision[0])):
    avg.append((results_precision[0][i] + results_precision[1][i] + results_precision[2][i] + results_precision[3][i])/4)
avg

[0.31982886904761904,
 0.3515625,
 0.36160714285714285,
 0.39378720238095233,
 0.4105654761904762,
 0.4394717261904762,
 0.5098958333333333,
 0.5655133928571429,
 0.5529017857142857,
 0.5334077380952381,
 0.48132440476190474]

In [25]:
final_results_precision = []
for i in range(len(groups_n_test)):
    final_results_precision.append([])

In [26]:
print(lambdas)

[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]


In [27]:
for lambda_ in tqdm(lambdas):
    top_n_lists_g = []
    for groups in groups_n_test:

        group_size = len(groups[0])
        top_n_lists = []

        for group in groups:
            rating_table = pd.DataFrame() 
            for member in group:
                prediction = svd[svd['uid'] == member].copy()
                if len(rating_table) == 0:
                    rating_table = prediction
                else:
                    rating_table = rating_table.set_index('tid').add(prediction.set_index('tid'), fill_value=0).reset_index()
            rating_table['rating'] /= group_size 
            rating_table['rating'] *= lambda_
            pop = pop_normalized.copy()
            pop['rating'] *= (1 - lambda_)
            rating_table = rating_table.set_index('tid').add(pop.set_index('tid'), fill_value=0).reset_index()
    #         rating_table = rating_table[rating_table['count'] == group_size]
            rating_table = rating_table.sort_values(by=['rating'],  ascending=False)
            rating_table = rating_table[:top_n_size]
            top_n_list = []
            for _, row in rating_table.iterrows():
                top_n_list.append(row[0])
            top_n_lists.append(top_n_list)
        top_n_lists_g.append(top_n_lists)   
        
    threshold = 0.0
    for i in range(len(groups_n_test)):
        groups = groups_n_test[i]
        top_n_lists = top_n_lists_g[i]

        precisions = []
        for j in (range(len(groups))):

            group = groups[j]
            top_n_list = top_n_lists[j]

            high_rating = 0

            for k in range(top_n_size):
                for l in range(len(group)):
                    uid = group[l]
                    tid = top_n_list[k]
                    t = test[test['uid'] == uid]
                    t = t[t['tid'] == tid]
                    if len(t) > 0 and t.iloc[0]['rating'] > threshold:
                        high_rating += 1
                        break

            precision = high_rating / top_n_size
            precisions.append(precision)
    
        avg_precision = 0
        for precision in precisions:
            avg_precision += precision
        avg_precision /= len(precisions)

        final_results_precision[i].append(avg_precision)

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))




In [28]:
for i in final_results_precision:
    print(i)

[0.20833333333333334, 0.2361111111111111, 0.2222222222222222, 0.25, 0.3055555555555556, 0.3055555555555556, 0.3888888888888889, 0.4166666666666667, 0.4444444444444444, 0.4583333333333333, 0.4027777777777778]
[0.275, 0.325, 0.3, 0.35, 0.375, 0.45, 0.625, 0.625, 0.675, 0.675, 0.575]
[0.2857142857142857, 0.2857142857142857, 0.2857142857142857, 0.32142857142857145, 0.32142857142857145, 0.35714285714285715, 0.39285714285714285, 0.5, 0.4642857142857143, 0.42857142857142855, 0.5]
[0.35, 0.35, 0.35, 0.35, 0.4, 0.35, 0.35, 0.3, 0.35, 0.25, 0.2]


In [29]:
avg = []
for i in range(len(final_results_precision[0])):
    avg.append((final_results_precision[0][i] + final_results_precision[1][i] + final_results_precision[2][i] + final_results_precision[3][i])/4)
avg

[0.27976190476190477,
 0.2992063492063492,
 0.28948412698412695,
 0.31785714285714284,
 0.3504960317460317,
 0.3656746031746032,
 0.43918650793650793,
 0.4604166666666667,
 0.48343253968253974,
 0.4529761904761905,
 0.41944444444444445]