In [54]:
import os 
import math
import random
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from scipy.stats import kendalltau
from scipy.stats import rankdata
from sklearn.model_selection import train_test_split
from numpy import count_nonzero

dir_ = '../data/'
group_dir_ = '../data/groups/low'

In [55]:
file_name = 'normalized_to_rating_filter_track_5_user_100.csv'
svd = pd.read_pickle(os.path.join(dir_, 'prediction_svd_top_N_' + file_name[:-3] + 'pkl'))
test = pd.read_pickle(os.path.join(dir_, 'test_' + file_name[:-3] + 'pkl'))
test.sort_values(by=['uid','tid'])

file_name = 'normalized_popularity_filter_track_5_user_100.csv'
pop_count = pd.read_pickle(os.path.join(dir_, 'prediction_popularity_count_top_N_' + file_name[:-3] + 'pkl'))
pop_rating = pd.read_pickle(os.path.join(dir_, 'prediction_popularity_rating_top_N_' + file_name[:-3] + 'pkl'))

num_user = len(svd['uid'].unique())
num_user

220

In [56]:
groups_n_train = []
groups_n_test = []
group_sizes = ['2', '3', '4', '5']
for i in range(len(group_sizes)):
    groups = np.loadtxt(os.path.join(group_dir_, 'group_' + group_sizes[i] + '.csv'), delimiter=',')
    num_groups = len(groups)
    groups_part1 =  groups[:int(num_groups*0.2)]
    groups_part2 =  groups[int(num_groups*0.2):int(num_groups*0.4)]
    groups_part3 =  groups[int(num_groups*0.4):int(num_groups*0.6)]
    groups_part4 =  groups[int(num_groups*0.6):int(num_groups*0.8)]
    groups_part5 =  groups[int(num_groups*0.8):]
    groups_train = np.concatenate((groups_part2, groups_part3, groups_part4, groups_part5))
    groups_test = groups_part1
    print(len(groups_train), len(groups_test))
    groups_n_train.append(groups_train)
    groups_n_test.append(groups_test)

68 16
28 7
17 4
13 3


In [57]:
for i in groups_n_train:
    print(len(i[0]))

2
3
4
5


In [58]:
svd['count'] = 1
svd

Unnamed: 0,uid,tid,rating,count
0,0.0,0.0,2.493650,1
1,0.0,6.0,2.333071,1
2,0.0,8.0,2.713340,1
3,0.0,9.0,1.949588,1
4,0.0,12.0,2.477737,1
...,...,...,...,...
11741994,219.0,54964.0,1.816855,1
11741995,219.0,54965.0,2.315346,1
11741996,219.0,54966.0,1.954432,1
11741997,219.0,54967.0,2.071366,1


In [59]:
pop_count = pop_count.sort_values(by=['count'],  ascending=False)
pop_count

Unnamed: 0,tid,count,rating
166,166,1882,333.50
6310,6338,1589,92.25
17316,17377,1555,43.50
6254,6282,1396,17.50
13055,13104,1340,33.50
...,...,...,...
25819,25941,1,0.25
43440,43743,1,0.25
54130,54607,1,0.25
27486,27620,1,0.25


In [60]:
# min_max
max_rating = pop_count.iloc[0]['count']
pop_normalized = pop_count.copy()
pop_normalized = pop_normalized[['tid', 'count']]
pop_normalized['rating'] = pop_normalized['count']
pop_normalized['rating'] /= max_rating
pop_normalized['rating'] *= 5
pop_normalized

Unnamed: 0,tid,count,rating
166,166,1882,5.000000
6310,6338,1589,4.221573
17316,17377,1555,4.131243
6254,6282,1396,3.708820
13055,13104,1340,3.560043
...,...,...,...
25819,25941,1,0.002657
43440,43743,1,0.002657
54130,54607,1,0.002657
27486,27620,1,0.002657


In [61]:
lambdas = []
for i in range(11):
    lambdas.append(i/10)
lambdas

[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

In [62]:
top_n_size = 100
results_precision = []
for i in range(len(groups_n_train)):
    results_precision.append([])
    
truth_rank = []
for i in range(top_n_size):
    truth_rank.append(i+1)

In [63]:
for lambda_ in tqdm(lambdas):
    top_n_lists_g = []
    for groups in groups_n_train:

        group_size = len(groups[0])
        top_n_lists = []

        for group in groups:
            rating_table = pd.DataFrame() 
            for member in group:
                prediction = svd[svd['uid'] == member].copy()
                if len(rating_table) == 0:
                    rating_table = prediction
                else:
                    rating_table = rating_table.set_index('tid').add(prediction.set_index('tid'), fill_value=0).reset_index()
            rating_table['rating'] /= group_size 
            rating_table['rating'] *= lambda_
            pop = pop_normalized.copy()
            pop['rating'] *= (1 - lambda_)
            rating_table = rating_table.set_index('tid').add(pop.set_index('tid'), fill_value=0).reset_index()
    #         rating_table = rating_table[rating_table['count'] == group_size]
            rating_table = rating_table.sort_values(by=['rating'],  ascending=False)
            rating_table = rating_table[:top_n_size]
            top_n_list = []
            for _, row in rating_table.iterrows():
                top_n_list.append(row[0])
            top_n_lists.append(top_n_list)
        top_n_lists_g.append(top_n_lists)   
        
    threshold = 0.0
    for i in range(len(groups_n_train)):
        groups = groups_n_train[i]
        top_n_lists = top_n_lists_g[i]

        precisions = []
        for j in (range(len(groups))):

            group = groups[j]
            top_n_list = top_n_lists[j]

            high_rating = 0

            for k in range(top_n_size):
                for l in range(len(group)):
                    uid = group[l]
                    tid = top_n_list[k]
                    t = test[test['uid'] == uid]
                    t = t[t['tid'] == tid]
                    if len(t) > 0 and t.iloc[0]['rating'] > threshold:
                        high_rating += 1
                        break

            precision = high_rating / top_n_size
            precisions.append(precision)
        
        avg_precision = 0
        for precision in precisions:
            avg_precision += precision
        avg_precision /= len(precisions)
        
        results_precision[i].append(avg_precision)

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))




In [64]:
avg_precision = np.zeros(11)
for i in results_precision:
    avg_precision = [a + b for a, b in zip(avg_precision, i)]
avg_precision = [a/len(groups_n_train) for a in avg_precision]
avg_precision

[0.2819731738849386,
 0.29379444085326434,
 0.3039313994828701,
 0.3136651583710407,
 0.32372535552682613,
 0.329793956043956,
 0.3371230607627666,
 0.33417784421460894,
 0.3142368293471235,
 0.2619158047834519,
 0.18582942792501617]

In [65]:
lambda_ = 0.6

In [66]:
top_n_size = 100
top_n_lists_g_hybrid = []
top_n_lists_g_pop = []
top_n_lists_g_svd = []

for groups in groups_n_test:

    group_size = len(groups[0])
    top_n_lists_hybrid = []
    top_n_lists_pop = []
    top_n_lists_svd = []

    for group in groups:

        predictions = []
        rating_table = pd.DataFrame() 
        for member in group:
            prediction = svd[svd['uid'] == member].copy()
            if len(rating_table) == 0:
                rating_table = prediction
            else:
                rating_table = rating_table.set_index('tid').add(prediction.set_index('tid'), fill_value=0).reset_index()
            predictions.append(prediction)
        rating_table['rating'] /= group_size 
        rating_table['rating'] *= lambda_
        rating_table = rating_table.sort_values(by=['rating'],  ascending=False)
        
        top_n_list_svd = []
        for _, row in rating_table[:top_n_size].iterrows():
            top_n_list_svd.append(row[0])
        
        pop = pop_normalized.copy()
        
        top_n_list_pop = []
        for _, row in pop[:top_n_size].iterrows():
            top_n_list_pop.append(row[0])
            
        pop['rating'] *= (1 - lambda_)
        rating_table = rating_table.set_index('tid').add(pop.set_index('tid'), fill_value=0).reset_index()
#         rating_table = rating_table[rating_table['count'] == group_size]
        rating_table = rating_table.sort_values(by=['rating'],  ascending=False)
        rating_table = rating_table[:top_n_size]
        
        top_n_list_hybrid = []
        for _, row in rating_table.iterrows():
            top_n_list_hybrid.append(row[0])


            
        top_n_lists_hybrid.append(top_n_list_hybrid)
        top_n_lists_pop.append(top_n_list_pop)
        top_n_lists_svd.append(top_n_list_svd)
        
    top_n_lists_g_hybrid.append(top_n_lists_hybrid)
    top_n_lists_g_pop.append(top_n_lists_pop)
    top_n_lists_g_svd.append(top_n_lists_svd)

In [67]:
threshold = 0.0
cdfs_hybrid = []
for i in range(len(groups_n_test)):
    groups = groups_n_test[i]
    top_n_lists_hybrid = top_n_lists_g_hybrid[i]

    cdf_n = np.zeros(top_n_size)
    precisions = []
    for j in (range(len(groups))):

        group = groups[j]
        top_n_list_hybrid = top_n_lists_hybrid[j]
        
        cdf = []
        high_rating = 0
        
        for k in range(top_n_size):
            for l in range(len(group)):
                uid = group[l]
                tid = top_n_list_hybrid[k]
                t = test[test['uid'] == uid]
                t = t[t['tid'] == tid]
                if len(t) > 0 and t.iloc[0]['rating'] > threshold:
                    high_rating += 1
                    break
            cdf.append(high_rating)
        cdf_n = [a + b for a, b in zip(cdf_n, cdf)]
        precision = high_rating / top_n_size
        precisions.append(precision)
        
    cdf_n = [i/len(groups) for i in cdf_n]
    cdfs_hybrid.append(cdf_n)

    avg_precision = 0
    for precision in precisions:
        avg_precision += precision
    avg_precision /= len(precisions)
    print(avg_precision)

0.5225
0.4928571428571429
0.495
0.39999999999999997


In [68]:
r = np.zeros(100)
for cdf in cdfs_hybrid:
    r = [a + b for a, b in zip(r, cdf)]
r = [i/len(groups_n_test) for i in r]
r

[0.6011904761904762,
 1.1041666666666667,
 1.5758928571428572,
 1.8392857142857144,
 2.1145833333333335,
 2.6220238095238093,
 3.2797619047619047,
 4.011160714285714,
 4.549851190476191,
 4.973958333333333,
 5.268601190476191,
 5.646577380952381,
 6.153273809523809,
 6.661458333333333,
 7.133184523809525,
 7.541666666666667,
 8.206845238095237,
 8.616071428571429,
 9.056547619047619,
 9.497023809523808,
 9.994047619047619,
 10.523809523809524,
 11.197916666666666,
 11.887648809523808,
 12.436011904761905,
 12.735863095238095,
 13.203125,
 13.834077380952381,
 14.352678571428571,
 14.78199404761905,
 15.04613095238095,
 15.486607142857142,
 15.912202380952381,
 16.426339285714285,
 16.895833333333332,
 17.36830357142857,
 17.848214285714285,
 18.505952380952383,
 19.119047619047617,
 19.69419642857143,
 20.197172619047617,
 20.617559523809526,
 21.051339285714285,
 21.594494047619047,
 22.008184523809526,
 22.377232142857142,
 22.822172619047617,
 23.427083333333332,
 23.9546130952381,


In [69]:
threshold = 0.0
cdfs_pop = []
for i in range(len(groups_n_test)):
    groups = groups_n_test[i]
    top_n_lists_svd = top_n_lists_g_svd[i]
    
    cdf_n = np.zeros(top_n_size)
    precisions = []
    for j in (range(len(groups))):

        group = groups[j]
        top_n_list_svd = top_n_lists_svd[j]

        cdf = []
        high_rating = 0

        for k in range(top_n_size):
            for l in range(len(group)):
                uid = group[l]
                tid = top_n_list_svd[k]
                t = test[test['uid'] == uid]
                t = t[t['tid'] == tid]
                if len(t) > 0 and t.iloc[0]['rating'] > threshold:
                    high_rating += 1
                    break
            cdf.append(high_rating)
        cdf_n = [a + b for a, b in zip(cdf_n, cdf)]
        precision = high_rating / top_n_size
        precisions.append(precision)
        
    cdf_n = [i/len(groups) for i in cdf_n]
    cdfs_pop.append(cdf_n)
    
    avg_precision = 0
    for precision in precisions:
        avg_precision += precision
    avg_precision /= len(precisions)
    print(avg_precision)

0.333125
0.33428571428571424
0.2975
0.21999999999999997


In [70]:
r = np.zeros(100)
for cdf in cdfs_pop:
    r = [a + b for a, b in zip(r, cdf)]
r = [i/len(groups_n_test) for i in r]
r

[0.5632440476190477,
 1.1376488095238095,
 1.5773809523809523,
 2.116071428571429,
 2.665922619047619,
 3.1577380952380953,
 3.6019345238095237,
 4.165922619047619,
 4.819940476190476,
 5.3430059523809526,
 5.626488095238096,
 5.993303571428571,
 6.247767857142858,
 6.78125,
 7.340029761904762,
 7.738095238095238,
 8.029017857142858,
 8.34375,
 8.638392857142858,
 8.716517857142858,
 8.976190476190476,
 9.251488095238095,
 9.649553571428571,
 10.121279761904763,
 10.425595238095237,
 10.674107142857142,
 10.98511904761905,
 11.186011904761905,
 11.335565476190476,
 11.697916666666666,
 12.055803571428571,
 12.40699404761905,
 12.681547619047619,
 12.930059523809524,
 13.272321428571429,
 13.536458333333334,
 13.914434523809524,
 14.34375,
 14.555803571428571,
 14.902529761904763,
 15.213541666666666,
 15.42113095238095,
 15.778273809523808,
 15.907738095238095,
 16.183779761904763,
 16.474702380952383,
 16.841517857142858,
 17.038690476190474,
 17.297619047619047,
 17.478422619047617,


In [71]:
threshold = 0.0
cdfs_svd = []
for i in range(len(groups_n_test)):
    groups = groups_n_test[i]
    top_n_lists_pop = top_n_lists_g_pop[i]
    
    cdf_n = np.zeros(top_n_size)
    precisions = []
    for j in (range(len(groups))):

        group = groups[j]
        top_n_list_pop = top_n_lists_pop[j]
        
        cdf = []
        high_rating = 0

        for k in range(top_n_size):
            for l in range(len(group)):
                uid = group[l]
                tid = top_n_list_pop[k]
                t = test[test['uid'] == uid]
                t = t[t['tid'] == tid]
                if len(t) > 0 and t.iloc[0]['rating'] > threshold:
                    high_rating += 1
                    break
            cdf.append(high_rating)
        cdf_n = [a + b for a, b in zip(cdf_n, cdf)]
        precision = high_rating / top_n_size
        precisions.append(precision)
        
    cdf_n = [i/len(groups) for i in cdf_n]
    cdfs_svd.append(cdf_n)
    
    avg_precision = 0
    for precision in precisions:
        avg_precision += precision
    avg_precision /= len(precisions)
    print(avg_precision)

0.32937500000000003
0.3242857142857143
0.36750000000000005
0.3


In [72]:
r = np.zeros(100)
for cdf in cdfs_svd:
    r = [a + b for a, b in zip(r, cdf)]
r = [i/len(groups_n_test) for i in r]
r

[0.7514880952380952,
 0.9479166666666666,
 1.3526785714285714,
 1.3526785714285714,
 1.5491071428571428,
 2.1391369047619047,
 2.9583333333333335,
 3.0833333333333335,
 3.7261904761904763,
 4.115327380952381,
 4.627232142857142,
 4.823660714285714,
 5.252976190476191,
 5.521577380952381,
 5.651785714285714,
 5.997767857142858,
 6.142857142857142,
 6.158482142857142,
 6.7805059523809526,
 7.050595238095238,
 7.081845238095238,
 7.195684523809525,
 7.460565476190475,
 7.7038690476190474,
 7.962797619047619,
 8.491071428571429,
 8.723958333333334,
 9.196428571428571,
 9.425595238095237,
 9.898065476190476,
 10.349702380952381,
 10.805059523809524,
 10.872023809523808,
 11.28050595238095,
 11.398809523809524,
 11.803571428571429,
 11.834821428571429,
 12.223958333333334,
 12.338541666666666,
 12.913690476190476,
 13.197172619047619,
 13.31175595238095,
 14.037202380952381,
 14.659970238095237,
 14.887648809523808,
 15.292410714285715,
 15.748511904761905,
 15.748511904761905,
 16.105654761