In [150]:
import os 
import math
import random
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from scipy.stats import kendalltau
from scipy.stats import rankdata
from sklearn.model_selection import train_test_split
from numpy import count_nonzero

dir_ = '../data/'
group_dir_ = '../data/groups/high'

In [151]:
file_name = 'normalized_to_rating_filter_track_5_user_100.csv'
svd = pd.read_pickle(os.path.join(dir_, 'prediction_svd_top_N_' + file_name[:-3] + 'pkl'))
test = pd.read_pickle(os.path.join(dir_, 'test_' + file_name[:-3] + 'pkl'))
test.sort_values(by=['uid','tid'])

file_name = 'normalized_popularity_filter_track_5_user_100.csv'
pop_count = pd.read_pickle(os.path.join(dir_, 'prediction_popularity_count_top_N_' + file_name[:-3] + 'pkl'))
pop_rating = pd.read_pickle(os.path.join(dir_, 'prediction_popularity_rating_top_N_' + file_name[:-3] + 'pkl'))

num_user = len(svd['uid'].unique())
num_user

220

In [152]:
groups_n_train = []
groups_n_test = []
group_sizes = ['2', '3', '4', '5']
for i in range(len(group_sizes)):
    groups = np.loadtxt(os.path.join(group_dir_, 'group_' + group_sizes[i] + '.csv'), delimiter=',')
    groups_train, groups_test = train_test_split(groups, test_size=0.2, random_state=np.random)
    groups_n_train.append(groups_train)
    groups_n_test.append(groups_test)

In [153]:
for i in groups_n_train:
    print(len(i[0]))

2
3
4
5


In [154]:
svd['count'] = 1
svd

Unnamed: 0,uid,tid,rating,count
0,0.0,0.0,2.493650,1
1,0.0,6.0,2.333071,1
2,0.0,8.0,2.713340,1
3,0.0,9.0,1.949588,1
4,0.0,12.0,2.477737,1
...,...,...,...,...
11741994,219.0,54964.0,1.816855,1
11741995,219.0,54965.0,2.315346,1
11741996,219.0,54966.0,1.954432,1
11741997,219.0,54967.0,2.071366,1


In [155]:
pop_count = pop_count.sort_values(by=['count'],  ascending=False)
pop_count

Unnamed: 0,tid,count,rating
166,166,1882,333.50
6310,6338,1589,92.25
17316,17377,1555,43.50
6254,6282,1396,17.50
13055,13104,1340,33.50
...,...,...,...
25819,25941,1,0.25
43440,43743,1,0.25
54130,54607,1,0.25
27486,27620,1,0.25


In [156]:
# min_max
max_rating = pop_count.iloc[0]['count']
pop_normalized = pop_count.copy()
pop_normalized = pop_normalized[['tid', 'count']]
pop_normalized['rating'] = pop_normalized['count']
pop_normalized['rating'] /= max_rating
pop_normalized['rating'] *= 5
pop_normalized

Unnamed: 0,tid,count,rating
166,166,1882,5.000000
6310,6338,1589,4.221573
17316,17377,1555,4.131243
6254,6282,1396,3.708820
13055,13104,1340,3.560043
...,...,...,...
25819,25941,1,0.002657
43440,43743,1,0.002657
54130,54607,1,0.002657
27486,27620,1,0.002657


In [157]:
def dcg_at_k(r, k, method=0):
    r = np.asfarray(r)[:k]
    for i in range(len(r)):
        r[i] = 2**r[i] -1
    
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.


def ndcg_at_k(r, k, method=0):
    dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k, method) / dcg_max

In [158]:
lambdas = []
for i in range(11):
    lambdas.append(i/10)
lambdas

[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

In [159]:
top_n_size = 20
optimal_lambdas = []
results_precision = []
results_satisfaction = []
results_tau = []
results_DCG = []
results_nDCG = []
for i in range(len(groups_n_train)):
    results_tau.append([])
    results_nDCG.append([])
    results_precision.append([])
    results_DCG.append([])
    results_satisfaction.append([])
truth_rank = []
for i in range(top_n_size):
    truth_rank.append(i+1)

In [160]:
for lambda_ in tqdm(lambdas):
    top_n_lists_g = []
    for groups in groups_n_train:

        group_size = len(groups[0])
        top_n_lists = []

        for group in groups:
            rating_table = pd.DataFrame() 
            for member in group:
                prediction = svd[svd['uid'] == member].copy()
                if len(rating_table) == 0:
                    rating_table = prediction
                else:
                    rating_table = rating_table.set_index('tid').add(prediction.set_index('tid'), fill_value=0).reset_index()
            rating_table['rating'] /= group_size 
            rating_table['rating'] *= lambda_
            pop = pop_normalized.copy()
            pop['rating'] *= (1 - lambda_)
            rating_table = rating_table.set_index('tid').add(pop.set_index('tid'), fill_value=0).reset_index()
    #         rating_table = rating_table[rating_table['count'] == group_size]
            rating_table = rating_table.sort_values(by=['rating'],  ascending=False)
            rating_table = rating_table[:top_n_size]
            top_n_list = []
            for _, row in rating_table.iterrows():
                top_n_list.append(row[0])
            top_n_lists.append(top_n_list)
        top_n_lists_g.append(top_n_lists)   
        
    threshold = 0.0
    for i in range(len(groups_n_train)):
        groups = groups_n_train[i]
        top_n_lists = top_n_lists_g[i]

        precisions = []
        for j in (range(len(groups))):

            group = groups[j]
            top_n_list = top_n_lists[j]

            high_rating = 0

            for k in range(top_n_size):
                for l in range(len(group)):
                    uid = group[l]
                    tid = top_n_list[k]
                    t = test[test['uid'] == uid]
                    t = t[t['tid'] == tid]
                    if len(t) > 0 and t.iloc[0]['rating'] > threshold:
                        high_rating += 1
                        break

            precision = high_rating / top_n_size
            precisions.append(precision)
        
        satisfactions = []
        for j in (range(len(groups))):

            group = groups[j]
            top_n_list = top_n_lists[j]

            high_ratings_t = np.zeros(top_n_size)
            high_ratings_u = np.zeros(len(group))

            for k in range(top_n_size):
                for l in range(len(group)):
                    uid = group[l]
                    tid = top_n_list[k]
                    t = test[test['uid'] == uid]
                    t = t[t['tid'] == tid]
                    if len(t) > 0 and t.iloc[0]['rating'] > threshold:
                        high_ratings_t[k] += 1
                        high_ratings_u[l] += 1

            satisfaction_t = 0
            for k in range(top_n_size):
                satisfaction_t += math.log(high_ratings_t[k]+1, 10) / math.log(len(group)+1, 10)
            satisfaction_t /= top_n_size

            satisfaction_u = 0
            for k in range(len(group)):
                satisfaction_u += math.log(high_ratings_u[k]+1, 10) / math.log(top_n_size+1, 10)
            satisfaction_u /= len(group)

            satisfaction = 0.5*satisfaction_t + (1-0.5)*satisfaction_u
            satisfactions.append(satisfaction)
        
        taus = []
        for j in (range(len(groups))):

            group = groups[j]
            top_n_list = top_n_lists[j]

            rating_table = pd.DataFrame() 
            for member in group:
                average = test[test['uid'] == member].copy()
                if len(rating_table) == 0:
                    rating_table = average
                else:
                    rating_table = rating_table.set_index('tid').add(average.set_index('tid'), fill_value=0).reset_index()
            rating_table['rating'] /= len(group) 
            rating_table = rating_table.sort_values(by=['rating'],  ascending=False)
            rating_table = rating_table[:top_n_size]
            
            ground_truth = []
            for _, row in rating_table.iterrows():
                ground_truth.append(row[0])
            
            pred_rank = []
            for k in top_n_list:
                if k in ground_truth:
                    pred_rank.append(ground_truth.index(k)+1)
                else:
                    pred_rank.append(top_n_size+1)
            tau, p_value = kendalltau(truth_rank, pred_rank)
            taus.append(tau)

        nDCGs = []
        DCGs = []
        for j in (range(len(groups))):

            group = groups[j]
            top_n_list = top_n_lists[j]

            rating_table = pd.DataFrame() 
            for member in group:
                average = test[test['uid'] == member].copy()
                if len(rating_table) == 0:
                    rating_table = average
                else:
                    rating_table = rating_table.set_index('tid').add(average.set_index('tid'), fill_value=0).reset_index()
            rating_table['rating'] /= len(group) 
            rating_table = rating_table.sort_values(by=['rating'],  ascending=False)
            
            truth_rating = []
            for k in top_n_list:
                r = rating_table[rating_table['tid']==k]
                if len(r) > 0:
                    truth_rating.append(r.iloc[0]['rating'])
                else:
                    truth_rating.append(0)
            nDCG = ndcg_at_k(truth_rating, top_n_size, method=1)
            DCG = dcg_at_k(truth_rating, top_n_size, method=1)
            nDCGs.append(nDCG)
            DCGs.append(DCG)
            

    
        avg_satisfaction = 0
        for satisfaction in satisfactions:
            avg_satisfaction += satisfaction
        avg_satisfaction /= len(satisfactions)
    
        avg_precision = 0
        for precision in precisions:
            avg_precision += precision
        avg_precision /= len(precisions)
        
        n_tau = 0
        avg_tau = 0
        for tau in taus:
            if not np.isnan(tau):
                avg_tau += tau
                n_tau += 1
        avg_tau /= n_tau
        
        avg_nDCG = 0
        for nDCG in nDCGs:
            avg_nDCG += nDCG
        avg_nDCG /= len(nDCGs)      
        
        avg_DCG = 0
        for DCG in DCGs:
            avg_DCG += DCG
        avg_DCG /= len(DCGs)    
        
        results_precision[i].append(avg_precision)
        results_satisfaction[i].append(avg_satisfaction)
        results_tau[i].append(avg_tau)
        results_nDCG[i].append(avg_nDCG)
        results_DCG[i].append(avg_DCG)

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))




In [161]:
for i in results_precision:
    print(i)
print('-------------------------------')
for i in results_satisfaction:
    print(i)

[0.3280303030303031, 0.3181818181818182, 0.32954545454545453, 0.3598484848484849, 0.3825757575757577, 0.4227272727272728, 0.4583333333333333, 0.4795454545454546, 0.4560606060606061, 0.37803030303030294, 0.2992424242424241]
[0.4954545454545455, 0.48484848484848486, 0.48030303030303023, 0.49848484848484836, 0.540909090909091, 0.5606060606060606, 0.5863636363636365, 0.6045454545454545, 0.5848484848484848, 0.5303030303030303, 0.446969696969697]
[0.5375000000000001, 0.53, 0.5225000000000001, 0.5549999999999999, 0.5725, 0.6100000000000001, 0.6399999999999999, 0.6625000000000001, 0.63, 0.5850000000000001, 0.5125000000000001]
[0.7071428571428573, 0.6928571428571428, 0.6785714285714286, 0.6964285714285714, 0.7107142857142857, 0.7142857142857143, 0.7321428571428571, 0.75, 0.7500000000000001, 0.675, 0.6142857142857144]
-------------------------------
[0.34796278088542754, 0.3405667586679932, 0.3496548913717351, 0.3730111487532514, 0.3900059667419919, 0.4190024447186593, 0.44372497358104, 0.456625

In [162]:
ranks_precision = []
ranks_tau = []
ranks_DCG = []
ranks_nDCG = []
ranks_satisfaction = []

In [163]:
for i in results_precision:
    ranks_precision.append(rankdata(i))
for i in results_tau:
    ranks_tau.append(rankdata(i))
for i in results_DCG:
    ranks_DCG.append(rankdata(i))
for i in results_nDCG:
    ranks_nDCG.append(rankdata(i))
for i in results_satisfaction:
    ranks_satisfaction.append(rankdata(i))

In [164]:
optimized_lambdas = []
for i in range(len(groups_n_test)):
    print(ranks_satisfaction[i])
    print('--------------------------------------------------')
    print(ranks_precision[i])
#     print(ranks_tau[i])
#     print(ranks_DCG[i])
#     print(ranks_nDCG[i])
#     print(ranks_precision[i] + ranks_tau[i] + ranks_DCG[i] + ranks_nDCG[i])
    ranks = (ranks_precision[i] + ranks_tau[i] + ranks_DCG[i] + ranks_nDCG[i]).tolist()
    print(ranks.index(max(ranks)))
    print('--------------------------------------------------')
#     optimized_lambdas.append()

[ 3.  2.  4.  6.  7.  8. 10. 11.  9.  5.  1.]
--------------------------------------------------
[ 3.  2.  4.  5.  7.  8. 10. 11.  9.  6.  1.]
7
--------------------------------------------------
[ 4.  2.  3.  5.  7.  8. 10. 11.  9.  6.  1.]
--------------------------------------------------
[ 4.  3.  2.  5.  7.  8. 10. 11.  9.  6.  1.]
8
--------------------------------------------------
[ 4.  3.  2.  5.  7.  8. 10. 11.  9.  6.  1.]
--------------------------------------------------
[ 4.  3.  2.  5.  6.  8. 10. 11.  9.  7.  1.]
8
--------------------------------------------------
[ 5.  4.  2.  6.  7.  8. 10. 11.  9.  3.  1.]
--------------------------------------------------
[ 6.  4.  3.  5.  7.  8.  9. 10. 11.  2.  1.]
8
--------------------------------------------------


In [165]:
final_results_precision = []
final_results_tau = []
final_results_DCG = []
final_results_nDCG = []
for i in range(len(groups_n_test)):
    final_results_tau.append([])
    final_results_nDCG.append([])
    final_results_precision.append([])
    final_results_DCG.append([])
truth_rank = []
for i in range(top_n_size):
    truth_rank.append(i+1)

In [166]:
print(lambdas)

[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]


In [167]:
for lambda_ in tqdm(lambdas):
    top_n_lists_g = []
    for groups in groups_n_test:

        group_size = len(groups[0])
        top_n_lists = []

        for group in groups:
            rating_table = pd.DataFrame() 
            for member in group:
                prediction = svd[svd['uid'] == member].copy()
                if len(rating_table) == 0:
                    rating_table = prediction
                else:
                    rating_table = rating_table.set_index('tid').add(prediction.set_index('tid'), fill_value=0).reset_index()
            rating_table['rating'] /= group_size 
            rating_table['rating'] *= lambda_
            pop = pop_normalized.copy()
            pop['rating'] *= (1 - lambda_)
            rating_table = rating_table.set_index('tid').add(pop.set_index('tid'), fill_value=0).reset_index()
    #         rating_table = rating_table[rating_table['count'] == group_size]
            rating_table = rating_table.sort_values(by=['rating'],  ascending=False)
            rating_table = rating_table[:top_n_size]
            top_n_list = []
            for _, row in rating_table.iterrows():
                top_n_list.append(row[0])
            top_n_lists.append(top_n_list)
        top_n_lists_g.append(top_n_lists)   
        
    threshold = 0.0
    for i in range(len(groups_n_test)):
        groups = groups_n_test[i]
        top_n_lists = top_n_lists_g[i]

        precisions = []
        for j in (range(len(groups))):

            group = groups[j]
            top_n_list = top_n_lists[j]

            high_rating = 0

            for k in range(top_n_size):
                for l in range(len(group)):
                    uid = group[l]
                    tid = top_n_list[k]
                    t = test[test['uid'] == uid]
                    t = t[t['tid'] == tid]
                    if len(t) > 0 and t.iloc[0]['rating'] > threshold:
                        high_rating += 1
                        break

            precision = high_rating / top_n_size
            precisions.append(precision)
        
#         taus = []
#         for j in (range(len(groups))):

#             group = groups[j]
#             top_n_list = top_n_lists[j]

#             rating_table = pd.DataFrame() 
#             for member in group:
#                 average = test[test['uid'] == member].copy()
#                 if len(rating_table) == 0:
#                     rating_table = average
#                 else:
#                     rating_table = rating_table.set_index('tid').add(average.set_index('tid'), fill_value=0).reset_index()
#             rating_table['rating'] /= len(group) 
#             rating_table = rating_table.sort_values(by=['rating'],  ascending=False)
#             rating_table = rating_table[:top_n_size]
            
#             ground_truth = []
#             for _, row in rating_table.iterrows():
#                 ground_truth.append(row[0])
            
#             pred_rank = []
#             for k in top_n_list:
#                 if k in ground_truth:
#                     pred_rank.append(ground_truth.index(k)+1)
#                 else:
#                     pred_rank.append(top_n_size+1)
#             tau, p_value = kendalltau(truth_rank, pred_rank)
#             taus.append(tau)

#         nDCGs = []
#         DCGs = []
#         for j in (range(len(groups))):

#             group = groups[j]
#             top_n_list = top_n_lists[j]

#             rating_table = pd.DataFrame() 
#             for member in group:
#                 average = test[test['uid'] == member].copy()
#                 if len(rating_table) == 0:
#                     rating_table = average
#                 else:
#                     rating_table = rating_table.set_index('tid').add(average.set_index('tid'), fill_value=0).reset_index()
#             rating_table['rating'] /= len(group) 
#             rating_table = rating_table.sort_values(by=['rating'],  ascending=False)
            
#             truth_rating = []
#             for k in top_n_list:
#                 r = rating_table[rating_table['tid']==k]
#                 if len(r) > 0:
#                     truth_rating.append(r.iloc[0]['rating'])
#                 else:
#                     truth_rating.append(0)
#             nDCG = ndcg_at_k(truth_rating, top_n_size, method=1)
#             DCG = dcg_at_k(truth_rating, top_n_size, method=1)
#             nDCGs.append(nDCG)
#             DCGs.append(DCG)
    
        avg_precision = 0
        for precision in precisions:
            avg_precision += precision
        avg_precision /= len(precisions)
        
#         n_tau = 0
#         avg_tau = 0
#         for tau in taus:
#             if not np.isnan(tau):
#                 avg_tau += tau
#                 n_tau += 1
#         print(avg_tau, n_tau)
#         avg_tau /= n_tau
        
#         avg_nDCG = 0
#         for nDCG in nDCGs:
#             avg_nDCG += nDCG
#         avg_nDCG /= len(nDCGs)      
        
#         avg_DCG = 0
#         for DCG in DCGs:
#             avg_DCG += DCG
#         avg_DCG /= len(DCGs)    
        
        final_results_precision[i].append(avg_precision)
#         final_results_tau[i].append(avg_tau)
#         final_results_nDCG[i].append(avg_nDCG)
#         final_results_DCG[i].append(avg_DCG)

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))




In [168]:
for i in final_results_precision:
    print(i)
# print('--------------------------------')
# for i in final_results_tau:
#     print(i)
# print('--------------------------------')
# for i in final_results_DCG:
#     print(i)    
# print('--------------------------------')
# for i in final_results_nDCG:
#     print(i)

[0.3676470588235294, 0.3676470588235294, 0.39117647058823524, 0.4441176470588236, 0.5029411764705882, 0.5294117647058822, 0.5558823529411764, 0.5470588235294118, 0.5235294117647059, 0.48529411764705893, 0.3970588235294118]
[0.5166666666666666, 0.49444444444444446, 0.5055555555555555, 0.5111111111111111, 0.5333333333333333, 0.538888888888889, 0.5777777777777778, 0.5777777777777778, 0.5555555555555556, 0.49444444444444446, 0.4444444444444444]
[0.6749999999999999, 0.6583333333333333, 0.6416666666666667, 0.65, 0.6749999999999999, 0.6833333333333332, 0.7166666666666667, 0.6916666666666665, 0.6666666666666666, 0.6333333333333333, 0.5833333333333334]
[0.5375000000000001, 0.525, 0.5125, 0.5375000000000001, 0.55, 0.5125, 0.55, 0.575, 0.4875, 0.4625, 0.4]
