In [1]:
import os 
import math
import random
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from numpy import count_nonzero

dir_ = '../../../../../data/'
group_dir_ = '../../../../../data/groups/random'

In [2]:
file_name = 'normalized_to_rating_filter_track_5_user_100.csv'
df = pd.read_pickle(os.path.join(dir_, file_name[:-3] + 'pkl'))
svd = pd.read_pickle(os.path.join(dir_, 'prediction_svd_top_N_' + file_name[:-3] + 'pkl'))
train = np.loadtxt(os.path.join(dir_, 'rating_matrix_normalized_to_rating_filter_track_5_user_100.csv'), delimiter=',')
test = pd.read_pickle(os.path.join(dir_, 'test_' + file_name[:-3] + 'pkl'))
test.sort_values(by=['uid','tid'])

num_user = len(train)
num_track = len(train[0])
num_user, num_track

(953, 157567)

In [3]:
groups_n =[]
group_sizes = ['2', '3', '4', '5', '6', '7', '8']
for i in range(len(group_sizes)):
    groups = np.loadtxt(os.path.join(group_dir_, group_sizes[i] + '.csv'), delimiter=',')
    groups_n.append(groups)

In [4]:
def pearson_correlation(u1_index, u2_index):
        result = 0.0
        user1_data = train[u1_index]
        user2_data = train[u2_index]

        rx_avg = user_average_rating(user1_data)
        ry_avg = user_average_rating(user2_data)
        sxy = common_items(user1_data, user2_data)

        top_result = 0.0
        bottom_left_result = 0.0
        bottom_right_result = 0.0
        for item in sxy:
            rxs = user1_data[item]
            rys = user2_data[item]

            top_result += (rxs - rx_avg)*(rys - ry_avg)
            bottom_left_result += pow((rxs - rx_avg), 2)
            bottom_right_result += pow((rys - ry_avg), 2)
        bottom_left_result = math.sqrt(bottom_left_result)
        bottom_right_result = math.sqrt(bottom_right_result)
        
        ################################################################
        if (bottom_left_result * bottom_right_result) == 0:
            return -2, -2 # dump the data
        ################################################################
        
        result = top_result/(bottom_left_result * bottom_right_result)
        return len(sxy), result

def user_average_rating(u):
    avg_rating = 0.0
    for i in u:
        avg_rating += i
    avg_rating /= len(u) * 1.0
    return avg_rating

def common_items(u1, u2):
    result = []
    for i in range(num_track):
        if u1[i] > 0 and u2[i] > 0:
            result.append(i)
    return result

In [5]:
similarities = []
for groups in tqdm(groups_n):
    for gid in range(len(groups)):
        similarity = []
        group = groups[gid]
        for i in range(len(group)):
            for j in range(i+1, len(group)):
                u1 = int(group[i])
                u2 = int(group[j])
                c, p = pearson_correlation(u1, u2)
                p = p * min(c,200) / 200
                similarity.append(p)
        similarities.append([len(group), gid, sum(similarity)/len(similarity)])

  0%|          | 0/7 [00:00<?, ?it/s]

In [10]:
pd_similarity = pd.DataFrame(similarities, columns=['group_size', 'gid', 'similarity'])
pd_similarity

Unnamed: 0,group_size,gid,similarity
0,2,0,0.723247
1,2,1,0.651373
2,2,2,0.389279
3,2,3,0.257957
4,2,4,0.070724
...,...,...,...
1629,8,114,0.299821
1630,8,115,0.492225
1631,8,116,0.388159
1632,8,117,0.298092


In [11]:
s = pd_similarity[pd_similarity['group_size']==8]['similarity']
sum(s)/len(s)

0.2792671677413597

In [12]:
pd_similarity.to_pickle("similarity.pkl")

In [17]:
pd_similarity[(pd_similarity['group_size']==2) & (pd_similarity['gid']==0)]['similarity'].values[0]

0.7232472734701212

In [13]:
top_n = './50'
p = pd.read_pickle(os.path.join(top_n, 'r_p.pkl'))

In [24]:
# Choose lambda
opt_lambdas = []
for g_size in range(2,9):
    target_groups = p[p['group_size']==g_size]
    for gid in target_groups['gid'].unique():
        target_group = target_groups[target_groups['gid']==gid]
        p_lambdas = []
        for lambda_ in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
            p_lambda = target_group[target_group['lambda']==lambda_]
            p_lambdas.append(sum(p_lambda['values'].values)/len(p_lambda['values'].values))
        opt_lambda = p_lambdas.index(max(p_lambdas))/10
        similarity = pd_similarity[(pd_similarity['group_size']==g_size) & (pd_similarity['gid']==gid)]['similarity'].values[0]
        opt_lambdas.append([g_size, target_id, similarity, opt_lambda])

In [25]:
pd_opt_lambda = pd.DataFrame(opt_lambdas, columns=['group_size', 'gid', 'similarity', 'opt_lambda'])
pd_opt_lambda

Unnamed: 0,group_size,gid,similarity,opt_lambda
0,2,118,0.723247,1.0
1,2,118,0.651373,0.6
2,2,118,0.389279,0.5
3,2,118,0.257957,0.5
4,2,118,0.070724,1.0
...,...,...,...,...
1629,8,118,0.299821,0.7
1630,8,118,0.492225,0.3
1631,8,118,0.388159,0.5
1632,8,118,0.298092,0.6


In [27]:
from sklearn.linear_model import LinearRegression

In [34]:
train = pd_opt_lambda[pd_opt_lambda['group_size']==2]
train

Unnamed: 0,group_size,gid,similarity,opt_lambda
0,2,118,0.723247,1.0
1,2,118,0.651373,0.6
2,2,118,0.389279,0.5
3,2,118,0.257957,0.5
4,2,118,0.070724,1.0
...,...,...,...,...
471,2,118,0.110256,0.3
472,2,118,0.044898,0.0
473,2,118,0.541721,1.0
474,2,118,0.289999,0.6


In [58]:
X = train['similarity'].values
y = train['opt_lambda'].values

X_train = np.array(X[:int(len(X)*0.8)])
y_train = np.array(y[:int(len(y)*0.8)])

X_test = np.array(X[int(len(X)*0.8):])
y_test = np.array(y[int(len(y)*0.8):])

X_train = X_train[:, np.newaxis]
X_test = X_test[:, np.newaxis]
len(X_train), len(X_test)

(380, 96)

In [60]:
lm = LinearRegression()
lm.fit(X_train, y_train)

mse = np.mean((lm.predict(X_test) - y_test) ** 2)
print(mse)

0.083809383839115
