In [16]:
import numpy as np
import os
import math
import tqdm
import pandas as pd
from tqdm import tqdm_notebook

dir_ = '../data_0.5_track_20/'
# file_name = 'normalized_filter_track_5_user_100.csv'
file_name = 'normalized_to_rating_filter_track_20_user_100.csv'

In [17]:
train = np.loadtxt(os.path.join(dir_, 'rating_matrix_' + file_name), delimiter=',')
pd_train = pd.read_pickle(os.path.join(dir_, 'train_' + file_name[:-3] + 'pkl'))
pd_test = pd.read_pickle(os.path.join(dir_, 'test_' + file_name[:-3] + 'pkl'))

In [18]:
np.set_printoptions(suppress=True)
print(train)

[[0.   0.   0.25 ... 0.   0.   0.  ]
 [0.   0.   0.   ... 0.   0.   0.  ]
 [0.   0.   0.   ... 0.   0.   0.  ]
 ...
 [0.   0.   0.   ... 0.   0.   0.  ]
 [0.   0.   0.   ... 0.   0.   0.  ]
 [0.   1.25 0.   ... 2.   0.   0.  ]]


In [19]:
pd_train

Unnamed: 0,uid,tid,rating
761964,143,2600,0.25
942432,163,6175,1.75
294180,60,5301,3.75
999306,170,73,1.00
339170,70,4005,3.75
...,...,...,...
383800,73,1428,4.50
280364,57,8573,3.00
206333,42,3530,3.00
663060,126,3806,0.25


In [20]:
pd_test

Unnamed: 0,uid,tid,rating
936327,162,8410,0.75
261254,51,5512,4.75
311557,65,2018,0.25
102036,17,5634,2.50
397867,74,1053,3.00
...,...,...,...
447006,86,644,3.25
1193620,193,8585,4.00
1088496,180,6548,3.75
344501,71,577,1.75


In [21]:
num_users = len(train)
num_tracks = len(train[0])
print(num_users,num_tracks)

211 8941


In [22]:
def pearson_correlation(u1_index, u2_index):
        result = 0.0
        user1_data = train[u1_index]
        user2_data = train[u2_index]

        rx_avg = user_average_rating(user1_data)
        ry_avg = user_average_rating(user2_data)
        sxy = common_items(user1_data, user2_data)

        top_result = 0.0
        bottom_left_result = 0.0
        bottom_right_result = 0.0
        for item in sxy:
            rxs = user1_data[item]
            rys = user2_data[item]

            top_result += (rxs - rx_avg)*(rys - ry_avg)
            bottom_left_result += pow((rxs - rx_avg), 2)
            bottom_right_result += pow((rys - ry_avg), 2)
        bottom_left_result = math.sqrt(bottom_left_result)
        bottom_right_result = math.sqrt(bottom_right_result)
        
        ################################################################
        if (bottom_left_result * bottom_right_result) == 0:
            return -2 # dump the data
        ################################################################
        
        result = top_result/(bottom_left_result * bottom_right_result)
        return result

def user_average_rating(u):
    avg_rating = 0.0
    for i in u:
        avg_rating += i
    avg_rating /= len(u) * 1.0
    return avg_rating

def common_items(u1, u2):
    result = []
    for i in range(0, num_tracks):
        if u1[i] > 0 and u2[i] > 0:
            result.append(i)
    return result

In [23]:
def knn(uid, k):
    neighbors = []
    result = []
    for user in range(0, num_users):
        if uid == user:
            continue
        upc = pearson_correlation(uid, user)
        neighbors.append([user, upc])
    sorted_neighbors = sorted(neighbors, key=lambda neighbors: (neighbors[1], neighbors[0]), reverse=True)   # - for desc sort

    for i in range(k):
        if i >= len(sorted_neighbors):
            break
        result.append(sorted_neighbors[i])
    return result

In [24]:
def predict_by_knn(uid, tid, k_nearest_neighbors):
        valid_neighbors = check_neighbors_validattion(tid, k_nearest_neighbors)
        ##################################
        if not len(valid_neighbors):
            return -1
        ##################################
        top_result = 0.0
        bottom_result = 0.0
        for neighbor in valid_neighbors:
            neighbor_id = neighbor[0]
            neighbor_similarity = neighbor[1]   # Wi1
            rating = train[neighbor_id][tid] # rating i,item
            top_result += neighbor_similarity * rating
            bottom_result += neighbor_similarity
        result = top_result/bottom_result
        return result

def check_neighbors_validattion(tid, k_nearest_neighbors):
    result = []
    for neighbor in k_nearest_neighbors:
        neighbor_id = neighbor[0]
        if train[neighbor_id][tid] > 0:
            result.append(neighbor)
    return result


In [25]:
def predict(uid, tid, k):
    
    if len(knn_list[uid]) == 0:
        k_nearest_neighbors = knn(uid, k)
        knn_list[uid] = k_nearest_neighbors
    else:
        k_nearest_neighbors = knn_list[uid]

    prediction = predict_by_knn(uid, tid, k_nearest_neighbors)
    return prediction

In [26]:
# test data

# test = [
#     [5,5,0,0],
#     [4,5,1,5],
#     [5,4,2,3],
#     [1,5,0,3],
#     [2,3,5,4]
# ]

# num_users = 5
# num_tracks = 4

# temp = train 
# train = test

In [27]:
knn_list = []

for i in range(0, num_users):
    knn_list.append([])


In [28]:
k = 30
pd_prediction = pd.DataFrame(columns=['uid','tid','rating'])

for i in tqdm.tqdm_notebook(range(num_users)):
    
    user = pd_train[pd_train['uid']==i]
    target_user = i
    
    for j in range(num_tracks):
        r = user[user['tid']==j]['rating']
        if len(r) == 0:
            target_item = j
            prediction = predict(target_user, target_item, k)
        else:
            prediction = -1
        if not prediction == -1:
            pd_prediction = pd_prediction.append({'uid': i, 'tid': j, 'rating': prediction}, ignore_index=True)

HBox(children=(IntProgress(value=0, max=211), HTML(value='')))




In [29]:
pd_prediction

Unnamed: 0,uid,tid,rating
0,0.0,1.0,2.592018
1,0.0,3.0,2.797907
2,0.0,4.0,1.595805
3,0.0,6.0,4.129279
4,0.0,8.0,4.320885
...,...,...,...
1153553,210.0,8935.0,2.111489
1153554,210.0,8936.0,3.422536
1153555,210.0,8937.0,3.500000
1153556,210.0,8939.0,2.750000


In [30]:
pd_prediction.to_pickle(os.path.join(dir_, 'prediction_cf_user_top_N_' + file_name[:-3] + 'pkl'))

In [31]:
k = knn_list

In [32]:
knn_list

[[[207, 0.9371454848347652],
  [124, 0.8657761283643329],
  [117, 0.8616512214636592],
  [41, 0.8552868863849112],
  [167, 0.8425595301631793],
  [53, 0.83823329998654],
  [5, 0.8348340834584046],
  [208, 0.8338066282594614],
  [95, 0.8334671042815553],
  [83, 0.8242776699171392],
  [116, 0.8012259362087059],
  [154, 0.8005785946667995],
  [80, 0.7943085847795143],
  [140, 0.7850020803481467],
  [69, 0.7781294372020058],
  [46, 0.771481616235049],
  [136, 0.7684105210214018],
  [85, 0.7682540943235973],
  [121, 0.7630848464813429],
  [139, 0.7623295363719298],
  [132, 0.7594500201532621],
  [9, 0.7582417553669101],
  [177, 0.7546744545321581],
  [16, 0.7461124302192055],
  [22, 0.7417266264918628],
  [152, 0.7410275304391406],
  [111, 0.740384473828982],
  [160, 0.7377604779028485],
  [63, 0.7371171317331761],
  [206, 0.7346606893877295]],
 [[160, 1.0],
  [95, 1.0],
  [92, 1.0],
  [90, 1.0],
  [85, 1.0],
  [62, 1.0],
  [61, 1.0],
  [57, 1.0],
  [53, 1.0],
  [35, 1.0],
  [27, 1.0],
  [2

In [33]:
pd_knn = pd.DataFrame(columns=['uid','knn_uid','similarity'])
n=0
for i in knn_list:
    for j in i:
        pd_knn = pd_knn.append({'uid': n, 'knn_uid': j[0], 'similarity': j[1]}, ignore_index=True)
    n += 1

In [34]:
pd_knn.to_pickle(os.path.join(dir_, 'knn_cf_user_top_N_' + file_name[:-3] + 'pkl'))