In [3]:
import math
import os

import numpy as np
import pandas as pd
import tqdm
from tqdm import tqdm_notebook
from tqdm.notebook import tqdm as notebook_tqdm

# Directory, relative to the notebook, that the input files are read from and
# the result pickles are written to.
dir_ = '../data/'
# Dataset file name. Other cells prepend a prefix ('rating_matrix_', 'train_',
# 'test_', ...) and swap the last three characters for 'pkl' to address the
# pickled variants of the same dataset.
# file_name = 'normalized_filter_track_5_user_100.csv'
file_name = 'normalized_to_rating_filter_track_5_user_100.csv'

In [4]:
# Dense rating matrix stored as comma-separated text.
train = np.loadtxt(
    os.path.join(dir_, 'rating_matrix_' + file_name),
    delimiter=',',
)
# Train / test interaction tables. Pickle files run arbitrary code when
# loaded, so only read pickles produced by this project.
pd_train, pd_test = (
    pd.read_pickle(os.path.join(dir_, split_prefix + file_name[:-3] + 'pkl'))
    for split_prefix in ('train_', 'test_')
)

In [5]:
# suppress=True makes numpy print floats in fixed-point notation instead of
# scientific notation, which keeps the matrix preview readable.
np.set_printoptions(suppress=True)
print(train)

[[5.   3.75 4.   ... 0.   0.   0.  ]
 [0.   0.   0.   ... 0.   0.   0.  ]
 [0.   0.   0.   ... 0.   0.   0.  ]
 ...
 [0.   0.   0.   ... 0.   0.   0.  ]
 [0.   0.   0.   ... 0.   0.   0.  ]
 [0.   0.   0.   ... 0.   0.   0.  ]]


In [6]:
# Preview of the training interactions (columns: uid, tid, rating, count).
pd_train

Unnamed: 0,uid,tid,rating,count
3637403,785,25185,0.25,1
2592334,587,58745,2.25,2
2041631,454,36778,4.50,3
4086920,874,12266,0.25,1
1919709,423,3561,0.25,1
...,...,...,...,...
1876585,411,71785,0.25,1
4041534,864,6379,0.25,1
3792906,816,17437,0.25,1
3106186,682,39513,0.25,1


In [7]:
# Preview of the held-out test interactions (columns: uid, tid, rating, count).
pd_test

Unnamed: 0,uid,tid,rating,count
566162,128,8212,0.25,1
3642787,786,10705,2.00,2
3009228,665,765,3.00,3
2012160,447,108195,4.75,13
2773009,638,1838,2.75,2
...,...,...,...,...
1388552,311,25542,2.25,2
67785,15,32275,5.00,14
2740587,628,47032,4.25,2
4245781,914,23502,4.25,25


In [8]:
# Matrix dimensions: rows are indexed by user, columns by track.
num_users, num_tracks = len(train), len(train[0])
print(num_users, num_tracks)

953 157567


In [9]:
def pearson_correlation(u1_index, u2_index):
    """Pearson similarity between two users, restricted to their common items.

    Parameters
    ----------
    u1_index, u2_index :
        Row indices into the module-level ``train`` rating matrix.

    Returns
    -------
    The correlation of the two users' mean-centred ratings, summed over the
    items returned by ``common_items``. Each user's mean comes from
    ``user_average_rating`` applied to that user's full row.
    Returns the sentinel ``-2`` when the denominator is zero (for example
    when the users share no items), meaning "no usable similarity".
    """
    row_a = train[u1_index]
    row_b = train[u2_index]

    mean_a = user_average_rating(row_a)
    mean_b = user_average_rating(row_b)

    covariance = 0.0
    sq_dev_a = 0.0
    sq_dev_b = 0.0
    for idx in common_items(row_a, row_b):
        dev_a = row_a[idx] - mean_a
        dev_b = row_b[idx] - mean_b
        covariance += dev_a * dev_b
        sq_dev_a += dev_a ** 2
        sq_dev_b += dev_b ** 2

    denominator = math.sqrt(sq_dev_a) * math.sqrt(sq_dev_b)
    if denominator == 0:
        # Undefined correlation: signal the caller to discard this pair.
        return -2
    return covariance / denominator

def user_average_rating(u):
    """Return the mean of the ratings a user has actually given.

    Throughout this notebook a value of 0 in a rating vector means "not
    rated" (rated entries are selected with ``> 0``). Averaging over the
    whole vector would therefore pull the mean towards 0 in proportion to
    how sparse the row is, so only the strictly positive entries are
    averaged.

    Parameters
    ----------
    u :
        One user's rating vector (sequence or 1-D array of numbers).

    Returns
    -------
    float
        Mean of the entries greater than 0, or ``0.0`` when the vector is
        empty or contains no rated entry.
    """
    ratings = np.asarray(u, dtype=float)
    rated = ratings[ratings > 0]
    if rated.size == 0:
        # No ratings at all: avoid a division by zero and report a neutral mean.
        return 0.0
    return float(rated.mean())

def common_items(u1, u2):
    """Return the indices of the items that both users have rated.

    An item counts as rated when its entry is strictly greater than 0.
    The comparison runs over the vectors themselves rather than over a
    module-level item count, and is vectorised with numpy instead of a
    Python-level loop over every item.

    Parameters
    ----------
    u1, u2 :
        Rating vectors (sequences or 1-D arrays). If their lengths differ,
        only the positions present in both are compared.

    Returns
    -------
    list of int
        Ascending indices ``i`` with ``u1[i] > 0 and u2[i] > 0``.
    """
    ratings_1 = np.asarray(u1)
    ratings_2 = np.asarray(u2)
    length = min(ratings_1.shape[0], ratings_2.shape[0])
    both_rated = (ratings_1[:length] > 0) & (ratings_2[:length] > 0)
    return np.flatnonzero(both_rated).tolist()

In [10]:
def knn(uid, k):
    """Return the ``k`` users most similar to ``uid``.

    Every other user index in ``range(num_users)`` is scored with
    ``pearson_correlation``. That function returns ``-2`` as a "dump the
    data" sentinel when no similarity can be computed. Such pairs are
    dropped here so they cannot be returned as neighbours when fewer than
    ``k`` real candidates exist.

    Parameters
    ----------
    uid :
        Index of the target user.
    k :
        Maximum number of neighbours to return; values <= 0 yield ``[]``.

    Returns
    -------
    list of ``[user_index, similarity]`` pairs, ordered by similarity
    descending, ties broken by the larger user index first. The list may
    hold fewer than ``k`` entries.
    """
    candidates = []
    for user in range(num_users):
        if user == uid:
            continue
        similarity = pearson_correlation(uid, user)
        if similarity == -2:
            # Sentinel for "similarity undefined": not a real neighbour.
            continue
        candidates.append([user, similarity])

    # reverse=True sorts descending on (similarity, user index).
    candidates.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return candidates[:max(k, 0)]

In [11]:
def predict_by_knn(uid, tid, k_nearest_neighbors):
    """Predict a rating for item ``tid`` as a similarity-weighted average.

    Only neighbours that rated ``tid`` (as selected by
    ``check_neighbors_validattion``) and whose similarity is strictly
    positive contribute. Skipping non-positive weights has three effects:
    it ignores the ``-2`` sentinel that ``pearson_correlation`` returns for
    undefined similarities, it keeps the denominator strictly positive (no
    division by zero), and it keeps the prediction inside the range of the
    contributing neighbours' ratings.

    Parameters
    ----------
    uid :
        Target user index. Unused here; kept for interface compatibility.
    tid :
        Column index of the item in ``train``.
    k_nearest_neighbors :
        Iterable of ``[user_index, similarity]`` pairs.

    Returns
    -------
    The weighted average rating, or ``-1`` when no neighbour can contribute.
    """
    weighted_sum = 0.0
    weight_total = 0.0
    for neighbor in check_neighbors_validattion(tid, k_nearest_neighbors):
        neighbor_id = neighbor[0]
        similarity = neighbor[1]
        if similarity <= 0:
            continue
        weighted_sum += similarity * train[neighbor_id][tid]
        weight_total += similarity

    if weight_total == 0:
        # No usable neighbour: same "no prediction" marker as before.
        return -1
    return weighted_sum / weight_total

def check_neighbors_validattion(tid, k_nearest_neighbors):
    """Keep only the neighbours that have rated item ``tid``.

    A neighbour entry is a pair whose first element is the neighbour's row
    index in ``train``. It is kept when ``train[row][tid]`` is greater than
    0. The input order is preserved.
    """
    return [entry for entry in k_nearest_neighbors if train[entry[0]][tid] > 0]


In [12]:
def predict(uid, tid, k):
    """Predict user ``uid``'s rating for item ``tid`` from ``k`` neighbours.

    The neighbour list is memoised per user in the module-level
    ``knn_list``. An empty slot means "not computed yet", so ``knn`` runs
    and its result is stored before ``predict_by_knn`` is called.
    """
    cached_neighbors = knn_list[uid]
    if len(cached_neighbors) == 0:
        cached_neighbors = knn(uid, k)
        knn_list[uid] = cached_neighbors
    return predict_by_knn(uid, tid, cached_neighbors)

In [26]:
# test data

# test = [
#     [5,5,0,0],
#     [4,5,1,5],
#     [5,4,2,3],
#     [1,5,0,3],
#     [2,3,5,4]
# ]

# num_users = 5
# num_tracks = 4

# temp = train 
# train = test

In [15]:
# Per-user neighbour cache: one independent, initially empty list per user.
# predict() fills a slot the first time that user is requested.
knn_list = [[] for _ in range(num_users)]


In [None]:
# Number of neighbours used for every prediction.
k = 50

# Rows are collected in a plain list and turned into a DataFrame once at the
# end. Growing a DataFrame row by row re-copies it on every insert, and
# DataFrame.append no longer exists in pandas 2.x.
prediction_records = []

for target_user in notebook_tqdm(range(num_users)):
    # Tracks this user already rated in the training split. A set gives O(1)
    # membership tests instead of one boolean DataFrame filter per track.
    rated_tids = set(pd_train.loc[pd_train['uid'] == target_user, 'tid'].tolist())

    for target_item in range(num_tracks):
        if target_item in rated_tids:
            # Already rated: nothing to predict.
            continue
        prediction = predict(target_user, target_item, k)
        # -1 is the "no prediction possible" marker returned by predict().
        if prediction != -1:
            prediction_records.append(
                {'uid': target_user, 'tid': target_item, 'rating': prediction}
            )

pd_prediction = pd.DataFrame(prediction_records, columns=['uid', 'tid', 'rating'])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


  0%|          | 0/953 [00:00<?, ?it/s]

In [None]:
# Inspect the predicted ratings (columns: uid, tid, rating).
pd_prediction

In [None]:
# Persist the predictions next to the input data; file_name[:-3] + 'pkl'
# swaps the last three characters of the dataset name for 'pkl'.
pd_prediction.to_pickle(os.path.join(dir_, 'prediction_cf_user_top_N_' + file_name[:-3] + 'pkl'))

In [None]:
# NOTE(review): this rebinds `k`, which the prediction cell set to the
# neighbour count (50), to the neighbour cache itself. No later cell in this
# notebook reads `k` as a number, but the name now refers to a list.
k = knn_list

In [None]:
# Inspect the cached neighbour lists: one list of [user, similarity] pairs per user.
knn_list

In [None]:
# Flatten the per-user neighbour cache into one row per (user, neighbour).
# The position of a list inside knn_list is the user index; each entry is a
# [neighbour_index, similarity] pair. The frame is built in one call because
# appending row by row is quadratic and DataFrame.append no longer exists in
# pandas 2.x.
knn_records = [
    {'uid': uid, 'knn_uid': neighbor[0], 'similarity': neighbor[1]}
    for uid, neighbors in enumerate(knn_list)
    for neighbor in neighbors
]
pd_knn = pd.DataFrame(knn_records, columns=['uid', 'knn_uid', 'similarity'])

In [None]:
# Persist the neighbour table next to the input data, using the dataset name
# with its last three characters replaced by 'pkl'.
pd_knn.to_pickle(os.path.join(dir_, 'knn_cf_user_top_N_' + file_name[:-3] + 'pkl'))