In [1]:
import math
import os

import numpy as np
import pandas as pd
import tqdm
from tqdm import tqdm_notebook
from tqdm.notebook import tqdm as notebook_tqdm

# Directory that holds the input files and receives the output pickles.
dir_ = '../data/'
# Base file name; the rating-matrix, train/test and output file names are all derived from it.
# file_name = 'normalized_filter_track_5_user_100.csv'
file_name = 'normalized_to_rating_filter_track_5_user_100.csv'

In [2]:
# Load the comma-separated rating matrix and the pickled train / test tables.
# The pickle names reuse file_name with its trailing 'csv' swapped for 'pkl'.
train = np.loadtxt(os.path.join(dir_, f'rating_matrix_{file_name}'), delimiter=',')
pd_train = pd.read_pickle(os.path.join(dir_, f'train_{file_name[:-3]}pkl'))
pd_test = pd.read_pickle(os.path.join(dir_, f'test_{file_name[:-3]}pkl'))

In [3]:
# Turn off scientific notation so the ratings print as plain decimals.
np.set_printoptions(suppress=True)
print(train)

[[5.   3.75 4.   ... 0.   0.   0.  ]
 [0.   0.   0.   ... 0.   0.   0.  ]
 [0.   0.   0.   ... 0.   0.   0.  ]
 ...
 [0.   0.   0.   ... 0.   0.   0.  ]
 [0.   0.   0.   ... 0.   0.   0.  ]
 [0.   0.   0.   ... 0.   0.   0.  ]]


In [4]:
# Preview the training table loaded from the train_* pickle.
pd_train

Unnamed: 0,uid,tid,rating,count
3637403,785,25185,0.25,1
2592334,587,58745,2.25,2
2041631,454,36778,4.50,3
4086920,874,12266,0.25,1
1919709,423,3561,0.25,1
...,...,...,...,...
1876585,411,71785,0.25,1
4041534,864,6379,0.25,1
3792906,816,17437,0.25,1
3106186,682,39513,0.25,1


In [5]:
# Preview the test table loaded from the test_* pickle.
pd_test

Unnamed: 0,uid,tid,rating,count
566162,128,8212,0.25,1
3642787,786,10705,2.00,2
3009228,665,765,3.00,3
2012160,447,108195,4.75,13
2773009,638,1838,2.75,2
...,...,...,...,...
1388552,311,25542,2.25,2
67785,15,32275,5.00,14
2740587,628,47032,4.25,2
4245781,914,23502,4.25,25


In [6]:
# Rows of the rating matrix are counted as users, columns as tracks.
num_users, num_tracks = train.shape
print(num_users, num_tracks)

953 157567


In [7]:
# Per-track popularity table. The helper column `u` is 1 on every training row,
# so after the group-by sum `u` holds the number of training rows per track
# (the other summed columns are carried along only for display).
# `assign` adds the helper column to a copy, leaving pd_train itself unmodified.
pop_u = (
    pd_train
    .assign(u=1)
    .groupby(["tid"])
    .sum()
    .reset_index()
    # tid is unique after the group-by, so this single sort fully determines
    # the row order; a preceding sort by `u` would be overridden anyway.
    .sort_values(by=['tid'], ascending=False)
)
pop_u

Unnamed: 0,tid,uid,rating,count,u
157554,157566,2692,9.00,17,3
157553,157565,3488,15.25,17,4
157552,157564,2684,0.75,3,3
157551,157563,4337,10.75,17,5
157550,157562,3167,7.50,14,4
...,...,...,...,...,...
4,4,5860,21.00,39,12
3,3,4935,21.75,40,12
2,2,4425,5.75,15,8
1,1,23854,97.25,187,53


In [8]:
# Collapse the popularity table into a {tid: u} lookup used by the similarity
# and prediction functions. The isinstance guard keeps the cell re-runnable:
# once pop_u is already the dict, indexing it with 'tid' would raise KeyError.
if not isinstance(pop_u, dict):
    pop_u = dict(zip(pop_u['tid'].values, pop_u['u'].values))

In [22]:
def jaccard(u1_index, u2_index):
    """Popularity-weighted Jaccard-style similarity between two users.

    Every track carries the weight ``1 / pop_u[track]``. The function averages
    those weights over the tracks both users rated (> 0) and over the tracks
    at least one of them rated, and returns the ratio of the two averages.

    Reads the module-level ``train`` matrix, ``pop_u`` lookup and
    ``num_tracks``.

    Args:
        u1_index: Row index of the first user in ``train``.
        u2_index: Row index of the second user in ``train``.

    Returns:
        ``mean(shared weights) / mean(union weights)``. The numerator is 0 when
        the users share no track. Returns ``-2`` when the union average is 0,
        which happens when neither user has any rated track.
    """
    # Boolean masks replace a Python-level loop over every track column.
    user1_rated = np.asarray(train[u1_index][:num_tracks]) > 0
    user2_rated = np.asarray(train[u2_index][:num_tracks]) > 0

    # Ascending track indices, as plain ints, so the dict lookups and the
    # summation order match a straightforward left-to-right scan.
    shared_tracks = np.flatnonzero(user1_rated & user2_rated).tolist()
    union_tracks = np.flatnonzero(user1_rated | user2_rated).tolist()

    top_weights = [1 / pop_u[i] for i in shared_tracks]
    bottom_weights = [1 / pop_u[i] for i in union_tracks]

    top = sum(top_weights) / len(top_weights) if top_weights else 0
    bottom = sum(bottom_weights) / len(bottom_weights) if bottom_weights else 0

    if bottom == 0:
        return -2  # dump the data

    return top / bottom

In [16]:
def knn(uid, k):
    """Return up to ``k`` users most similar to ``uid``.

    Every other user index in ``range(num_users)`` is scored with ``jaccard``.
    Candidates are ordered by similarity, highest first; equal similarities
    are ordered by the larger user index first.

    Args:
        uid: Index of the target user (never included in its own result).
        k: Maximum number of neighbors to return.

    Returns:
        A list of at most ``k`` ``[user_index, similarity]`` pairs.
    """
    candidates = [
        [other, jaccard(uid, other)]
        for other in range(num_users)
        if not (uid == other)
    ]
    candidates.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    # Clamp at zero so a negative k yields an empty list, not a tail slice.
    return candidates[:max(k, 0)]

In [17]:
def predict_by_knn(uid, tid, k_nearest_neighbors):
    """Score track ``tid`` from the neighbors that rated it.

    Only neighbors with a positive rating for ``tid`` take part. Each one
    casts a binary vote weighted by its similarity; the weighted vote share is
    then divided by the track's popularity ``pop_u[tid]``.

    Reads the module-level ``train`` matrix and ``pop_u`` lookup.

    Args:
        uid: Index of the target user (not used in the computation).
        tid: Column index of the track in ``train`` / key into ``pop_u``.
        k_nearest_neighbors: Sequence of ``[user_index, similarity]`` pairs.

    Returns:
        The popularity-scaled score, or ``-1`` when no prediction can be made
        (no neighbor rated the track, or the similarities sum to zero).
    """
    valid_neighbors = check_neighbors_validattion(tid, k_nearest_neighbors)
    if not len(valid_neighbors):
        return -1

    weighted_votes = 0.0
    total_similarity = 0.0
    for neighbor in valid_neighbors:
        neighbor_id = neighbor[0]
        neighbor_similarity = neighbor[1]
        rating = train[neighbor_id][tid]
        if rating > 0:
            rating = 1  # any positive rating counts as a single vote
        weighted_votes += neighbor_similarity * rating
        total_similarity += neighbor_similarity

    # A zero similarity sum cannot be normalised; report "no prediction"
    # instead of raising ZeroDivisionError.
    if total_similarity == 0:
        return -1

    # Scale by the popularity of the predicted track itself. Indexing with
    # `tid` avoids picking up an unrelated module-level loop variable.
    return weighted_votes / total_similarity / pop_u[tid]

def check_neighbors_validattion(tid, k_nearest_neighbors):
    """Keep only the neighbors that have a positive rating for track ``tid``.

    Args:
        tid: Column index of the track in the module-level ``train`` matrix.
        k_nearest_neighbors: Iterable of neighbor entries whose first element
            is the neighbor's row index in ``train``.

    Returns:
        A new list with the qualifying entries, in their original order.
    """
    return [
        candidate
        for candidate in k_nearest_neighbors
        if train[candidate[0]][tid] > 0
    ]


In [18]:
def predict(uid, tid, k):
    """Predict a score for ``(uid, tid)``, caching the user's neighbors.

    The neighbor list for ``uid`` is looked up in the module-level
    ``knn_list``; when that slot is empty it is computed with ``knn`` and
    stored back before the prediction is made.

    Args:
        uid: Index of the target user.
        tid: Index of the target track.
        k: Number of neighbors requested when the cache slot is empty.

    Returns:
        Whatever ``predict_by_knn`` returns for the cached neighbor list.
    """
    neighbors = knn_list[uid]
    if len(neighbors) == 0:
        neighbors = knn(uid, k)
        knn_list[uid] = neighbors
    return predict_by_knn(uid, tid, neighbors)

In [19]:
# Per-user neighbor cache: one empty list per user, filled on first use by predict().
knn_list = []

# NOTE: the loop variable `i` stays bound at module level after this cell runs.
for i in range(0, num_users):
    knn_list.append([])

In [None]:
# Score every (user, track) pair the user has not rated in the training table
# and keep the pairs for which predict() returns something other than the -1
# "no prediction" sentinel.
k = 50
prediction_records = []

for u in notebook_tqdm(range(num_users)):
    # Track ids this user already rated. A set makes the per-track membership
    # check O(1) instead of filtering the DataFrame once per track.
    rated_tids = set(pd_train.loc[pd_train['uid'] == u, 'tid'])

    for j in range(num_tracks):
        if j in rated_tids:
            continue
        prediction = predict(u, j, k)
        if prediction != -1:
            # Record the user being scored (`u`), not an unrelated global index.
            prediction_records.append({'uid': u, 'tid': j, 'rating': prediction})

# Build the frame once: row-by-row DataFrame.append is quadratic and was
# removed in pandas 2.0.
pd_prediction = pd.DataFrame(prediction_records, columns=['uid', 'tid', 'rating'])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


  0%|          | 0/953 [00:00<?, ?it/s]

In [None]:
# Preview the collected predictions.
pd_prediction

In [None]:
# Persist the predictions under dir_, reusing file_name with a 'pkl' suffix.
pd_prediction.to_pickle(os.path.join(dir_, f'prediction_utsp1_top_N_{file_name[:-3]}pkl'))

In [None]:
# NOTE(review): this rebinds `k` (the neighbor count, 50, in the prediction cell)
# to the neighbor cache itself, so `k` is no longer an integer after this cell runs.
k = knn_list

In [None]:
# Inspect the cached neighbor lists (one entry per user).
knn_list

In [None]:
# Flatten the per-user neighbor cache into one row per (user, neighbor) pair.
# Rows are collected first and the frame is built once: row-by-row
# DataFrame.append is quadratic and was removed in pandas 2.0.
knn_records = [
    {'uid': n, 'knn_uid': neighbor[0], 'similarity': neighbor[1]}
    for n, user_neighbors in enumerate(knn_list)
    for neighbor in user_neighbors
]
pd_knn = pd.DataFrame(knn_records, columns=['uid', 'knn_uid', 'similarity'])

In [None]:
# Persist the flattened neighbor table under dir_, reusing file_name with a 'pkl' suffix.
pd_knn.to_pickle(os.path.join(dir_, f'knn_utsp1_top_N_{file_name[:-3]}pkl'))