In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [2]:
from scipy import sparse
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, Dataset, DataLoader
from MF import MF
import random
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import pandas as pd
import math

In [3]:
matrix = sparse.load_npz('rating_matrix.npz')
train_matrix = (matrix > 3)

In [4]:
class dataset(Dataset):
    def __init__(self, mat):
        mat = mat.tocoo()
        self.tuples = np.array([mat.row, mat.col]).T
        mat = mat.tocsr()
        self.pos_items = []
        for i in range(mat.shape[0]):
            self.pos_items.append(mat[i].indices)
        self.num_items = mat.shape[1]
    
    def __len__(self):
        return len(self.tuples)
    
    def __getitem__(self, index):
        u, i = self.tuples[index]
        j = np.random.randint(self.num_items)
        while j in self.pos_items[u]:
            j = np.random.randint(self.num_items)
        return u, i, j

In [5]:
train_dataset = dataset(train_matrix)
train_dataloader = DataLoader(train_dataset, batch_size=256, shuffle=True, num_workers=8)

In [4]:
(num_users, num_books) = train_matrix.shape
print(num_users, num_books)
num_factors = 1024
model = MF(num_users, num_books, n_factors=num_factors).cuda()

53424 10000


In [7]:
model.load_state_dict(torch.load(f'BPR_model_{num_factors}_100.pth'))

<All keys matched successfully>

In [8]:
num_epochs = 50
loss_func = nn.LogSigmoid()
lr = 1e-2
optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=1e-5)
for epoch in range(num_epochs):
    model.train()
    losses = []
    iterator = tqdm(train_dataloader)
    for x in iterator:
        optimizer.zero_grad()
        user, pos, neg = x
        user = user.cuda().long()
        pos = pos.cuda().long()
        neg = neg.cuda().long()

        loss = -loss_func(model(user, pos, neg)).sum()
        losses += [loss.item()]
        loss.backward()
        optimizer.step()
        iterator.set_postfix_str('loss={:^7.6f}'.format(loss))
    avg_loss = np.mean(losses)
    print(avg_loss)
    torch.save(model.state_dict(), f'BPR_model_{num_factors}_150.pth')

100%|██████████| 16102/16102 [02:28<00:00, 108.26it/s, loss=8.756596] 


8.440561296214023


100%|██████████| 16102/16102 [02:29<00:00, 108.03it/s, loss=7.868371] 


8.29432699169791


100%|██████████| 16102/16102 [02:28<00:00, 108.54it/s, loss=5.617651] 


8.146777162328593


100%|██████████| 16102/16102 [02:27<00:00, 109.28it/s, loss=5.456594] 


8.045520806146726


100%|██████████| 16102/16102 [02:27<00:00, 108.86it/s, loss=8.449814] 


7.912617197097155


100%|██████████| 16102/16102 [02:39<00:00, 101.21it/s, loss=6.338704] 


7.824380004365997


100%|██████████| 16102/16102 [02:28<00:00, 108.25it/s, loss=8.823460] 


7.764679207015283


100%|██████████| 16102/16102 [02:28<00:00, 108.29it/s, loss=8.780653] 


7.625616282857645


100%|██████████| 16102/16102 [02:27<00:00, 108.90it/s, loss=8.517543] 


7.5447602927381245


100%|██████████| 16102/16102 [02:28<00:00, 108.39it/s, loss=5.388515] 


7.462567429530104


100%|██████████| 16102/16102 [02:28<00:00, 108.55it/s, loss=6.618046] 


7.3776065482662085


100%|██████████| 16102/16102 [02:28<00:00, 108.23it/s, loss=11.584841]


7.302801439546233


100%|██████████| 16102/16102 [02:28<00:00, 108.50it/s, loss=6.185818] 


7.20869353267365


100%|██████████| 16102/16102 [02:28<00:00, 108.72it/s, loss=6.035801] 


7.120680938010156


100%|██████████| 16102/16102 [02:39<00:00, 101.19it/s, loss=8.195656] 


7.027724953915462


100%|██████████| 16102/16102 [02:50<00:00, 94.49it/s, loss=8.079040]  


6.9610608343070615


100%|██████████| 16102/16102 [02:51<00:00, 94.10it/s, loss=6.980142]  


6.895409392867824


100%|██████████| 16102/16102 [02:28<00:00, 108.51it/s, loss=5.299489] 


6.812372577692349


100%|██████████| 16102/16102 [02:28<00:00, 108.09it/s, loss=8.165319] 


6.6945948141241


100%|██████████| 16102/16102 [02:28<00:00, 108.28it/s, loss=4.453620] 


6.647986720103149


100%|██████████| 16102/16102 [02:28<00:00, 108.17it/s, loss=9.882299] 


6.590878767799019


100%|██████████| 16102/16102 [02:28<00:00, 108.53it/s, loss=5.276668] 


6.516164936222448


100%|██████████| 16102/16102 [02:28<00:00, 108.72it/s, loss=4.585057] 


6.431759883159288


100%|██████████| 16102/16102 [02:28<00:00, 108.55it/s, loss=8.556438] 


6.3458307199471635


100%|██████████| 16102/16102 [02:27<00:00, 108.94it/s, loss=5.896319] 


6.302593303324939


100%|██████████| 16102/16102 [02:32<00:00, 105.35it/s, loss=4.171910] 


6.245694592826456


100%|██████████| 16102/16102 [02:50<00:00, 94.63it/s, loss=5.204963]  


6.149800194692736


100%|██████████| 16102/16102 [02:50<00:00, 94.43it/s, loss=5.398619]  


6.135352731913369


100%|██████████| 16102/16102 [02:50<00:00, 94.38it/s, loss=5.232207]  


6.014793191181475


100%|██████████| 16102/16102 [02:51<00:00, 93.95it/s, loss=5.723948]  


5.976185014685878


100%|██████████| 16102/16102 [02:49<00:00, 94.73it/s, loss=7.555933]  


5.928902546858406


100%|██████████| 16102/16102 [02:50<00:00, 94.61it/s, loss=6.592320]  


5.872322979676414


100%|██████████| 16102/16102 [02:49<00:00, 95.03it/s, loss=3.227235]  


5.779622205428168


100%|██████████| 16102/16102 [02:51<00:00, 94.13it/s, loss=4.562657]  


5.7532725927030475


100%|██████████| 16102/16102 [02:50<00:00, 94.56it/s, loss=7.860871]  


5.695649530203235


100%|██████████| 16102/16102 [02:51<00:00, 93.65it/s, loss=7.177601] 


5.624919224756725


100%|██████████| 16102/16102 [02:50<00:00, 94.56it/s, loss=3.984082] 


5.565980194481452


100%|██████████| 16102/16102 [02:50<00:00, 94.20it/s, loss=8.695886]  


5.526837580064234


100%|██████████| 16102/16102 [02:50<00:00, 94.47it/s, loss=5.263221]  


5.461366740143413


100%|██████████| 16102/16102 [02:49<00:00, 94.96it/s, loss=4.474452]  


5.404322966842559


100%|██████████| 16102/16102 [02:49<00:00, 95.15it/s, loss=5.375572]  


5.366068496480223


100%|██████████| 16102/16102 [02:49<00:00, 94.87it/s, loss=4.870200]  


5.310795578730357


100%|██████████| 16102/16102 [02:49<00:00, 95.01it/s, loss=4.716157]  


5.271576398071362


100%|██████████| 16102/16102 [02:49<00:00, 95.00it/s, loss=7.485892] 


5.19836072825657


100%|██████████| 16102/16102 [02:49<00:00, 95.10it/s, loss=2.838324]  


5.155066901616407


100%|██████████| 16102/16102 [02:49<00:00, 95.05it/s, loss=4.979881]  


5.133332491239366


100%|██████████| 16102/16102 [02:49<00:00, 94.99it/s, loss=4.371506]  


5.062786182436554


100%|██████████| 16102/16102 [02:49<00:00, 94.88it/s, loss=3.695621]  


5.02133317187534


100%|██████████| 16102/16102 [02:49<00:00, 94.92it/s, loss=8.520086]  


4.981657166241741


100%|██████████| 16102/16102 [02:50<00:00, 94.44it/s, loss=4.487764]  


4.948647271215716


In [5]:
model.load_state_dict(torch.load(f'BPR_model_{num_factors}_150.pth'))
predict_mat = (model.user_mat @ model.item_mat.T).cpu().detach().numpy()

In [10]:
import csv

ground_truth = [[] for _ in range(num_users)]
with open('to_read.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for data in reader:
        ground_truth[int(data['user_id']) - 1].append(int(data['book_id']) - 1)

In [26]:
mAP = 0
mP = 0
count = 0
for i in tqdm(range(num_users)):
    ground_len = len(ground_truth[i])
    if ground_len == 0:
        continue
    else:
        ranking = (-predict_mat[i]).argsort()
        top_count = 0
        top = []
        for j in range(len(ranking)):
            if ranking[j] not in train_matrix[i].indices:
                top.append(ranking[j])
                top_count += 1
                if top_count == 10:
                    break
        count = count + 1
        ap = 0
        p = 0
        correct_count = 0
        for j in range(len(top)):
            if top[j] in ground_truth[i]:
                correct_count = correct_count + 1
                ap = ap + correct_count / (j + 1)
                p += 1
        if correct_count != 0:
            ap = ap / ground_len
            p = p / ground_len
        else:
            ap = 0
            p = 0
        mAP = mAP + ap
        mP = mP + p
    
mAP = mAP / count
precision_at_10 = mP / count
print("mAP:", mAP, 'precision@10: ', precision_at_10) 

100%|██████████| 53424/53424 [02:33<00:00, 348.92it/s]

mAP: 0.013090075600550121 mP:  0.0356092805398107





In [17]:
from sklearn.metrics.pairwise import cosine_similarity

new_user = np.zeros(num_books)
book_list = [3, 5, 7]
new_user[book_list] = 5
new_user = new_user.reshape(1, -1)
sim = np.array(cosine_similarity(new_user, matrix)).flatten()

In [16]:
print((-sim).argsort()[:20])

[37704  8078 14297 23000 32457 19627  7548  6428 44388  1558 15022  7247
  8392 14751 18266 33076 47564  3908 17039  8659]


In [54]:
new_user = np.zeros(num_books)
book_list = [41]
book_list = [i - 1 for i in book_list]
new_user[book_list] = 5
new_user = new_user.reshape(1, -1)
sim = np.array(cosine_similarity(new_user, matrix)).flatten()

In [55]:
top_users = (-sim).argsort()[:20]
print(sim[top_users])

[0.30372837 0.30372837 0.23649188 0.23038784 0.21229796 0.20725667
 0.19360077 0.19312182 0.18752289 0.18738292 0.18355577 0.1814885
 0.17799766 0.17655614 0.1763364  0.17418325 0.17365718 0.17313584
 0.17220915 0.1721071 ]


In [56]:
pred = predict_mat[top_users].sum(axis=0)
rank = (-pred).argsort()[:20]
rank = [i + 1 for i in rank]
print(rank)

[41, 2, 159, 151, 214, 175, 1, 163, 18, 25, 21, 24, 23, 294, 27, 12, 17, 428, 370, 20]
