In [2]:
from scipy import sparse
import numpy as np

train_set = sparse.load_npz('train_matrix.npz')
test_set = sparse.load_npz('test_matrix.npz')

user_mean = []
ISBN_mean = []

train_data = np.asarray(train_set.tocoo().data, dtype = np.float64)
train_rows = train_set.tocoo().row  #user
train_cols = train_set.tocoo().col  #book

test_data = np.asarray(test_set.tocoo().data, dtype = np.float64)
test_rows = test_set.tocoo().row  #user
test_cols = test_set.tocoo().col  #book

train_mean = np.mean(train_data)

In [42]:
#calculate users' mean rating
print("start user mean")
for i in range(train_set.shape[0]):
    index = np.where(train_rows == i)
    if len(train_data[index[0]]) == 0:
        user_mean.append(train_mean)
    else:
        user_mean.append(np.mean(train_data[index[0]]))
user_mean = train_set.mean(axis=1)
print("end user mean")

start user mean
end user mean


In [43]:
#calculate books' mean rating
print("start book mean")
for i in range(train_set.shape[1]):
    index = np.where(train_cols == i)
    if len(train_data[index[0]]) == 0:
        ISBN_mean.append(train_mean)
    else:
        ISBN_mean.append(np.mean(train_data[index[0]]))
        train_data[index[0]] -= np.mean(train_data[index[0]])
train_set = sparse.csr_matrix((train_data, (train_rows, train_cols)), shape=train_set.shape)
print("end book mean")

start book mean
end book mean


In [41]:
bias = np.zeros(shape=train_set.shape)
for i in range(train_set.shape[0]):
    bias[i] += user_mean[i]
for j in range(train_set.shape[1]):
    bias[:,j] += ISBN_mean[j]
bias -= train_mean

In [43]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(train_set.transpose())
print(cosine_sim.shape)
print(cosine_sim)

(10000, 10000)
[[ 1.00000000e+00  1.38366720e-01  1.70106397e-01 ...  3.66900853e-03
  -1.05558092e-03  1.56544201e-02]
 [ 1.38366720e-01  1.00000000e+00  7.02224805e-02 ...  5.73536118e-03
   1.30377977e-03  4.65403185e-03]
 [ 1.70106397e-01  7.02224805e-02  1.00000000e+00 ...  9.47986052e-04
  -6.49974202e-04 -2.47729880e-03]
 ...
 [ 3.66900853e-03  5.73536118e-03  9.47986052e-04 ...  1.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-1.05558092e-03  1.30377977e-03 -6.49974202e-04 ...  0.00000000e+00
   1.00000000e+00  0.00000000e+00]
 [ 1.56544201e-02  4.65403185e-03 -2.47729880e-03 ...  0.00000000e+00
   0.00000000e+00  1.00000000e+00]]


In [55]:
ISBN_mean = np.array(ISBN_mean)

In [93]:
from tqdm import tqdm 

In [100]:
print("start predict default")
predict = np.zeros(shape=train_set.shape)
for i in tqdm(range(train_set.shape[1])):
    sim_book = (-cosine_sim[i]).argsort()[1:51]
    pearson_cor_div = 0
    cols = train_set[:, sim_book].toarray()
    cosine_sims = cosine_sim[i, sim_book]
    predict[:, i] = np.dot(cols + ISBN_mean[sim_book] - bias[:, sim_book], cosine_sims) / np.sum(cosine_sims) + bias[:, i]
#     print(predict[:,i])
print("end predict default")

  0%|          | 0/10000 [00:00<?, ?it/s]

start predict default


100%|██████████| 10000/10000 [19:43<00:00,  8.45it/s]

end predict default





In [101]:
np.savez_compressed('pearson_predict.npz', predict)

In [3]:
predict = np.load('pearson_predict.npz')['arr_0']

In [13]:
original_data = sparse.load_npz('rating_matrix.npz')

In [46]:
#evaluate
from tqdm import tqdm 
import math
import csv

print("start evaluate default")
#rmse
mse = 0
for i in tqdm(range(len(test_data))):
    error = (predict[test_rows[i]][test_cols[i]] - test_data[i]) ** 2
    mse += error
rmse = math.sqrt(mse)
print("rmse = ", rmse)
print("average error = ", error / len(test_data))

#map
print("start map")
ground_truth = []
for i in range(train_set.shape[0]):
    ground_truth.append([])
with open('to_read.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for data in reader:
        ground_truth[int(data['user_id']) - 1].append(int(data['book_id']) - 1)
        
mAP = 0
count = 0
for i in tqdm(range(train_set.shape[0])):
    ground_len = len(ground_truth[i])
    if ground_len == 0:
        continue
    else:
        ranking = (-predict[i]).argsort()
        top_count = 0
        top = []
        for j in range(len(ranking)):
            if ranking[j] not in original_data[i].indices:
                top.append(ranking[j])
                top_count += 1
                if top_count == ground_len * 50:
                    break
        count = count + 1
        ap = 0
        correct_count = 0
        for j in range(len(top)):
            if top[j] in ground_truth[i]:
                correct_count = correct_count + 1
                ap = ap + correct_count / (j + 1)
        if correct_count != 0:
            ap = ap / ground_len
        else:
            ap = 0
        mAP = mAP + ap
mAP = mAP / count
print("mAP:", mAP)   

  7%|▋         | 43851/597648 [00:00<00:01, 438440.38it/s]

start evaluate default


100%|██████████| 597648/597648 [00:01<00:00, 486174.45it/s]


rmse =  681.5263687451162
average error =  3.2591557665193204e-08
start map


 12%|█▏        | 6536/53424 [04:48<34:27, 22.68it/s]  


KeyboardInterrupt: 

In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

In [2]:
from scipy import sparse
import numpy as np

train_set = sparse.load_npz('train_matrix.npz')
test_set = sparse.load_npz('test_matrix.npz')

user_mean = []
ISBN_mean = []

train_data = np.asarray(train_set.tocoo().data, dtype = np.float64)
train_rows = train_set.tocoo().row  #user
train_cols = train_set.tocoo().col  #book

test_data = np.asarray(test_set.tocoo().data, dtype = np.float64)
test_rows = test_set.tocoo().row  #user
test_cols = test_set.tocoo().col  #book

train_mean = np.mean(train_data)

In [3]:
#calculate users' mean rating
print("start user mean")
user_mean = np.array(train_set.mean(axis=1)).flatten()
nonzeros = np.diff(train_set.indptr)
user_mean = user_mean * train_set.shape[1] / nonzeros
print(user_mean)
print("end user mean")

start user mean
[3.6        4.46428571 1.76923077 ... 4.25225225 4.44776119 4.40495868]
end user mean


In [4]:
#calculate books' mean rating
print("start book mean")
train_set = train_set.tocsc()
ISBN_mean = np.array(train_set.mean(axis=0)).flatten()
nonzeros = []
for i in range(train_set.shape[1]):
    nonzeros.append(len(train_set[:, i].data))
nonzeros = np.array(nonzeros)
ISBN_mean = ISBN_mean * train_set.shape[0] / nonzeros
print(ISBN_mean)
train_set = train_set.tocsr()
print("end book mean")

start book mean
[4.27734242 4.35082451 3.2184571  ... 4.32978723 3.7008547  4.01      ]
end book mean


In [5]:
bias = np.zeros(shape=train_set.shape)
for i in range(train_set.shape[0]):
    bias[i] += user_mean[i]
for j in range(train_set.shape[1]):
    bias[:,j] += ISBN_mean[j]
bias -= train_mean

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(train_set.transpose())
print(cosine_sim.shape)
print(cosine_sim)

(10000, 10000)
[[1.         0.52834589 0.50534332 ... 0.01548447 0.03015078 0.01044099]
 [0.52834589 1.         0.46841649 ... 0.0206089  0.0210829  0.01384943]
 [0.50534332 0.46841649 1.         ... 0.00423889 0.01458847 0.00197905]
 ...
 [0.01548447 0.0206089  0.00423889 ... 1.         0.         0.        ]
 [0.03015078 0.0210829  0.01458847 ... 0.         1.         0.        ]
 [0.01044099 0.01384943 0.00197905 ... 0.         0.         1.        ]]


In [7]:
from scipy import sparse
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

device = torch.device('cuda')

class dataset(Dataset):
    def __init__(self, mat, sim, bias):
        self.mat = mat.toarray()
        self.data = mat.tocoo().data
        self.rows = mat.tocoo().row
        self.cols = mat.tocoo().col
        self.simbook = []
        for i in tqdm(range(mat.shape[1])):
            self.simbook.append((-sim[i]).argsort()[1:21])
        self.bias = bias
    def __getitem__(self, index):
        return self.data[index], self.cols[index], self.simbook[self.cols[index]], self.mat[self.rows[index], self.simbook[self.cols[index]]], \
            self.bias[self.rows[index]][self.cols[index]], self.bias[self.rows[index]][self.simbook[self.cols[index]]]
    def __len__(self):
        return len(self.data)

In [39]:
class IW(nn.Module):
    def __init__(self, book_num):
        super(IW, self).__init__()
        self.w = nn.Parameter(torch.empty(book_num, book_num))
        
#         nn.init.orthogonal_(self.w.weight)
    
    def forward(self, i, sim_book, bias_i, bias_j, predict, ground_truth, train=True):
        predict = predict - bias_j
        product = torch.zeros(sim_book.shape[0], dtype=torch.float32, device=device)
        predict = predict.to(torch.float32)
        for j in range(len(sim_book)):
            product[j] = torch.dot(self.w[i[j], sim_book[j]], predict[j])
        return ((bias_i + product - ground_truth) ** 2).sum()

In [9]:
model_train_set = dataset(sparse.load_npz('train_matrix.npz'), cosine_sim, bias)
model_test_set = dataset(sparse.load_npz('test_matrix.npz'), cosine_sim, bias)

train_loader = DataLoader(model_train_set, batch_size = 128, shuffle=True)
test_loader = DataLoader(model_test_set, batch_size = 128)

100%|██████████| 10000/10000 [00:05<00:00, 1763.08it/s]
100%|██████████| 10000/10000 [00:05<00:00, 1768.10it/s]


In [47]:
from tqdm import tqdm 

model = IW(train_set.shape[1]).to(device)
model.load_state_dict(torch.load('IW_model.pkl', map_location=device))

optimizer = optim.SGD(model.parameters(), lr=1e-3, weight_decay=1e-4)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.2, patience=2)
epoch_num = 50

min_loss = 1000

print("start training")
for epoch in range(epoch_num):
    model.train()
    loss_sum = 0.0
    iterator = tqdm(train_loader)
    for rxi, i, simbook, rxj, bxi, bxj in iterator:
        optimizer.zero_grad()
        rxi = rxi.to(device)
        i = i.to(device)
        simbook = simbook.to(device)
        rxj = rxj.to(device).float()
        bxi = bxi.to(device)
        bxj = bxj.to(device).float()
        loss = model(i.long(), simbook, bxi, bxj, rxj, rxi)
        loss.backward()
        optimizer.step()
        loss_sum += loss.item()
        iterator.set_postfix_str('loss={:^7.3f}, lr={:^7.4f}'.format(loss, optimizer.param_groups[0]["lr"]))
    
    model.eval()
    loss_avg = loss_sum / len(train_loader)
    print("Epoch:", epoch, "training loss:", loss_avg)
    
    valid_loss = 0
    iterator = tqdm(test_loader)
    for rxi, i, simbook, rxj, bxi, bxj in iterator:
        rxi = rxi.to(device)
        i = i.to(device)
        simbook = simbook.to(device)
        rxj = rxj.to(device)
        bxi = bxi.to(device)
        bxj = bxj.to(device)
        with torch.no_grad():
            valid_loss += model(i.long(), simbook, bxi, bxj, rxj, rxi)
    
    valid_avg = valid_loss / len(test_loader)
    print("Epoch:", epoch, "validation loss:", valid_avg)
    
    if valid_avg < min_loss:
        min_loss = valid_loss
        torch.save(model.state_dict(), 'IW_model.pkl')

  0%|          | 0/42023 [00:00<?, ?it/s]

start training


 29%|██▉       | 12323/42023 [1:53:59<4:36:38,  1.79it/s, loss=108.900, lr=0.0010 ]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 41%|████      | 17280/42023 [2:39:56<3:48:16,  1.81it/s, loss=107.428, lr=0.0010 ]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 53%|█████▎    | 22319/42023 [3:26:28<3:01:54,  1.81it/s, loss=128.232, lr=0.0010 ]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client i

Epoch: 0 training loss: 111.918788012993


100%|██████████| 4670/4670 [00:46<00:00, 99.52it/s] 


Epoch: 0 validation loss: tensor(138.4365, device='cuda:0', dtype=torch.float64)


  1%|          | 519/42023 [04:49<6:25:58,  1.79it/s, loss=97.973 , lr=0.0010 ]


KeyboardInterrupt: 

In [11]:
torch.save(model.state_dict(), 'IW_model.pkl')

In [48]:
import math
model = IW(train_set.shape[1]).to(device)
model.load_state_dict(torch.load('IW_model.pkl', map_location=device))
model.eval()
valid_loss = 0
iterator = tqdm(test_loader)
for rxi, i, simbook, rxj, bxi, bxj in iterator:
    rxi = rxi.to(device)
    i = i.to(device)
    simbook = simbook.to(device)
    rxj = rxj.to(device)
    bxi = bxi.to(device)
    bxj = bxj.to(device)
    with torch.no_grad():
        valid_loss += model(i.long(), simbook, bxi, bxj, rxj, rxi)
rmse = math.sqrt(valid_loss)
print("rmse = ", rmse)

100%|██████████| 4670/4670 [00:46<00:00, 99.48it/s] 

rmse =  804.0513647410631





In [4]:
import csv
import numpy as np
from collections import defaultdict
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

personal_rating = np.zeros((1, predict.shape[1]))
book_list = [1, 2, 4, 5, 1024, 2048, 4096, 8192]
for books in book_list:
    personal_rating[0][books] = 5
cosine_sim = cosine_similarity(personal_rating, predict)
print(cosine_sim.shape)
top_sim = (-cosine_sim).argsort()[:20]
top_rating = [predict[i] for i in top_sim]
print(top_rating)
top_mean = top_rating.mean(axis=0)
print(top_mean)

(1, 53424)
[[[5.0364695  4.9267977  4.02035965 ... 4.32978723 3.7008547  4.01      ]
  [4.94111829 4.90082773 3.94585828 ... 4.32978723 3.7008547  4.01      ]
  [4.92959934 4.92429346 3.9469804  ... 4.32978723 3.7008547  4.01      ]
  ...
  [4.16962864 4.25740855 3.16808728 ... 4.32978723 3.7008547  4.01      ]
  [3.37458481 2.9547447  2.3961622  ... 4.32978723 3.71854408 4.01      ]
  [3.70905273 3.81092423 2.6129528  ... 4.32978723 3.7008547  4.01      ]]]
[[5.0364695  4.9267977  4.02035965 ... 4.32978723 3.7008547  4.01      ]
 [4.94111829 4.90082773 3.94585828 ... 4.32978723 3.7008547  4.01      ]
 [4.92959934 4.92429346 3.9469804  ... 4.32978723 3.7008547  4.01      ]
 ...
 [4.16962864 4.25740855 3.16808728 ... 4.32978723 3.7008547  4.01      ]
 [3.37458481 2.9547447  2.3961622  ... 4.32978723 3.71854408 4.01      ]
 [3.70905273 3.81092423 2.6129528  ... 4.32978723 3.7008547  4.01      ]]


In [14]:
personal_rating = np.zeros((1, predict.shape[1]))
for books in book_list:
    personal_rating[0][books - 1] = 5
cos = cosine_similarity(personal_rating, original_data)
print(cos)
top_sim = (-cos).argsort()[:20]
top_rating = predict[top_sim].squeeze(0)
top_mean = top_rating.mean(axis=0)
recommendation_list = (-top_mean).argsort()[:20]
print("Recommended:")
for books in recommendation_list:
    print(books + 1, id_dict[books + 1])

[[0.04400208 0.09757142 0.05702144 ... 0.06503325 0.12511975 0.09648062]]
Recommended:


NameError: name 'id_dict' is not defined

In [9]:
print(top_rating.squeeze(0))

[[5.0364695  4.9267977  4.02035965 ... 4.32978723 3.7008547  4.01      ]
 [4.94111829 4.90082773 3.94585828 ... 4.32978723 3.7008547  4.01      ]
 [4.92959934 4.92429346 3.9469804  ... 4.32978723 3.7008547  4.01      ]
 ...
 [4.16962864 4.25740855 3.16808728 ... 4.32978723 3.7008547  4.01      ]
 [3.37458481 2.9547447  2.3961622  ... 4.32978723 3.71854408 4.01      ]
 [3.70905273 3.81092423 2.6129528  ... 4.32978723 3.7008547  4.01      ]]
