In [1]:
# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# For data preprocess
import numpy as np
import csv
import os

# For plotting
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from utils.mylib import *
from d2l import torch as d2l
from math import sqrt

In [2]:
n, m = 1050, 2050
records = np.loadtxt('../data/ML100K/ML100K_copy1_train.txt', dtype='long')
records = records[:, :3]

R = np.zeros([n, m], dtype=np.int32)
for record in records:
    R[record[0], record[1]] = record[2]


In [3]:
y = np.where(R, 1, 0)
y_user = np.sum(y, axis=1)
y_item = np.sum(y, axis=0)

# Global average of rating
r = np.sum(R) / np.sum(y)
y_user = np.sum(y, axis=1)
r_u = np.where(y_user,
               np.sum(R, axis=1) / y_user,
               r)

  np.sum(R, axis=1) / y_user,


In [4]:
s_i = np.zeros([m, m])
for i in range(m):
    for j in range(m // 2):
        users = (R[:, i] != 0) * (R[:, j] != 0)
        if users.sum() == 0:
            continue
        vec_i = R[users, i] - r_u[users]
        vec_j = R[users, j] - r_u[users]

        dot = vec_i.dot(vec_j)
        mag_vec_i = sqrt(np.square(vec_i).sum())
        mag_vec_j = sqrt(np.square(vec_j).sum())
        if mag_vec_i == 0 or mag_vec_j == 0:
            continue
        similarity = dot / mag_vec_i / mag_vec_j

        s_i[i, j] = s_i[j, i] = similarity

In [5]:
s_i.min()

-1.0000000000000002

In [6]:
s_i.max()

1.0000000000000004

In [7]:
s_i.mean()

0.0031093084401356296

In [8]:
s_i.std()

0.40915964498082247

In [9]:
S = torch.from_numpy(s_i)

In [11]:
torch.save(S, "./similarity_100K.pt")

In [19]:
K = 20
N = np.empty((m, K))

In [20]:
for item in range(m):
    # neighbours = np.where(s_i[:, item] > 0)[0]
    neighbours = np.arange(m)
    # have zero neighbour which was rated by the user
    if len(neighbours) < K:
        continue

    # sort by similarity and select K of them
    neighbours_aug = np.vstack([neighbours, s_i[item, neighbours]])
    N[item] = neighbours[np.argsort(neighbours_aug[1, :])[-K:]]

In [24]:
torch.save(torch.LongTensor(N), "./N20_100K.pt")