<a href="https://colab.research.google.com/github/YoheiFukuhara/recommender-system/blob/main/05_itembased.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import datetime
import pickle

from sortedcontainers import SortedList
from google.colab import drive
import numpy as np

drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
BASE_PATH = '/content/drive/MyDrive/ColabNotebooks/ML/Recommend/output/'

In [4]:
def load_pickle(file):
    with open(BASE_PATH+file+'.pickle', 'rb') as f:
        return pickle.load(f)

user2movie = load_pickle('user2movie')
movie2user = load_pickle('movie2user')
usermovie2rating = load_pickle('usermovie2rating')
usermovie2rating_test = load_pickle('usermovie2rating_test')

In [5]:
N = np.max(list(user2movie.keys())) + 1
m1 = np.max(list(movie2user.keys()))
m2 = np.max([m for (u, m), r in usermovie2rating_test.items()])
M = max(m1, m2) + 1
print('N:', N, 'M:', M)  #if N is more than 10000, it takes so much time to process.

N: 4000 M: 1000


In [7]:
%%time
K = 20 # number of neighbors we'd like to consider
limit = 5 # number of common movies users must have in common in order to consider
neighbors = [] # store neighbors in this list
averages = [] # each user's average rating for later use
deviations = [] # each user's deviation for later use

def calc_movie_data(n):
    users_n_set = set(movie2user[n])

   # Movieの各rating
    ratings = { user: usermovie2rating[(user, n)] for user in movie2user[n] }
 
    # Movieの平均Rating
    avg = np.mean(list(ratings.values()))  

    # Movieの平均Rating と実Ratingとの差 の一覧
    dev = {user: (rating - avg) for user, rating in ratings.items()} 

    # Movieの平均Rating と実Ratingとの差 の一覧(User情報なし)
    dev_values = np.array(list(dev.values()))

    # ベクトル内積を計算することで偏差を出力
    # https://www.yukisako.xyz/entry/correlation-coefficient
    sigma = np.sqrt(dev_values.dot(dev_values))

    return users_n_set, avg, dev, sigma


for i in range(M):
    users_i_set, avg_i, dev_i, sigma_i = calc_movie_data(i)

    averages.append(avg_i)
    deviations.append(dev_i)

    sl = SortedList()

    # 対称的データなので、計算量を半分にできるが、半分にせずにアルゴリズムを簡略化
    for j in range(M):
        if j != i:
            common_users = (users_i_set & set(movie2user[j])) # 和集合
            if len(common_users) > limit:
                
                _, avg_j, dev_j, sigma_j = calc_movie_data(j)
                
                # 相関係数算出
                w_ij = sum(dev_i[m]*dev_j[m] for m in common_users) / (sigma_i * sigma_j)

                # insert into sorted list and truncate
                # negate weight, because list is sorted ascending
                # maximum value (1) is "closest"
                sl.add((-w_ij, j))

                # しきい値Kを超過していたら末尾(最低値)を削除
                if len(sl) > K:
                    del sl[-1]
    
    neighbors.append(sl)
    print("\r{0}".format(i), end="")

999CPU times: user 46min 17s, sys: 31.9 s, total: 46min 49s
Wall time: 46min 4s


In [8]:
def predict(i, u):
    # calculate the weighted sum of deviations
    numerator = 0
    denominator = 0
    
    # k近傍のj読込
    for neg_w, j in neighbors[i]:
        # remember, the weight is stored as its negative
        # so the negative of the negative weight is the positive weight
        try:
            numerator += -neg_w * deviations[j][u] # 分子
            denominator += abs(neg_w)
        except KeyError:
        # neighbor may not have been rated by the same user
        # don't want to do dictionary lookup twice
        # so just throw exception
            pass

    if denominator == 0:
        prediction = averages[i]
    else:
        prediction = numerator / denominator + averages[i]
    prediction = min(5, prediction)
    prediction = max(0.5, prediction) # min rating is 0.5
    return prediction

In [9]:
%%time
train_predictions = []
train_targets = []
for (u, m), target in usermovie2rating.items():
    # calculate the prediction for this movie
    prediction = predict(m, u)

  # save the prediction and target
    train_predictions.append(prediction)
    train_targets.append(target)

CPU times: user 48.9 s, sys: 281 ms, total: 49.2 s
Wall time: 49.2 s


In [10]:
%%time
test_predictions = []
test_targets = []
# same thing for test set
for (u, m), target in usermovie2rating_test.items():
    # calculate the prediction for this movie
    prediction = predict(m, u)

  # save the prediction and target
    test_predictions.append(prediction)
    test_targets.append(target)

CPU times: user 12.3 s, sys: 60.7 ms, total: 12.3 s
Wall time: 12.3 s


In [11]:
# calculate accuracy
def mse(p, t):
    p = np.array(p)
    t = np.array(t)
    return np.mean((p - t)**2)

print('train mse:', mse(train_predictions, train_targets))
print('test mse:', mse(test_predictions, test_targets))

train mse: 0.5218819071439091
test mse: 0.5505804410197407
