In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
import pickle
from scipy.sparse import linalg, lil_matrix, csr_matrix

In [3]:
f = open('../data/title.pickle','rb')
title_list = pickle.load(f)
f.close()
title = np.array(title_list)

### ratingの行列・辞書作成

In [4]:
rating_df = pd.read_csv('../data/u.data', sep='\t',
                        names=['user id', 'item id',
                               'rating', 'timestamp'])
id_rating = [[] for i in range(943)]
for i in range(len(rating_df)):
    id_rating[rating_df['user id'][i]-1]\
    += [(rating_df['item id'][i], rating_df['rating'][i])]

# 各userのratingを一つずつtest用にする
id_test_rating = [[] for i in range(943)]
for i in range(len(id_rating)):
    random_index = np.random.randint(len(id_rating[i]))
    id_test_rating[i] = [id_rating[i][random_index]]
    del id_rating[i][random_index]
    
id_rating_np = np.zeros((len(id_rating), 1682))
for i in range(len(id_rating)):
    for j in range(len(id_rating[i])):
        id_rating_np[i-1][id_rating[i][j][0]-1] = id_rating[i][j][1]

In [5]:
# 各movie id が training に１つ以上入っていない場合エラーを出力
I = id_rating_np > 0
for i in range(len(I[0])):
    assert I[:,i].sum() >= 1, 'test_split error'
id_rating_lil = lil_matrix(id_rating_np)
del id_rating_np

In [6]:
rating_pair = {}
for i in range(len(id_rating)):
    for j in range(len(id_rating[i])):
        rating_pair[(i+1, id_rating[i][j][0])] =\
        id_rating[i][j][1]

In [7]:
title_test_pair = [{} for i in range(943)]
rating_test_pair = {}
for i in range(len(id_test_rating)):
    for j in range(len(id_test_rating[i])):
        rating_test_pair[(i+1, id_test_rating[i][j][0])] =\
        id_test_rating[i][j][1]
        title_test_pair[i][title[id_rating[i][j][0]-1]] =\
        id_rating[i][j][1]

### 実装


参考サイト
http://www.grappa.univ-lille3.fr/~mary/cours/stats/centrale/reco/paper/MatrixFactorizationALS.pdf

In [8]:
# 条件設定
epochs = 5
noize_rate = 0.01
common_len = 5

# 初期値設定
u_len, m_len = id_rating_lil.shape
I = id_rating_lil > 0
u = np.random.uniform(0, 5, u_len*common_len).reshape(u_len, common_len)
m = np.random.uniform(0, 5, m_len*common_len).reshape(common_len, m_len)

training_len = len(rating_pair)
test_len = len(rating_test_pair)

for epoch in range(epochs):
    # training
    for i in range(u_len):
            I_i = np.array(I.getrow(i).todense())[0]
            u[i] = np.dot(np.linalg.inv(np.dot(m[:, I_i],m[:, I_i].T) +
                               noize_rate * I[i].sum() *
                               np.eye(common_len)),
                np.dot(m[:, I_i],id_rating_lil[i, I_i].transpose().todense())).transpose()

    for j in range(m_len):
            I_j = np.array(I.transpose().getrow(j).todense())[0]
            m[:, j] = np.squeeze(np.array(np.dot(np.linalg.inv(np.dot(u[I_j].T,u[I_j]) +
                                  noize_rate * I[:, j].sum() *
                                  np.eye(common_len)),
                np.dot(u[I_j].transpose(),id_rating_lil[I_j, j].todense()))))
        
    # predict
    pred = np.dot(u, m)

    # rmse
    loss = np.sqrt((np.power((id_rating_lil - pred)[np.array(I.todense())]
                             , 2).sum() / training_len))

    # test_loss
    test_loss = 0
    for pair in rating_test_pair:
        true_rating = rating_test_pair[pair]
        test_loss += ((true_rating -
                       pred[pair[0] - 1, pair[1] - 1])**2) / test_len

    print('epoch : {0:>3}, loss : {1:.7f}, test_loss : {2:.7f}'
              .format(epoch + 1, loss, np.sqrt(float(test_loss))))

epoch :   1, loss : 0.9151854, test_loss : 1.1645950
epoch :   2, loss : 0.8467475, test_loss : 1.2153937
epoch :   3, loss : 0.8219495, test_loss : 1.2728493
epoch :   4, loss : 0.8092257, test_loss : 1.3206034
epoch :   5, loss : 0.8016830, test_loss : 1.3313299


In [9]:
for pair in list(rating_test_pair.keys())[:15]:
    true_rating = rating_test_pair[pair]
    print('true_rating : {0:>2}, predict_rating : {1:.3f}'
          .format(true_rating, pred[pair[0]-1,pair[1]-1]))

true_rating :  3, predict_rating : 3.695
true_rating :  4, predict_rating : 3.214
true_rating :  4, predict_rating : 3.069
true_rating :  3, predict_rating : 2.860
true_rating :  4, predict_rating : 2.989
true_rating :  5, predict_rating : 3.468
true_rating :  2, predict_rating : 3.991
true_rating :  4, predict_rating : 4.488
true_rating :  2, predict_rating : 3.022
true_rating :  3, predict_rating : 1.925
true_rating :  4, predict_rating : 3.650
true_rating :  4, predict_rating : 3.662
true_rating :  4, predict_rating : 4.355
true_rating :  3, predict_rating : 3.799
true_rating :  1, predict_rating : 2.459


### u, mの保存

In [9]:
sparse.save_npz("output/u.npz",u)
sparse.save_npz("output/m.npz",m)