In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
import pickle
from scipy.sparse import linalg, lil_matrix, csr_matrix

In [2]:
f = open('data/title.pickle','rb')
title_list = pickle.load(f)
f.close()
title = np.array(title_list)

### ratingの行列・辞書作成

In [3]:
rating_df = pd.read_csv('data/u.data', sep='\t',
                        names=['user id', 'item id',
                               'rating', 'timestamp'])
id_rating = [[] for i in range(943)]
for i in range(len(rating_df)):
    id_rating[rating_df['user id'][i]-1]\
    += [(rating_df['item id'][i], rating_df['rating'][i])]

# 各userのratingを一つずつtest用にする
id_test_rating = [[] for i in range(943)]
for i in range(len(id_rating)):
    random_index = np.random.randint(len(id_rating[i]))
    id_test_rating[i] = [id_rating[i][random_index]]
    del id_rating[i][random_index]
    
id_rating_np = np.zeros((len(id_rating), 1682))
for i in range(len(id_rating)):
    for j in range(len(id_rating[i])):
        id_rating_np[i-1][id_rating[i][j][0]-1] = id_rating[i][j][1]

In [4]:
# 各movie id が training に１つ以上入っていない場合エラーを出力
I = id_rating_np > 0
for i in range(len(I[0])):
    assert I[:,i].sum() >= 1, 'test_split error'
id_rating_lil = lil_matrix(id_rating_np)
del id_rating_np

In [5]:
rating_pair = {}
for i in range(len(id_rating)):
    for j in range(len(id_rating[i])):
        rating_pair[(i+1, id_rating[i][j][0])] =\
        id_rating[i][j][1]

In [6]:
title_test_pair = [{} for i in range(943)]
rating_test_pair = {}
for i in range(len(id_test_rating)):
    for j in range(len(id_test_rating[i])):
        rating_test_pair[(i+1, id_test_rating[i][j][0])] =\
        id_test_rating[i][j][1]
        title_test_pair[i][title[id_rating[i][j][0]-1]] =\
        id_rating[i][j][1]

### 実装


参考サイト
http://www.grappa.univ-lille3.fr/~mary/cours/stats/centrale/reco/paper/MatrixFactorizationALS.pdf

In [7]:
# 条件設定
epochs = 5
noize_rate = 0.01
common_len = 5

# 初期値設定
u_len, m_len = id_rating_lil.shape
I = id_rating_lil > 0
u = np.random.uniform(0, 5, u_len*common_len).reshape(u_len, common_len)
m = np.random.uniform(0, 5, m_len*common_len).reshape(common_len, m_len)
u = lil_matrix(u).tocsr()
m = lil_matrix(m).tocsc()
training_len = len(rating_pair)
test_len = len(rating_test_pair)

# training
for epoch in range(epochs):
    for i in range(u_len):
        I_i = np.array(I.getrow(i).todense())[0]
        u[i] = (linalg.inv(m[:,I_i].dot(m[:,I_i].T)+
                                  noize_rate*I[i].sum()*
                                  sparse.eye(common_len))).dot(
            m[:,I_i].dot(id_rating_lil[i,I_i].transpose())).transpose()

    for j in range(m_len):
        I_j = np.array(I.transpose().getrow(j).todense())[0]
        m[:,j] = (linalg.inv(u[I_j].T.dot(u[I_j]) +
                                      noize_rate*I[:,j].sum()*
                                    sparse.eye(common_len))).dot(
                        u[I_j].transpose().dot(id_rating_lil[I_j,j]).todense())
    # predict
    pred = u.dot(m)
    
    # rmse
    loss = np.sqrt((np.power((id_rating_lil - pred)[I],2)).sum()/training_len)

    # test_loss
    test_loss = 0
    for pair in rating_test_pair:
        true_rating = rating_test_pair[pair]
        test_loss += ((true_rating-pred[pair[0]-1,pair[1]-1])**2)/test_len

    print('epoch : {0:>3}, loss : {1:.7f}, test_loss : {2:.7f}'\
          .format(epoch+1, loss, np.sqrt(float(test_loss))))

epoch :   1, loss : 0.9170484, test_loss : 1.1764875
epoch :   2, loss : 0.8485681, test_loss : 1.2464507
epoch :   3, loss : 0.8257378, test_loss : 1.3016431
epoch :   4, loss : 0.8140628, test_loss : 1.3180568
epoch :   5, loss : 0.8078590, test_loss : 1.3107776


In [8]:
for pair in list(rating_test_pair.keys())[:15]:
    true_rating = rating_test_pair[pair]
    print('true_rating : {0:>2}, predict_rating : {1:.3f}'
          .format(true_rating, pred[pair[0]-1,pair[1]-1]))

true_rating :  3, predict_rating : 3.865
true_rating :  3, predict_rating : 3.641
true_rating :  1, predict_rating : 1.392
true_rating :  4, predict_rating : 4.059
true_rating :  3, predict_rating : 3.272
true_rating :  4, predict_rating : 4.029
true_rating :  2, predict_rating : 2.936
true_rating :  4, predict_rating : 3.726
true_rating :  4, predict_rating : 3.888
true_rating :  1, predict_rating : 3.223
true_rating :  2, predict_rating : 2.417
true_rating :  4, predict_rating : 2.894
true_rating :  4, predict_rating : 3.998
true_rating :  3, predict_rating : 3.709
true_rating :  5, predict_rating : 2.588


### u, mの保存

In [9]:
sparse.save_npz("output/u.npz",u)
sparse.save_npz("output/m.npz",m)