In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
import pickle
f = open('../data/title.pickle','rb')
title_list = pickle.load(f)
f.close()
title = np.array(title_list)

### ratingの行列・辞書作成

In [3]:
rating_df = pd.read_csv('../data/u.data', sep='\t',
                        names=['user id', 'item id',
                               'rating', 'timestamp'])
id_rating = [[] for i in range(943)]
for i in range(len(rating_df)):
    id_rating[rating_df['user id'][i]-1]\
    += [(rating_df['item id'][i], rating_df['rating'][i])]

# 各userのratingを一つずつtest用にする
id_test_rating = [[] for i in range(943)]
for i in range(len(id_rating)):
    random_index = np.random.randint(len(id_rating[i]))
    id_test_rating[i] = [id_rating[i][random_index]]
    del id_rating[i][random_index]
    
id_rating_np = np.zeros((len(id_rating), 1682))
for i in range(len(id_rating)):
    for j in range(len(id_rating[i])):
        id_rating_np[i-1][id_rating[i][j][0]-1] = id_rating[i][j][1]

In [4]:
# 各movie id が training に１つ以上入っていない場合エラーを出力
I = id_rating_np > 0
for i in range(len(I[0])):
    assert I[:,i].sum() >= 1, 'test_split error'

In [5]:
rating_pair = {}
for i in range(len(id_rating)):
    for j in range(len(id_rating[i])):
        rating_pair[(i+1, id_rating[i][j][0])] =\
        id_rating[i][j][1]

In [6]:
title_test_pair = [{} for i in range(943)]
rating_test_pair = {}
for i in range(len(id_test_rating)):
    for j in range(len(id_test_rating[i])):
        rating_test_pair[(i+1, id_test_rating[i][j][0])] =\
        id_test_rating[i][j][1]
        title_test_pair[i][title[id_rating[i][j][0]-1]] =\
        id_rating[i][j][1]

### 実装

参考サイト
http://www.grappa.univ-lille3.fr/~mary/cours/stats/centrale/reco/paper/MatrixFactorizationALS.pdf

In [8]:
%time
# 条件設定
epochs = 5
noize_rate = 0.01
common_len = 5

# 初期値設定
u_len, m_len = id_rating_np.shape
I = id_rating_np > 0
u = np.random.uniform(0, 5, u_len*common_len).reshape(u_len, common_len)
m = np.random.uniform(0, 5, m_len*common_len).reshape(common_len, m_len)

training_len = len(rating_pair)
test_len = len(rating_test_pair)

# training
for epoch in range(epochs):
    for i in range(u_len):
        u[i] = np.dot(np.linalg.inv(np.dot(m[:,I[i]],
                                           m[:,I[i]].T) +
                                    noize_rate*sum(I[i])*np.eye(common_len)),
                      np.dot(m[:,I[i]],id_rating_np[i,I[i]].T)).T

    for j in range(m_len):
        m[:,j] = np.dot(np.linalg.inv(np.dot(u[I[:,j]].T,
                                             u[I[:,j]]) +
                                      noize_rate*sum(I[:,j])*np.eye(common_len)),
                        np.dot(u[I[:,j]].T,id_rating_np[I[:,j],j]))
    
    # predict
    pred = np.dot(u, m)

    # rmse
    loss = np.sqrt(np.sum((I * (id_rating_np - pred))**2)/training_len)

    # test_loss
    test_loss = 0
    for pair in rating_test_pair:
        true_rating = rating_test_pair[pair]
        test_loss += ((true_rating-pred[pair[0]-1]
                       [pair[1]-1])**2)/test_len

    print('epoch : {0:>3}, loss : {1:.7f}, test_loss : {2:.7f}'
          .format(epoch+1, loss, np.sqrt(test_loss)))

CPU times: user 6 µs, sys: 1 µs, total: 7 µs
Wall time: 14.1 µs
epoch :   1, loss : 0.9154756, test_loss : 1.1948953
epoch :   2, loss : 0.8413301, test_loss : 1.2474239
epoch :   3, loss : 0.8199498, test_loss : 1.2867644
epoch :   4, loss : 0.8105117, test_loss : 1.3111826
epoch :   5, loss : 0.8051103, test_loss : 1.3270330


In [9]:
for pair in list(rating_test_pair.keys())[:15]:
    true_rating = rating_test_pair[pair]
    print('true_rating : {0:>2}, predict_rating : {1:.3f}'
          .format(true_rating, pred[pair[0]-1][pair[1]-1]))

true_rating :  3, predict_rating : 4.163
true_rating :  5, predict_rating : 2.873
true_rating :  4, predict_rating : 2.001
true_rating :  3, predict_rating : 3.668
true_rating :  2, predict_rating : 4.137
true_rating :  4, predict_rating : 3.538
true_rating :  3, predict_rating : 3.523
true_rating :  4, predict_rating : 2.909
true_rating :  2, predict_rating : 1.526
true_rating :  4, predict_rating : 3.642
true_rating :  3, predict_rating : 3.677
true_rating :  4, predict_rating : 3.981
true_rating :  4, predict_rating : 3.210
true_rating :  5, predict_rating : 3.929
true_rating :  4, predict_rating : 3.859


### training_dataに含まれていないオススメの映画を3つ表示

In [10]:
for i in range(5):
    print('user : {0}\n'.format(i+1))
    pred_index = np.delete(np.arange(m_len),
                           np.array(id_rating[i])[:,0]-1)
    print('test : {0}\nrecommend : {1}\n'
          .format(title_test_pair[i],
                  title[pred_index[pred[i][pred_index].argsort()[:-4:-1]]]))

user : 1

test : {'Grand Day Out, A (1992)': 3}
recommend : ['Angel Baby (1995)' 'Santa with Muscles (1996)'
 'Little Princess, The (1939)']

user : 2

test : {'Rosewood (1997)': 4}
recommend : ['Angel Baby (1995)' 'Stalker (1979)' 'Killer (Bulletproof Heart) (1994)']

user : 3

test : {'How to Be a Player (1997)': 1}
recommend : [ 'Old Lady Who Walked in the Sea, The (Vieille qui marchait dans la mer, La) (1991)'
 '8 Seconds (1994)' 'Stalker (1979)']

user : 4

test : {'Mimic (1997)': 3}
recommend : ['Hurricane Streets (1998)' 'Little City (1998)' 'Angel Baby (1995)']

user : 5

test : {'GoldenEye (1995)': 3}
recommend : ['Ciao, Professore! (1993)' 'Incognito (1997)' "Boy's Life 2 (1997)"]



一応test用のデータを表示しましたが、評価が高くないものに対しては参考にならないです。<br>
評価が高いものについても、疎行列のためうまくrecommendできていないようです。