In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
import pickle
f = open('data/title.pickle','rb')
title_list = pickle.load(f)
f.close()
title = np.array(title_list)

### ratingの行列・辞書作成

In [3]:
rating_df = pd.read_csv('data/u.data', sep='\t',
                        names=['user id', 'item id',
                               'rating', 'timestamp'])
id_rating = [[] for i in range(943)]
for i in range(len(rating_df)):
    id_rating[rating_df['user id'][i]-1]\
    += [(rating_df['item id'][i], rating_df['rating'][i])]

# 各userのratingを一つずつtest用にする
id_test_rating = [[] for i in range(943)]
for i in range(len(id_rating)):
    random_index = np.random.randint(len(id_rating[i]))
    id_test_rating[i] = [id_rating[i][random_index]]
    del id_rating[i][random_index]
    
id_rating_np = np.zeros((len(id_rating), 1682))
for i in range(len(id_rating)):
    for j in range(len(id_rating[i])):
        id_rating_np[i-1][id_rating[i][j][0]-1] = id_rating[i][j][1]

In [4]:
# 各movie id が training に１つ以上入っていない場合エラーを出力
I = id_rating_np > 0
for i in range(len(I[0])):
    assert I[:,i].sum() >= 1, 'test_split error'

In [5]:
rating_pair = {}
for i in range(len(id_rating)):
    for j in range(len(id_rating[i])):
        rating_pair[(i+1, id_rating[i][j][0])] =\
        id_rating[i][j][1]

In [6]:
title_test_pair = [{} for i in range(943)]
rating_test_pair = {}
for i in range(len(id_test_rating)):
    for j in range(len(id_test_rating[i])):
        rating_test_pair[(i+1, id_test_rating[i][j][0])] =\
        id_test_rating[i][j][1]
        title_test_pair[i][title[id_rating[i][j][0]-1]] =\
        id_rating[i][j][1]

### 実装

参考サイト
http://www.grappa.univ-lille3.fr/~mary/cours/stats/centrale/reco/paper/MatrixFactorizationALS.pdf

In [7]:
# 条件設定
epochs = 5
noize_rate = 0.01
common_len = 5

# 初期値設定
u_len, m_len = id_rating_np.shape
I = id_rating_np > 0
u = np.random.uniform(0, 5, u_len*common_len).reshape(u_len, common_len)
m = np.random.uniform(0, 5, m_len*common_len).reshape(common_len, m_len)

training_len = len(rating_pair)
test_len = len(rating_test_pair)

# training
for epoch in range(epochs):
    for i in range(u_len):
        u[i] = np.dot(np.linalg.inv(np.dot(m[:,I[i]],
                                           m[:,I[i]].T) +
                                    noize_rate*sum(I[i])*np.eye(common_len)),
                      np.dot(m[:,I[i]],id_rating_np[i,I[i]].T)).T

    for j in range(m_len):
        m[:,j] = np.dot(np.linalg.inv(np.dot(u[I[:,j]].T,
                                             u[I[:,j]]) +
                                      noize_rate*sum(I[:,j])*np.eye(common_len)),
                        np.dot(u[I[:,j]].T,id_rating_np[I[:,j],j]))
    
    # predict
    pred = np.dot(u, m)

    # rmse
    loss = np.sqrt(np.sum((I * (id_rating_np - pred))**2)/training_len)

    # test_loss
    test_loss = 0
    for pair in rating_test_pair:
        true_rating = rating_test_pair[pair]
        test_loss += ((true_rating-pred[pair[0]-1]
                       [pair[1]-1])**2)/test_len

    print('epoch : {0:>3}, loss : {1:.7f}, test_loss : {2:.7f}'
          .format(epoch+1, np.sqrt(loss), np.sqrt(test_loss)))

epoch :   1, loss : 0.9562696, test_loss : 1.1397485
epoch :   2, loss : 0.9200680, test_loss : 1.2122342
epoch :   3, loss : 0.9080537, test_loss : 1.2319804
epoch :   4, loss : 0.9011994, test_loss : 1.2632399
epoch :   5, loss : 0.8971836, test_loss : 1.2862321


In [8]:
# test_loss
for pair in rating_test_pair:
    true_rating = rating_test_pair[pair]
    test_loss += [0.5*((true_rating - \
                        pred[pair[0]-1][pair[1]-1])**2)]
    print('true_rating : {0:>2}, predict_rating : {1:.3f}'
          .format(true_rating, pred[pair[0]-1][pair[1]-1]))

true_rating :  4, predict_rating : 2.078
true_rating :  5, predict_rating : 4.317
true_rating :  4, predict_rating : 2.751
true_rating :  5, predict_rating : 4.106
true_rating :  4, predict_rating : 2.836
true_rating :  1, predict_rating : 3.952
true_rating :  4, predict_rating : 3.096
true_rating :  4, predict_rating : 3.532
true_rating :  4, predict_rating : 3.154
true_rating :  2, predict_rating : 4.106
true_rating :  2, predict_rating : 0.450
true_rating :  3, predict_rating : 3.267
true_rating :  4, predict_rating : 4.293
true_rating :  3, predict_rating : 4.212
true_rating :  4, predict_rating : 4.653
true_rating :  4, predict_rating : 3.281
true_rating :  1, predict_rating : 3.493
true_rating :  4, predict_rating : 2.749
true_rating :  3, predict_rating : 4.066
true_rating :  4, predict_rating : 2.753
true_rating :  4, predict_rating : 3.155
true_rating :  5, predict_rating : 4.823
true_rating :  4, predict_rating : 2.617
true_rating :  4, predict_rating : 4.181
true_rating :  5

### training_dataに含まれていないオススメの映画を3つ表示

In [9]:
for i in range(u_len):
    print('user : {0}\n'.format(i+1))
    pred_index = np.delete(np.arange(m_len),
                           np.array(id_rating[i])[:,0]-1)
    print('test : {0}\nrecommend : {1}\n'
          .format(title_test_pair[i],
                  title[pred_index[pred[i][pred_index].argsort()[:-4:-1]]]))

user : 1

test : {'Three Colors: White (1994)': 4}
recommend : ['World of Apu, The (Apur Sansar) (1959)' 'Harlem (1993)'
 'Boys, Les (1997)']

user : 2

test : {'Rosewood (1997)': 4}
recommend : ['Schizopolis (1996)' 'Flirt (1995)' 'Lamerica (1994)']

user : 3

test : {'How to Be a Player (1997)': 1}
recommend : ['Angel Baby (1995)' 'Love and Death on Long Island (1997)' 'Safe (1995)']

user : 4

test : {'Mimic (1997)': 3}
recommend : ['Love and Death on Long Island (1997)' 'Angel Baby (1995)'
 'Aparajito (1956)']

user : 5

test : {'GoldenEye (1995)': 3}
recommend : ['Angel Baby (1995)' 'City of Industry (1997)' 'Mina Tannenbaum (1994)']

user : 6

test : {'Remains of the Day, The (1993)': 3}
recommend : ['Harlem (1993)' 'Boys, Les (1997)'
 'World of Apu, The (Apur Sansar) (1959)']

user : 7

test : {'Crumb (1994)': 4}
recommend : ['World of Apu, The (Apur Sansar) (1959)' 'Hugo Pool (1997)'
 'Faster Pussycat! Kill! Kill! (1965)']

user : 8

test : {'Bean (1997)': 4}
recommend : ['Hote

一応test用のデータを表示しましたが、評価が高くないものに対しては参考にならないです。<br>
評価が高いものについても、疎行列のためうまくrecommendできていないようです。