## ALS_WRを用いた推定

In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
import pickle
f = open('../data/title.pickle','rb')
title_list = pickle.load(f)
f.close()
title = np.array(title_list)

### ratingの行列・辞書作成

In [3]:
rating_df = pd.read_csv('../data/u.data', sep='\t',
                        names=['user id', 'item id',
                               'rating', 'timestamp'])
id_rating = [[] for i in range(943)]
for i in range(len(rating_df)):
    id_rating[rating_df['user id'][i]-1]\
    += [(rating_df['item id'][i], rating_df['rating'][i])]

# 各userのratingを一つずつtest用にする
id_test_rating = [[] for i in range(943)]
for i in range(len(id_rating)):
    random_index = np.random.randint(len(id_rating[i]))
    id_test_rating[i] = [id_rating[i][random_index]]
    del id_rating[i][random_index]
    
id_rating_np = np.zeros((len(id_rating), 1682))
for i in range(len(id_rating)):
    for j in range(len(id_rating[i])):
        id_rating_np[i-1][id_rating[i][j][0]-1] = id_rating[i][j][1]

In [4]:
# 各movie id が training に１つ以上入っていない場合エラーを出力
I = id_rating_np > 0
for i in range(len(I[0])):
    assert I[:,i].sum() >= 1, 'test_split error'

In [5]:
rating_pair = {}
for i in range(len(id_rating)):
    for j in range(len(id_rating[i])):
        rating_pair[(i+1, id_rating[i][j][0])] =\
        id_rating[i][j][1]

In [6]:
title_test_pair = [{} for i in range(943)]
rating_test_pair = {}
for i in range(len(id_test_rating)):
    for j in range(len(id_test_rating[i])):
        rating_test_pair[(i+1, id_test_rating[i][j][0])] =\
        id_test_rating[i][j][1]
        title_test_pair[i][title[id_rating[i][j][0]-1]] =\
        id_rating[i][j][1]

### 実装

参考サイト
http://www.grappa.univ-lille3.fr/~mary/cours/stats/centrale/reco/paper/MatrixFactorizationALS.pdf

In [7]:
%time
# 条件設定
epochs = 5
noize_rate = 0.01
common_len = 5

# 初期値設定
u_len, m_len = id_rating_np.shape
I = id_rating_np > 0
u = np.empty((u_len, common_len))
m = np.random.uniform(0, 5, m_len*common_len).reshape(common_len, m_len)
m[0] = np.sum(id_rating_np,axis=0)/np.sum(I,axis=0)
training_len = len(rating_pair)
test_len = len(rating_test_pair)

# training
for epoch in range(epochs):
    for i in range(u_len):
        u[i] = np.dot(np.linalg.inv(np.dot(m[:,I[i]],
                                           m[:,I[i]].T) +
                                    noize_rate*sum(I[i])*np.eye(common_len)),
                      np.dot(m[:,I[i]],id_rating_np[i,I[i]].T)).T
    u[u<0] = 0
    for j in range(m_len):
        m[:,j] = np.dot(np.linalg.inv(np.dot(u[I[:,j]].T,
                                             u[I[:,j]]) +
                                      noize_rate*sum(I[:,j])*np.eye(common_len)),
                        np.dot(u[I[:,j]].T,id_rating_np[I[:,j],j]))
    m[m<0] = 0
    # predict
    pred = np.dot(u, m)

    # rmse
    loss = np.sqrt(np.sum((I * (id_rating_np - pred))**2)/training_len)

    # test_loss
    test_loss = 0
    for pair in rating_test_pair:
        true_rating = rating_test_pair[pair]
        test_loss += ((true_rating-pred[pair[0]-1]
                       [pair[1]-1])**2)/test_len

    print('epoch : {0:>3}, loss : {1:.7f}, test_loss : {2:.7f}'
          .format(epoch+1, loss, np.sqrt(test_loss)))

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 7.15 µs
epoch :   1, loss : 0.9528019, test_loss : 1.1845386
epoch :   2, loss : 0.8794842, test_loss : 1.1975379
epoch :   3, loss : 0.8542239, test_loss : 1.1904117
epoch :   4, loss : 0.8428345, test_loss : 1.1887213
epoch :   5, loss : 0.8351463, test_loss : 1.1886822


In [8]:
for pair in list(rating_test_pair.keys())[:15]:
    true_rating = rating_test_pair[pair]
    print('true_rating : {0:>2}, predict_rating : {1:.3f}'
          .format(true_rating, pred[pair[0]-1][pair[1]-1]))

true_rating :  5, predict_rating : 3.226
true_rating :  5, predict_rating : 0.861
true_rating :  3, predict_rating : 3.826
true_rating :  5, predict_rating : 3.650
true_rating :  2, predict_rating : 3.203
true_rating :  4, predict_rating : 4.023
true_rating :  3, predict_rating : 2.327
true_rating :  5, predict_rating : 2.430
true_rating :  4, predict_rating : 4.160
true_rating :  1, predict_rating : 2.627
true_rating :  4, predict_rating : 4.403
true_rating :  1, predict_rating : 4.012
true_rating :  1, predict_rating : 1.641
true_rating :  4, predict_rating : 3.477
true_rating :  4, predict_rating : 2.918


### training_dataに含まれていないオススメの映画を3つ表示

In [9]:
for i in range(5):
    print('user : {0}\n'.format(i+1))
    pred_index = np.delete(np.arange(m_len),
                           np.array(id_rating[i])[:,0]-1)
    print('test : {0}\nrecommend : {1}\n'
          .format(title_test_pair[i],
                  title[pred_index[pred[i][pred_index].argsort()[:-4:-1]]]))

user : 1

test : {'Three Colors: White (1994)': 4}
recommend : ['American Dream (1990)' 'Leading Man, The (1996)' 'Mina Tannenbaum (1994)']

user : 2

test : {'Rosewood (1997)': 4}
recommend : ['Angel Baby (1995)' 'American Dream (1990)' 'Hearts and Minds (1996)']

user : 3

test : {'How to Be a Player (1997)': 1}
recommend : ['Leading Man, The (1996)' 'Swept from the Sea (1997)' 'Hugo Pool (1997)']

user : 4

test : {'Mimic (1997)': 3}
recommend : ['American Dream (1990)' 'Angel Baby (1995)' 'Leading Man, The (1996)']

user : 5

test : {'GoldenEye (1995)': 3}
recommend : ['American Dream (1990)' 'Angel Baby (1995)' 'Leading Man, The (1996)']



一応test用のデータを表示しましたが、評価が高くないものに対しては参考にならないです。<br>
評価が高いものについても、疎行列のためうまくrecommendできていないようです。