## 读取 MovieLens 数据

In [2]:
import pandas

# The file is tab-delimited with no header row, so the four column names are
# supplied explicitly. NOTE(review): 'tm' looks like a Unix timestamp — confirm.
training_set = pandas.read_csv(
    'ml-100k/u1.base',
    delimiter='\t',
    header=None,
    names=['userid', 'itemid', 'rating', 'tm'],
)
# Preview the first five records.
training_set.head()

Unnamed: 0,userid,itemid,rating,tm
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [3]:
import pandas

# Same layout as the training file: tab-delimited, no header row, four
# explicitly named columns.
test_set = pandas.read_csv(
    'ml-100k/u1.test',
    delimiter='\t',
    header=None,
    names=['userid', 'itemid', 'rating', 'tm'],
)
# Preview the first five records.
test_set.head()

Unnamed: 0,userid,itemid,rating,tm
0,1,6,5,887431973
1,1,10,3,875693118
2,1,12,5,878542960
3,1,14,5,874965706
4,1,17,3,875073198


## 计算电影与使用者数量

In [4]:
# Largest item id seen in either split; used below as the number of matrix
# columns. Cast to a plain int so the value is the same type as before.
n_movies = int(max(training_set.itemid.max(), test_set.itemid.max()))
n_movies

1682

In [5]:
# Largest user id seen in either split; used below as the number of matrix
# rows. Cast to a plain int so the value is the same type as before.
n_users = int(max(training_set.userid.max(), test_set.userid.max()))
n_users

943

## 建立训练数据集矩阵

In [6]:
import numpy as np

# Binary user x movie interaction matrix: entry [u, m] is 1 when the training
# set contains a record for (userid u+1, itemid m+1), else 0. The rating value
# itself is not used. Ids are 1-based, hence the "- 1".
training_m = np.zeros((n_users, n_movies))
# One vectorized assignment replaces the per-row iterrows() loop; duplicate
# (user, item) pairs are harmless because the assigned value is always 1.
training_m[training_set.userid.values - 1, training_set.itemid.values - 1] = 1

In [7]:
# Sanity check: the matrix was allocated as (n_users, n_movies).
training_m.shape

(943, 1682)

In [8]:
# Display the interaction matrix (NumPy elides the middle of large arrays).
training_m

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

## 建立测试数据集矩阵

In [9]:
import numpy as np

# Binary user x movie interaction matrix for the test split: entry [u, m] is 1
# when the test set contains a record for (userid u+1, itemid m+1), else 0.
# The rating value itself is not used. Ids are 1-based, hence the "- 1".
test_m = np.zeros((n_users, n_movies))
# One vectorized assignment replaces the per-row iterrows() loop; duplicate
# (user, item) pairs are harmless because the assigned value is always 1.
test_m[test_set.userid.values - 1, test_set.itemid.values - 1] = 1

In [10]:
# Sanity check: the matrix was allocated as (n_users, n_movies).
test_m.shape

(943, 1682)

In [11]:
# Display the test interaction matrix (NumPy elides the middle of large arrays).
test_m

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## 建立 Autoencoders

In [12]:
from keras.layers import Input, Dense
from keras.models import Model

# Width of the bottleneck (number of latent features per sample).
encoding_dim = 50

# Every sample fed to the network is a vector with one entry per user.
input_data = Input(shape=(n_users,))

# Encoder: a single dense layer compressing n_users values down to encoding_dim.
# NOTE(review): softmax on a hidden layer forces the code to sum to 1, which is
# unusual for an autoencoder bottleneck — confirm this is intentional.
encoded = Dense(units=encoding_dim, activation='softmax')(input_data)

# Decoder: a single dense layer back to n_users values. No activation is
# passed, so the layer uses the Dense default.
decoded = Dense(units=n_users)(encoded)

# End-to-end model: input vector -> bottleneck -> reconstruction.
# NOTE(review): with mostly-zero targets, mean absolute error may favour
# all-zero reconstructions — worth verifying against the metrics below.
autoencoder = Model(inputs=input_data, outputs=decoded)
autoencoder.compile(loss='mean_absolute_error', optimizer='adam')

Using TensorFlow backend.


## 训练 Autoencoders

In [13]:
# Train the network to reproduce its own input. The matrices are transposed so
# that each sample is one movie (a vector over all users). The transposed test
# matrix is used as validation data.
autoencoder.fit(
    x=training_m.T,
    y=training_m.T,
    batch_size=32,
    epochs=100,
    shuffle=True,
    validation_data=(test_m.T, test_m.T),
)

Train on 1682 samples, validate on 1682 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100

Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0xb255ce898>

## 建立 Encoder

In [17]:
# Stand-alone encoder: maps an input vector to its bottleneck representation.
# It is built from the same tensors as the autoencoder, so it reuses that
# model's encoding layer.
encoder = Model(inputs=input_data, outputs=encoded)

In [18]:
# Encode every row of the transposed training matrix (one row per movie).
encoded_data = encoder.predict(training_m.T)

In [19]:
# Expect one row per movie and encoding_dim columns.
encoded_data.shape

(1682, 50)

## 建立 Decoder

In [20]:
# Stand-alone decoder: feeds a fresh encoding_dim-wide input through the
# autoencoder's last layer. The layer object itself is reused, not copied.
encoded_input = Input(shape=(encoding_dim,))
decoder_layer = autoencoder.layers[-1]
decoder = Model(inputs=encoded_input, outputs=decoder_layer(encoded_input))

In [21]:
# Reconstruct the training data from its encoded representation.
pred = decoder.predict(encoded_data)

In [22]:
# The reconstruction should have the same shape as training_m.T.
pred.shape

(1682, 943)

## 检视原始与重建数据

In [23]:
# Binarize the reconstructed scores in column 0 of pred (the first user, over
# all movies): 1 where the score exceeds 0.001, otherwise 0.
np.where(pred[:, 0] > 0.001, 1, 0)

array([0, 0, 0, ..., 0, 0, 0])

In [24]:
# Fraction of movies for which the thresholded reconstruction of the first
# user's column equals that user's entry in the training matrix.
((pred[:, 0] > 0.001).astype(int) == training_m.T[:, 0]).mean()

0.9197384066587396

In [25]:
# Fraction of all (movie, user) entries where the thresholded reconstruction
# equals the training matrix. Taking the mean of the boolean array divides by
# the actual number of entries rather than a hard-coded 1682 * 943.
((pred > 0.001).astype(int) == training_m.T).mean()

0.9465799060099891

In [26]:
# Zero the reconstruction wherever the training matrix is 0, so only entries
# with a recorded interaction contribute a non-zero error. The masking is done
# on a copy (np.where) so `pred` itself stays untouched and the earlier cells
# give the same results when re-run.
pred_observed = np.where(training_m.T == 0, 0, pred)
# Note: the mean is taken over ALL entries, including the masked ones.
mse = np.mean(np.power(training_m.T.flatten() - pred_observed.flatten(), 2))
mse

0.044705959974219195

## 套用到测试数据集

In [27]:
# Encode every row of the transposed test matrix (one row per movie).
encoded_test_data = encoder.predict(test_m.T)

In [28]:
# Reconstruct the test data from its encoded representation.
pred_test = decoder.predict(encoded_test_data)

In [29]:
# Fraction of all (movie, user) entries where the thresholded reconstruction
# equals the test matrix. Taking the mean of the boolean array divides by the
# actual number of entries rather than a hard-coded 1682 * 943.
((pred_test > 0.001).astype(int) == test_m.T).mean()

0.963425982551197

In [30]:
# Zero the reconstruction wherever the test matrix is 0, so only entries with
# a recorded interaction contribute a non-zero error. The masking is done on a
# copy (np.where) so `pred_test` itself stays untouched and the earlier cells
# give the same results when re-run.
pred_test_observed = np.where(test_m.T == 0, 0, pred_test)
# Note: the mean is taken over ALL entries, including the masked ones.
mse = np.mean(np.power(test_m.T.flatten() - pred_test_observed.flatten(), 2))
mse

0.012590618234723053

In [33]:
!jupyter nbconvert --to script movieRecommend.py


Traceback (most recent call last):
  File "/Users/GuoYu/miniconda3/envs/env/bin/jupyter-nbconvert", line 11, in <module>
    sys.exit(main())
  File "/Users/GuoYu/miniconda3/envs/env/lib/python3.6/site-packages/jupyter_core/application.py", line 266, in launch_instance
    return super(JupyterApp, cls).launch_instance(argv=argv, **kwargs)
  File "/Users/GuoYu/miniconda3/envs/env/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/Users/GuoYu/miniconda3/envs/env/lib/python3.6/site-packages/nbconvert/nbconvertapp.py", line 325, in start
    self.convert_notebooks()
  File "/Users/GuoYu/miniconda3/envs/env/lib/python3.6/site-packages/nbconvert/nbconvertapp.py", line 483, in convert_notebooks
    self.exporter = cls(config=self.config)
  File "/Users/GuoYu/miniconda3/envs/env/lib/python3.6/site-packages/nbconvert/exporters/templateexporter.py", line 243, in __init__
    super(TemplateExporter, self).__init__(config