# MLP model

### Set dataset and current working dir

In [1]:
!nvidia-smi
%cd /content
!git clone https://github.com/Brycexu727/movielens-dataset.git
%cd movielens-dataset

Sun Aug 11 13:09:36 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 410.79       CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P8    16W /  70W |      0MiB / 15079MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

### Import necessary modules

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from collections import defaultdict
from keras.layers import Embedding, Reshape, Activation
from keras.layers import Input, Dense, Flatten, Dropout
from keras.layers.merge import Dot, multiply, concatenate
from keras.models import Sequential, Model

Using TensorFlow backend.


### Set MLP parameters

In [0]:
#train_test_ratio=50 # eg. 50 stands for 50% training, 20% for testing
train_test_ratio=80 # eg. 80 stands for 80% for training, 20% for testing

epoch_size = 20

data_set_name = "ml-100k"
#data_set_name = "ml-1m"

### train and test

In [4]:
def MLP(item_num, user_num,hidden_feature=30):
    user_in = Input(shape=(1,), dtype='int32')
    user_embedding = Embedding(user_num+1, hidden_feature, name="user")(user_in)
    user_bis = Embedding(user_num+1, 1, name="userbias")(user_in)
    
    movie_in = Input(shape=(1,), dtype='int32')
    movie_embedding = Embedding(item_num+1, hidden_feature, name="movie")(movie_in)
    movie_bis = Embedding(item_num+1, 1, name="movie_bias")(movie_in)

    hidden_output = Dense(1)(Dense(20, activation="tanh")(Flatten()(concatenate([Dropout(0.5)(multiply([movie_embedding, user_embedding])), user_bis, movie_bis]))))

    m = Model(
        inputs=[movie_in, user_in], 
        outputs=hidden_output
    )
    
    m.compile(
        loss='mse', 
        optimizer='adam', 
        metrics=["mse"]
    )

    return m
def data_process_mlp(data_set_name, train_test_ratio=50):
    data_frame_title = [
        'userId', 
        'movieId', 
        'rating', 
        'timestamp'
        ]
    select = [
        "userId", 
        "movieId", 
        "rating"]
    if data_set_name == "ml-100k":
        data = pd.read_csv(
            './ml-100k/u.data', 
            sep ='\t', 
            names=data_frame_title)
    elif data_set_name == "ml-1m":
        data = pd.read_csv(
            './ml-1m/ratings.dat', 
            sep='::', 
            names=data_frame_title)
    else:
        print("unknown data set!")
    
    val = 1
    movies = {}
    dic1 = defaultdict(int)
    for e in data["movieId"]:
        dic1[e] += 1
    for e in dic1:
        movies[e] = val
        val += 1
    data["movieId"] = data["movieId"].map(movies)

    val = 1
    users = {}
    dic2 = defaultdict(int)
    for e in data["movieId"]:
        dic2[e] += 1
    for e in dic2:
        users[e] = val
        val += 1
   
    data["movieId"] = data["movieId"].map(users)

    split_part = np.percentile(data["timestamp"], train_test_ratio)
    
    test_data = data[data.timestamp>=split_part][select]
    train_data = data[data.timestamp<split_part][select]
 
    user_num = max(data["userId"].tolist())
    item_num = max(data["movieId"].tolist())

    return train_data, test_data, user_num, item_num, movies

train_data, test_data, user_num, item_num, movies =\
data_process_mlp(data_set_name)

model = MLP(item_num, user_num)

train = model.fit(
    [
        np.array([[e] for e in train_data["movieId"]]),
        np.array([[e] for e in train_data["userId"]])
    ],
    np.array([[e] for e in train_data["rating"]]),
    epochs=epoch_size,
    validation_split=0.2, 
    verbose=1
)

test = model.predict([
    np.array([[e] for e in test_data["movieId"]]),
    np.array([[e] for e in test_data["userId"]])
])

print("Test RMSE: %s" % np.sqrt(mean_squared_error(test_data["rating"], test)))

W0811 13:09:51.648156 140364200474496 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0811 13:09:51.661111 140364200474496 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0811 13:09:51.663580 140364200474496 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0811 13:09:51.700240 140364200474496 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0811 13:09:51.708598 

Train on 39998 samples, validate on 10000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test RMSE: 1.073300044359594
