In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

import keras
from IPython.display import SVG
from keras.optimizers import Adam
from keras.utils.vis_utils import model_to_dot

%matplotlib inline

Using TensorFlow backend.


In [2]:
# Directory prefix for every CSV this notebook reads or writes. Paths are built
# by plain string concatenation, so the value must end with a path separator.
DATAPATH = "data/"

In [3]:
# Load the training ratings and shift both id columns down by one
# (presumably the raw ids are 1-based and the embeddings want 0-based
# indices -- TODO confirm against the raw file).
dataset = (
    pd.read_csv(
        DATAPATH+"cleaned_data_train.csv",
        names=["user_id", "movie_id", "prediction"],
    )
    .assign(
        user_id=lambda ratings: ratings["user_id"] - 1,
        movie_id=lambda ratings: ratings["movie_id"] - 1,
    )
)
dataset.head()

Unnamed: 0,user_id,movie_id,prediction
0,43,0,4
1,60,0,3
2,66,0,4
3,71,0,3
4,85,0,5


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the ratings for evaluation. A fixed random_state keeps the
# split (and therefore the reported test error) comparable across re-runs;
# without it every execution evaluates on a different held-out set.
train, test = train_test_split(dataset, test_size=0.1, random_state=42)

In [5]:
# Number of distinct users / movies in the full ratings table; these size the
# embedding tables of the model.
n_users = dataset["user_id"].unique().size
n_movies = dataset["movie_id"].unique().size

In [50]:
# Width of the learned embedding vector for each side of the network.
n_latent_factors_user = 5
n_latent_factors_movie = 8

# Movie branch: integer id -> embedding -> flat vector -> dropout.
movie_input = keras.layers.Input(shape=[1],name='Item')
movie_embedding = keras.layers.Embedding(n_movies + 1, n_latent_factors_movie, name='Movie-Embedding')(movie_input)
movie_vec = keras.layers.Flatten(name='FlattenMovies')(movie_embedding)
movie_vec = keras.layers.Dropout(0.2)(movie_vec)

# User branch: same structure as the movie branch.
user_input = keras.layers.Input(shape=[1],name='User')
user_embedding = keras.layers.Embedding(n_users + 1, n_latent_factors_user,name='User-Embedding')(user_input)
user_vec = keras.layers.Flatten(name='FlattenUsers')(user_embedding)
user_vec = keras.layers.Dropout(0.2)(user_vec)

# Join both embeddings into one feature vector.
concat = keras.layers.concatenate([movie_vec, user_vec], name="Concat")

# Hidden stack: 100 -> 25 -> 25 units, no dropout in between.
# Only layers on the path from the inputs to `result` become part of the
# model, so this cell builds exactly those layers and nothing else.
# NOTE(review): these Dense layers use the default (linear) activation, so the
# stack composes to a single linear map before the output layer -- consider
# adding a non-linearity (e.g. activation='relu') if that is not intended.
dense_2 = keras.layers.Dense(100,name='FullyConnected-1')(concat)
dense_3 = keras.layers.Dense(25,name='FullyConnected-2')(dense_2)
dense_4 = keras.layers.Dense(25,name='FullyConnected-3')(dense_3)

# Single-unit output; relu keeps the predicted rating non-negative.
result = keras.layers.Dense(1, activation='relu',name='Activation')(dense_4)

# The model takes [user ids, movie ids] (in that order) and is trained with
# Adam on mean squared error.
adam = Adam(lr=0.005)
model = keras.Model([user_input, movie_input], result)
model.compile(optimizer=adam,loss= 'mean_squared_error')

In [51]:
# Print the layer-by-layer summary of the compiled model (output shapes,
# parameter counts and which layer each one is connected to).
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Item (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
User (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
Movie-Embedding (Embedding)     (None, 1, 8)         8008        Item[0][0]                       
__________________________________________________________________________________________________
User-Embedding (Embedding)      (None, 1, 5)         50005       User[0][0]                       
__________________________________________________________________________________________________
FlattenMov

In [55]:
# Fit on the training split for 5 epochs. The input list follows the model's
# input order: user ids first, then movie ids.
history = model.fit(
    x=[train.user_id, train.movie_id],
    y=train.prediction,
    epochs=5,
    verbose=1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [56]:
# Predict ratings for the held-out split, round to whole numbers and clamp
# them into the 1..5 range.
y_hat = np.clip(
    np.round(model.predict([test.user_id, test.movie_id]), 0),
    1,
    5,
)
y_true = test.prediction

In [57]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Experiment log (presumably values of this metric from earlier runs):
#   4 layers (100, 50, 25, 10) without dropout: 1.094965
#   4 layers (100, 50, 25, ..., 25, 10) without dropout: (no value recorded)

# Mean squared error between the true ratings and the rounded, clamped
# predictions on the held-out split.
mean_squared_error(y_true=y_true, y_pred=y_hat)

1.0946251359434476

In [11]:
# Load the (user, movie) pairs that need a prediction and shift both id
# columns down by one, mirroring the preprocessing of the training ratings.
dataset_to_predict = (
    pd.read_csv(
        DATAPATH+"cleaned_sample.csv",
        names=["user_id", "movie_id", "prediction"],
    )
    .assign(
        user_id=lambda pairs: pairs["user_id"] - 1,
        movie_id=lambda pairs: pairs["movie_id"] - 1,
    )
)
dataset_to_predict.head()

Unnamed: 0,user_id,movie_id,prediction
0,36,0,3
1,72,0,3
2,155,0,3
3,159,0,3
4,247,0,3


In [12]:
# Score every requested (user, movie) pair, round to whole ratings and clamp
# the result into the 1..5 range.
predictions = np.clip(
    np.round(model.predict([dataset_to_predict.user_id, dataset_to_predict.movie_id]), 0),
    1,
    5,
)

In [13]:
# Replace the "prediction" column loaded from the CSV with the model's
# predictions, cast to integers.
dataset_to_predict["prediction"] = predictions.astype(int)

In [14]:
# Preview the first rows to check the updated "prediction" column.
dataset_to_predict.head()

Unnamed: 0,user_id,movie_id,prediction
0,36,0,3
1,72,0,3
2,155,0,4
3,159,0,3
4,247,0,3


In [15]:
def create_submission(predictions_frame=None, path=None):
    """Write the predictions as a submission CSV.

    The file starts with the header ``Id,Prediction`` followed by one line per
    row in the form ``r<user_id + 1>_c<movie_id + 1>,<prediction>``. The ``+ 1``
    undoes the ``- 1`` shift applied to the ids when the data was loaded.

    Parameters
    ----------
    predictions_frame : pandas.DataFrame, optional
        Frame with ``user_id``, ``movie_id`` and ``prediction`` columns.
        Defaults to the notebook-level ``dataset_to_predict``.
    path : str, optional
        Destination file. Defaults to ``DATAPATH + "submission4.csv"``.
    """
    if predictions_frame is None:
        predictions_frame = dataset_to_predict
    if path is None:
        path = DATAPATH+"submission4.csv"

    # Read the columns by name instead of by row position: iterating rows
    # upcasts mixed dtypes (ids could turn into floats) and positional access
    # on a label-indexed row is deprecated in recent pandas.
    rows = zip(
        predictions_frame["user_id"],
        predictions_frame["movie_id"],
        predictions_frame["prediction"],
    )

    # The context manager closes the file even if writing a row fails.
    with open(path, "w") as f:
        f.write("Id,Prediction\n")
        f.writelines(
            'r'+str(user + 1)+'_c'+str(movie + 1)+','+str(rating)+'\n'
            for user, movie, rating in rows
        )

In [16]:
# Uncomment to write the submission file:
#create_submission()