# Load & Explore data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training ratings from CSV and preview the first rows.
dataset = pd.read_csv('rating_train.csv')
dataset.head()

Unnamed: 0,UserID,MovieID,timestamps,Rating
0,1,1836,978300172,5
1,1,1097,978301953,4
2,1,2028,978301619,5
3,1,527,978824195,5
4,1,2918,978302124,4


Count the unique user IDs and movie IDs.

In [3]:
# Number of distinct users and distinct movies in the training ratings.
dataset['UserID'].nunique(dropna=False), dataset['MovieID'].nunique(dropna=False)

(6040, 3672)

We assign each user a unique integer index in the range [0, #users) and do the same for movies.
We also store the mappings (original ID → index) so the same encoding can be applied to other data later.

In [4]:
# Build lookup tables from original IDs to contiguous 0-based indices.
# An ID's position in the category list is exactly the code that
# `.cat.codes` assigns, so these dicts reproduce that encoding.
user_categories = dataset.UserID.astype('category').cat.categories
movie_categories = dataset.MovieID.astype('category').cat.categories
u_d = {user_id: idx for idx, user_id in enumerate(user_categories)}
m_d = {movie_id: idx for idx, movie_id in enumerate(movie_categories)}

In [5]:
# Overwrite the raw IDs with their 0-based category codes so they can be
# used directly as embedding row indices.
for id_col in ('UserID', 'MovieID'):
    dataset[id_col] = dataset[id_col].astype('category').cat.codes.values

In [6]:
# Re-display the first rows to confirm UserID/MovieID now hold category codes.
dataset.head()

Unnamed: 0,UserID,MovieID,timestamps,Rating
0,0,1639,978300172,5
1,0,1009,978301953,4
2,0,1828,978301619,5
3,0,512,978824195,5
4,0,2685,978302124,4


Split the data into a training set and a held-out validation set (80/20) to monitor how well the model generalizes.

In [7]:
from sklearn.model_selection import train_test_split

# Fix the shuffle seed so the 80/20 split (and therefore the reported
# validation loss) is the same after every kernel restart.
SPLIT_SEED = 42
train, test = train_test_split(dataset, test_size=0.2, random_state=SPLIT_SEED)

In [8]:
# (rows, columns) of the training split.
train.shape

(640154, 4)

In [9]:
# (rows, columns) of the held-out split.
test.shape

(160039, 4)

# Create model

In [10]:
from keras.layers import Embedding, Reshape, Dot, Dropout, Dense,Input
from keras.models import Sequential
from keras import Model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [11]:
# Embedding table sizes: one row per distinct (re-encoded) user / movie.
user_num = dataset.UserID.nunique(dropna=False)
item_num = dataset.MovieID.nunique(dropna=False)

# Length of each latent factor vector.
embed_dim = 5

# Movie branch: integer index -> embedding row -> flat (embed_dim,) vector.
movie_input = Input(shape=(1,))
movie_embedding = Embedding(input_dim=item_num, output_dim=embed_dim)(movie_input)
movie_vec = Reshape(target_shape=(embed_dim,))(movie_embedding)

# User branch: same structure as the movie branch.
user_input = Input(shape=(1,))
user_embedding = Embedding(input_dim=user_num, output_dim=embed_dim)(user_input)
user_vec = Reshape(target_shape=(embed_dim,))(user_embedding)

# The predicted rating is the dot product of the two latent vectors.
dot = Dot(axes=1)([movie_vec, user_vec])

# Inputs are ordered [user, movie]; callers must pass arrays in that order.
model = Model(inputs=[user_input, movie_input], outputs=dot)
model.compile(optimizer='adam', loss='mean_squared_error')

In [12]:
# Print each layer's output shape and parameter count.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 5)         18360       input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 5)         30200       input_2[0][0]                    
__________________________________________________________________________________________________
reshape_1 

In [13]:
# Train for 20 epochs, evaluating on the held-out split after each epoch.
# Inputs are passed as [user, movie], the order used when the model was built.
train_inputs = [train.UserID, train.MovieID]
val_inputs = [test.UserID, test.MovieID]
history = model.fit(
    train_inputs,
    train.Rating,
    epochs=20,
    validation_data=(val_inputs, test.Rating),
)

Train on 640154 samples, validate on 160039 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


# Load test data & Predict

In [14]:
def get_value(k, d, default=0):
    """Translate a raw ID into its encoded index.

    Parameters
    ----------
    k : hashable
        Raw ID to look up.
    d : dict
        Mapping from raw ID to encoded index.
    default : int, optional
        Value returned when ``k`` is not a key of ``d`` (default 0).

    Returns
    -------
    The value stored under ``k``, or ``default`` when ``k`` is unknown.

    Notes
    -----
    Unknown IDs are not randomised. They all fall back to ``default``, so
    with the default of 0 they share the index of whichever ID was encoded
    as 0.
    """
    return d.get(k, default)

In [15]:
# Load the rows to be scored from the test CSV.
dataset_test = pd.read_csv('rating_test.csv')

Map the test-set user and movie IDs to indices using the same dictionaries built from the training data.

In [16]:
# Re-encode the test IDs with the lookup tables built from the training data;
# IDs missing from a table receive get_value's fallback index.
for id_col, lookup in (('UserID', u_d), ('MovieID', m_d)):
    dataset_test[id_col] = dataset_test[id_col].apply(get_value, d=lookup)

In [17]:
dataset_test.head()

Unnamed: 0,UserID,MovieID,timestamps
0,0,838,978301968
1,0,1818,978301777
2,0,2568,978302039
3,0,1162,978300055
4,0,1405,978824139


In [18]:
# Score every test row; inputs follow the model's [user, movie] order.
prediction = model.predict([dataset_test['UserID'], dataset_test['MovieID']])

# dataset_test now holds encoded indices, so re-read the CSV to recover the
# original IDs for the output file.
res = pd.read_csv('rating_test.csv')

# The model emits one value per row, so predict() is expected to return an
# (n, 1) array. Flatten it explicitly instead of relying on pandas to
# squeeze a 2-D array into a single column.
res['Rating'] = prediction.ravel()

In [19]:
# Inspect the first predictions next to the original (un-encoded) IDs.
res.head()

Unnamed: 0,UserID,MovieID,timestamps,Rating
0,1,914,978301968,4.548431
1,1,2018,978301777,4.320057
2,1,2797,978302039,4.796607
3,1,1270,978300055,4.779355
4,1,1545,978824139,4.231502


In [20]:
# Write only the three output columns, without the DataFrame index.
submission = res[['UserID', 'MovieID', 'Rating']]
submission.to_csv('Q8_output.csv', index=False)