In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Import Keras libraries
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from keras.layers import Embedding, Reshape, Input
from keras.models import Model
from keras.layers import dot

Using TensorFlow backend.


In [2]:
# Read the ratings file, keeping only the three columns used in this notebook.
rating_df = pd.read_csv(
    "ml-latest-small/ratings.csv",
    usecols=['userId', 'movieId', 'rating'],
)
rating_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [3]:
# Largest user / movie id present in the ratings (used later to size the
# embedding tables). A column's maximum is unaffected by duplicate values,
# so it is taken directly.
max_userid = rating_df['userId'].max()
max_movieid = rating_df['movieId'].max()

In [4]:
# Build the training arrays: shuffle the rating rows once with a fixed seed,
# then take the three columns from that same shuffled frame so they stay aligned.
shuffled_ratings = rating_df.sample(frac=1., random_state=1)

Users = shuffled_ratings['userId'].values     # model input 1
Movies = shuffled_ratings['movieId'].values   # model input 2
Ratings = shuffled_ratings['rating'].values   # regression target

# Show each array together with its shape.
for label, values in (('Users:', Users), ('Movies:', Movies), ('Ratings:', Ratings)):
    print(label, values, ', shape =', values.shape)

Users: [225 533 125 ...  34 483 607] , shape = (100836,)
Movies: [   380 103688 149902 ...    318   8644   1213] , shape = (100836,)
Ratings: [3. 5. 2. ... 4. 4. 4.] , shape = (100836,)


In [5]:
# Read the movie metadata (id, title and pipe-separated genres).
movies_df = pd.read_csv(
    "ml-latest-small/movies.csv",
    usecols=['movieId', 'title', 'genres'],
)
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Attach title and genres to every rating row by joining on the shared movieId key.
df = movies_df.merge(rating_df, on="movieId")
df.head()

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5


In [7]:
# A second full shuffle of the ratings table, drawn with a different seed
# than the training shuffle.
mixed_ratings = rating_df.sample(frac=1.0, random_state=2)

In [8]:
# Number of latent factors, i.e. the width of each user / movie embedding vector
K_FACTORS = 100
# userId whose ratings are inspected and predicted in the cells below
TEST_USER = 200

### Creating Model
### Implementing matrix factorization for collaborative filtering with the Keras functional API, since Sequential models cannot be merged


In [9]:
# Each sample is a single integer id per input.
input_1 = Input(shape=(1,))  # user id
input_2 = Input(shape=(1,))  # movie id

# One K_FACTORS-wide embedding row per id. The tables are sized max id + 1 so
# the raw ids can be used directly as row indices. Reshape drops the length-1
# sequence axis so P and Q are plain (batch, K_FACTORS) factor vectors.
P = Reshape((K_FACTORS,))(Embedding(max_userid + 1, K_FACTORS, input_length=1)(input_1))
Q = Reshape((K_FACTORS,))(Embedding(max_movieid + 1, K_FACTORS, input_length=1)(input_2))

# Predicted rating = plain dot product of the user and movie factors.
# normalize=True must NOT be used here: it L2-normalises both vectors first,
# turning the output into a cosine similarity bounded to [-1, 1], which can
# never reach targets such as 4.0 or 5.0.
P_dot_Q = dot([P, Q], axes=1)


W0816 16:54:46.127563 140582424041280 deprecation_wrapper.py:119] From /home/bivav/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0816 16:54:46.274981 140582424041280 deprecation_wrapper.py:119] From /home/bivav/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0816 16:54:46.308229 140582424041280 deprecation_wrapper.py:119] From /home/bivav/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.



In [10]:
# Two-input model: (user id, movie id) -> predicted rating.
model = Model(inputs=[input_1, input_2], outputs=P_dot_Q)

# The target is a real-valued rating, so this is a regression problem.
# Classification accuracy is meaningless here; mean absolute error is
# reported alongside the MSE loss instead.
model.compile(loss='mse', optimizer='adamax', metrics=['mae'])

W0816 16:54:46.384781 140582424041280 deprecation_wrapper.py:119] From /home/bivav/anaconda3/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



In [None]:
# Watch the validation loss: stop if it has not improved for 2 epochs, and
# write the model to 'weights.h5' only when the validation loss improves.
early_stopping = EarlyStopping(monitor='val_loss', patience=2)
best_checkpoint = ModelCheckpoint(filepath='weights.h5', save_best_only=True)
callbacks = [early_stopping, best_checkpoint]

# Train for 3 epochs, holding out 10% of the samples for validation.
history = model.fit(
    x=[Users, Movies],
    y=Ratings,
    epochs=3,
    validation_split=.1,
    verbose=1,
    callbacks=callbacks,
)

In [None]:
# Find the epoch with the lowest validation loss (the earliest epoch wins a tie).
# The loss is MSE, so its square root is reported as the RMSE.
val_losses = history.history['val_loss']
idx = min(range(len(val_losses)), key=val_losses.__getitem__)
min_val_loss = val_losses[idx]
print('Minimum RMSE at epoch', '{:d}'.format(idx+1), '=', '{:.4f}'.format(math.sqrt(min_val_loss)))

In [None]:
# Wrap the same input/output tensors in a fresh Model and load into it the
# checkpoint file written by the ModelCheckpoint callback.
trained_model = Model(outputs=P_dot_Q, inputs=[input_1, input_2])
trained_model.load_weights(filepath='weights.h5')

### Testing User

In [None]:
# First few ratings given by the test user.
rating_df.loc[rating_df['userId'] == TEST_USER].head()

### Function to predict the rating

In [None]:
def predict(user_id, item_id, model=None):
    """Predict the rating a user would give a movie.

    The ids are passed to the network unchanged. The embeddings are sized
    ``max id + 1`` and the model is fitted on the raw ``userId`` / ``movieId``
    columns, so shifting an id by one would look up a different
    user's / movie's embedding row.

    Parameters
    ----------
    user_id : int
        Raw ``userId`` value.
    item_id : int or float
        Raw ``movieId`` value.
    model : object, optional
        Anything exposing ``predict([users, items])``. Defaults to the
        notebook-level ``trained_model``.

    Returns
    -------
    The scalar at position ``[0][0]`` of the model output, i.e. the predicted
    rating for the single (user, movie) pair.
    """
    if model is None:
        # Resolved at call time so the function can be defined before training.
        model = trained_model
    users = np.array([user_id])
    items = np.array([item_id])
    return model.predict([users, items])[0][0]

In [None]:
# Ratings the test user actually gave, as an independent copy so a column can be added.
is_test_user = rating_df['userId'] == TEST_USER
user_ratings = rating_df.loc[is_test_user, ['userId', 'movieId', 'rating']].copy()

# Model prediction for every movie this user rated.
user_ratings['prediction'] = user_ratings.apply(
    lambda row: predict(TEST_USER, row['movieId']), axis=1
)

# Show the highest-rated movies next to their titles and predictions.
(
    user_ratings
    .sort_values(by='rating', ascending=False)
    .merge(movies_df, on='movieId', how='inner', suffixes=['_u', '_m'])
    .head()
)

In [None]:
# Candidate movies: every distinct movieId in the ratings that the test user has not rated.
already_rated = rating_df['movieId'].isin(user_ratings['movieId'])
recommendations = rating_df[~already_rated][['movieId']].drop_duplicates()

# Score each candidate for the test user.
recommendations['prediction'] = recommendations.apply(
    lambda row: predict(TEST_USER, row['movieId']), axis=1
)

# Top predictions, joined with the movie titles.
(
    recommendations
    .sort_values(by='prediction', ascending=False)
    .merge(movies_df, on='movieId', how='inner', suffixes=['_u', '_m'])
    .head()
)