# Model selection

In [1]:
import pandas as pd
import sys
import os
sys.path.append("../")
from definitions import ROOT_DIR
import numpy as np

In [2]:
data_folder = os.path.join(ROOT_DIR, 'data/interim')

In [4]:
data = pd.read_csv(os.path.join(data_folder, 'data_processed.csv'))
films = pd.read_csv(os.path.join(data_folder, 'films_processed.csv'))
users = pd.read_csv(os.path.join(data_folder, 'user_processed.csv'))

In [5]:
data.head(2)

Unnamed: 0,user_id,item_id,rating,timestamp,user_emb_id,item_emb_id
0,196,242,3,881250949,195,241
1,186,302,3,891717742,185,301


In [6]:
films.head(2)

Unnamed: 0,movie_id,movie title,release_date,IMDb_URL,category
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,Animation|Children's|Comedy
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,Action|Adventure|Thriller


In [7]:
users.head(2)

Unnamed: 0,user_id,age,gender,occupation,zip_code,age_range,occ_desc
0,1,24,M,technician,85711,18-24,19
1,2,53,F,other,94043,50-55,13


In [9]:
shuffled_ratings = data.sample(frac=1., random_state=42)

Users = shuffled_ratings['user_emb_id'].values
print('Users:', Users, ', shape =', Users.shape)

Movies = shuffled_ratings['item_emb_id'].values
print('Movies:', Movies, ', shape =', Movies.shape)

# Shuffling ratings
Ratings = shuffled_ratings['rating'].values
print('Ratings:', Ratings, ', shape =', Ratings.shape)

Users: [876 814  93 ... 436 283 221] , shape = (100000,)
Movies: [380 601 430 ... 474 321 199] , shape = (100000,)
Ratings: [4 3 4 ... 3 3 3] , shape = (100000,)


In [12]:
max_userid = data['user_id'].drop_duplicates().max()
max_movieid = data['item_id'].drop_duplicates().max()

In [64]:
from tensorflow.keras.layers import Embedding, Reshape, Dot, Input
from tensorflow.keras.models import Model as KerasModel
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint

class Model(KerasModel):
    def __init__(self, n_users, m_items, k_factors, **kwargs):
        # Input layers for user and item
        user_input = Input(shape=(1,), dtype='int32', name='user_input')
        item_input = Input(shape=(1,), dtype='int32', name='item_input')

        # Embedding layers for user and item
        P = Embedding(n_users, k_factors, input_length=1)(user_input)
        Q = Embedding(m_items, k_factors, input_length=1)(item_input)

        # Reshape layers
        P = Reshape((k_factors,))(P)
        Q = Reshape((k_factors,))(Q)

        # Dot product layer
        rating = Dot(axes=1)([P, Q])

        super(Model, self).__init__(inputs=[user_input, item_input], outputs=rating, **kwargs)

    def rate(self, users_id, items_id):
        return self.predict([np.array(users_id), np.array(items_id)], verbose=0)

In [67]:
K_FACTORS = 100 # The number of dimensional embeddings for movies and users
model = Model(max_userid, max_movieid, K_FACTORS)
model.compile(loss='mse', optimizer='adamax')

In [69]:
# Callbacks monitor the validation loss
# Save the model weights each time the validation loss has improved
callbacks = [EarlyStopping('val_loss', patience=2), 
             ModelCheckpoint('weights.h5', save_best_only=True)]

# Use 30 epochs, 90% training data, 10% validation data 
history = model.fit([Users, Movies], Ratings, epochs=40, validation_split=.05, verbose=2, callbacks=callbacks)

Epoch 1/40
2969/2969 - 1s - loss: 0.7795 - val_loss: 0.8871 - 1s/epoch - 343us/step
Epoch 2/40
2969/2969 - 1s - loss: 0.7717 - val_loss: 0.8822 - 1s/epoch - 337us/step
Epoch 3/40
2969/2969 - 1s - loss: 0.7639 - val_loss: 0.8782 - 984ms/epoch - 331us/step
Epoch 4/40
2969/2969 - 1s - loss: 0.7562 - val_loss: 0.8747 - 985ms/epoch - 332us/step
Epoch 5/40
2969/2969 - 1s - loss: 0.7485 - val_loss: 0.8722 - 983ms/epoch - 331us/step
Epoch 6/40
2969/2969 - 1s - loss: 0.7409 - val_loss: 0.8685 - 987ms/epoch - 332us/step
Epoch 7/40
2969/2969 - 1s - loss: 0.7334 - val_loss: 0.8657 - 986ms/epoch - 332us/step
Epoch 8/40
2969/2969 - 1s - loss: 0.7258 - val_loss: 0.8631 - 982ms/epoch - 331us/step
Epoch 9/40
2969/2969 - 1s - loss: 0.7182 - val_loss: 0.8612 - 984ms/epoch - 331us/step
Epoch 10/40
2969/2969 - 1s - loss: 0.7106 - val_loss: 0.8584 - 984ms/epoch - 331us/step
Epoch 11/40
2969/2969 - 1s - loss: 0.7030 - val_loss: 0.8551 - 992ms/epoch - 334us/step
Epoch 12/40
2969/2969 - 1s - loss: 0.6954 - val

In [70]:
import math

min_val_loss, idx = min((val, idx) for (idx, val) in enumerate(history.history['val_loss']))
print('Minimum RMSE at epoch', '{:d}'.format(idx+1), '=', '{:.4f}'.format(math.sqrt(min_val_loss)))

Minimum RMSE at epoch 30 = 0.9102


## Predict the ratings

In [71]:
trained_model = Model(max_userid, max_movieid, K_FACTORS)
trained_model.load_weights('weights.h5')

In [72]:
def predict_rating(user_id, movie_id):
    return trained_model.rate(user_id - 1, movie_id - 1)

In [73]:
user_ratings = data[['user_id', 'item_id', 'rating']]
user_ids = data['user_id']
item_ids = data['item_id']
user_ratings['prediction'] = predict_rating(user_ids, item_ids)
user_ratings.sort_values(by='rating', 
                         ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_ratings['prediction'] = predict_rating(user_ids, item_ids)


Unnamed: 0,user_id,item_id,rating,prediction
28336,64,183,5,4.254299
16000,213,121,5,4.066990
28426,286,707,5,4.633981
38655,94,518,5,4.522512
16022,108,10,5,4.112147
...,...,...,...,...
27329,13,401,1,1.897017
27316,405,788,1,1.149107
27307,279,1266,1,2.987078
85465,68,926,1,1.298860


In [75]:
user_ratings['prediction_rounded'] = user_ratings['prediction'].round().astype(int)
user_ratings

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_ratings['prediction_rounded'] = user_ratings['prediction'].round().astype(int)


Unnamed: 0,user_id,item_id,rating,prediction,prediction_rounded
0,196,242,3,3.980704,4
1,186,302,3,3.528082,4
2,22,377,1,1.730367,2
3,244,51,2,3.465878,3
4,166,346,1,3.234185,3
...,...,...,...,...,...
99995,880,476,3,3.085451,3
99996,716,204,5,4.278119,4
99997,276,1090,1,1.817253,2
99998,13,225,2,2.290304,2


In [77]:
(user_ratings['prediction_rounded'] == user_ratings['rating']).sum() / user_ratings.shape[0]

0.52603