## Import packages

In [108]:
import os
import pandas as pd
import numpy as np
from keras.layers import Embedding, Reshape, Merge
from keras.models import Sequential
from keras.optimizers import Adamax
from keras.callbacks import EarlyStopping, ModelCheckpoint
from CFModel import CFModel

## Define constants


In [92]:
# Directory that the MovieLens 1M Dataset has been unzipped into; modify this if needed.
BASE_DIR = '.'
MOVIELENS_DIR = BASE_DIR + '/ml-1m/'

# Names of the three data files inside MOVIELENS_DIR.
USER_FILE = 'users.dat'
MOVIE_FILE = 'movies.dat'
RATING_FILE = 'ratings.dat'

K_FACTORS = 20       # third argument handed to CFModel when the models are built
RNG_SEED = 1446557   # random_state used when shuffling the ratings
TEST_USER = 3000     # userid for which recommendations are produced

# Lookup from the coded `age` column of the users file to a readable label.
AGES = {
    1: "Under 18",
    18: "18-24",
    25: "25-34",
    35: "35-44",
    45: "45-49",
    50: "50-55",
    56: "56+",
}

# Lookup from the coded `occupation` column of the users file to a readable label.
OCCUPATIONS = {
    0: "other or not specified",
    1: "academic/educator",
    2: "artist",
    3: "clerical/admin",
    4: "college/grad student",
    5: "customer service",
    6: "doctor/health care",
    7: "executive/managerial",
    8: "farmer",
    9: "homemaker",
    10: "K-12 student",
    11: "lawyer",
    12: "programmer",
    13: "retired",
    14: "sales/marketing",
    15: "scientist",
    16: "self-employed",
    17: "technician/engineer",
    18: "tradesman/craftsman",
    19: "unemployed",
    20: "writer",
}

## Load MovieLens 1M data

The MovieLens 1M Dataset can be downloaded from http://files.grouplens.org/datasets/movielens/ml-1m.zip. We load the data about users, movies and ratings into dataframes. 

A few quirks in this dataset:

* The ids for users and movies are 1-based,
* not all movies have descriptions, and 
* not all movies are rated. 

To make it easy to use series from the ratings dataframe directly as the training inputs and targets of the Keras model, we do the following:

* We set max_userid to the maximum user id in the ratings,
* we set max_movieid to the maximum movie id in the ratings, and
* we add columns (user_emb_id and movie_emb_id) whose values are the user or movie ids minus one.

In [93]:
# Read the '::'-delimited ratings file; column names are supplied explicitly.
# A multi-character separator requires pandas' python parsing engine.
ratings = pd.read_csv(os.path.join(MOVIELENS_DIR, RATING_FILE),
                      sep='::',
                      engine='python',
                      names=['userid', 'movieid', 'rating', 'timestamp'])
# Largest ids that occur in the ratings. max() is unaffected by repeated
# values, so no de-duplication pass is needed first.
max_userid = ratings['userid'].max()
max_movieid = ratings['movieid'].max()
# The ids in the file are 1-based; the *_emb_id columns hold the same ids
# shifted down by one so that they start at 0.
ratings['user_emb_id'] = ratings['userid'] - 1
ratings['movie_emb_id'] = ratings['movieid'] - 1
# Single-argument print(...) produces the same text on Python 2 and also
# runs on Python 3, where the print statement is a syntax error.
print('{} ratings loaded.'.format(len(ratings)))

1000209 ratings loaded.


In [94]:
# Read the '::'-delimited users file; column names are supplied explicitly.
users = pd.read_csv(os.path.join(MOVIELENS_DIR, USER_FILE),
                    sep='::',
                    engine='python',
                    names=['userid', 'gender', 'age', 'occupation', 'zipcode'])
# Translate the coded age / occupation values into readable labels.
# Direct dict indexing is kept so that an unknown code raises KeyError
# instead of silently producing a missing label.
users['age_desc'] = users['age'].apply(lambda code: AGES[code])
users['occ_desc'] = users['occupation'].apply(lambda code: OCCUPATIONS[code])
# Single-argument print(...) produces the same text on Python 2 and also
# runs on Python 3, where the print statement is a syntax error.
print('{} descriptions of {} users loaded.'.format(len(users), max_userid))

6040 descriptions of 6040 users loaded.


In [95]:
# Read the '::'-delimited movies file; column names are supplied explicitly.
# NOTE(review): under Python 3 an explicit encoding= may be required if the
# file is not UTF-8 — confirm against the downloaded data.
movies = pd.read_csv(os.path.join(MOVIELENS_DIR, MOVIE_FILE),
                     sep='::',
                     engine='python',
                     names=['movieid', 'title', 'genre'])
# Single-argument print(...) produces the same text on Python 2 and also
# runs on Python 3, where the print statement is a syntax error.
print('{} descriptions of {} movies loaded.'.format(len(movies), max_movieid))

3883 descriptions of 3952 movies loaded.


In [96]:
# Count the distinct user ids that appear in the ratings and compare with the
# largest user id. The single-argument print(...) form gives the same text on
# Python 2 and also runs on Python 3.
print('{} of the {} users rate at least one movie.'.format(ratings['userid'].nunique(), max_userid))

6040 of the 6040 users rate at least one movie.


In [97]:
# Count the distinct movie ids that appear in the ratings and compare with the
# largest movie id. The single-argument print(...) form gives the same text on
# Python 2 and also runs on Python 3.
print('{} of the {} movies are rated.'.format(ratings['movieid'].nunique(), max_movieid))

3706 of the 3952 movies are rated.


## Create training set

In [98]:
# Shuffle all ratings (frac=1.) with a fixed seed so the row order is
# reproducible between runs.
shuffled_ratings = ratings.sample(frac=1., random_state=RNG_SEED)

# Pull out the zero-based user / movie ids and the ratings as arrays; these
# are the inputs and target passed to model.fit later in the notebook.
Users = shuffled_ratings['user_emb_id'].values
Movies = shuffled_ratings['movie_emb_id'].values
Ratings = shuffled_ratings['rating'].values

# Each line is built as one string so that print(...) emits identical text on
# Python 2 and also runs on Python 3, where the print statement is a syntax error.
print('Users: {} , shape = {}'.format(Users, Users.shape))
print('Movies: {} , shape = {}'.format(Movies, Movies.shape))
print('Ratings: {} , shape = {}'.format(Ratings, Ratings.shape))

Users: [4403 1819 2571 ...,  354 3510  831] , shape = (1000209,)
Movies: [2717 3692  149 ..., 2700 2920 3711] , shape = (1000209,)
Ratings: [5 4 4 ..., 2 4 4] , shape = (1000209,)


## Define model

In [9]:
# Build the model to be trained. CFModel comes from the local CFModel module.
# NOTE(review): the arguments are presumably (user count, movie count, number
# of latent factors) — confirm against the CFModel definition.
model = CFModel(max_userid, max_movieid, K_FACTORS)

## Train model

In [10]:
# Two Keras callbacks: one monitors 'val_loss' with a patience of 2 epochs,
# the other writes the weights to 'movie_weights.h5' with save_best_only=True.
callbacks = [
    EarlyStopping('val_loss', patience=2),
    ModelCheckpoint('movie_weights.h5', save_best_only=True),
]
# Fit on the shuffled (user, movie) -> rating arrays for up to 15 epochs,
# reserving a 0.1 fraction of the samples for validation.
model.fit(
    [Users, Movies],
    Ratings,
    nb_epoch=15,
    validation_split=.1,
    verbose=2,
    callbacks=callbacks,
)

Train on 900188 samples, validate on 100021 samples
Epoch 1/15
574s - loss: 10.8678 - val_loss: 5.1730
Epoch 2/15
626s - loss: 3.1265 - val_loss: 2.0450
Epoch 3/15
607s - loss: 1.6241 - val_loss: 1.3690
Epoch 4/15
610s - loss: 1.2147 - val_loss: 1.1233
Epoch 5/15
608s - loss: 1.0466 - val_loss: 1.0085
Epoch 6/15
620s - loss: 0.9642 - val_loss: 0.9483
Epoch 7/15
621s - loss: 0.9191 - val_loss: 0.9148
Epoch 8/15
617s - loss: 0.8930 - val_loss: 0.8943
Epoch 9/15
610s - loss: 0.8765 - val_loss: 0.8805
Epoch 10/15
643s - loss: 0.8658 - val_loss: 0.8719
Epoch 11/15
767s - loss: 0.8580 - val_loss: 0.8667
Epoch 12/15
641s - loss: 0.8525 - val_loss: 0.8602
Epoch 13/15
628s - loss: 0.8475 - val_loss: 0.8556
Epoch 14/15
626s - loss: 0.8431 - val_loss: 0.8528
Epoch 15/15
621s - loss: 0.8388 - val_loss: 0.8482


<keras.callbacks.History at 0x1143d9850>

## Use model to make recommendations for a given user

In [109]:
# Create a fresh CFModel with the same constructor arguments as the model that
# was trained; its weights are loaded from disk in the following cell.
trained_model = CFModel(max_userid, max_movieid, K_FACTORS)

In [112]:
# Load weights from 'movie_weights.h5', the same path the ModelCheckpoint
# callback writes to during training.
trained_model.load_weights('movie_weights.h5')

In [113]:
# Show the descriptive columns of the test user's row in one .loc selection.
users.loc[users['userid'] == TEST_USER,
          ['userid', 'gender', 'age_desc', 'occ_desc', 'zipcode']]

Unnamed: 0,userid,gender,age_desc,occ_desc,zipcode
2999,3000,M,25-34,college/grad student,55408


In [118]:
def predict_rating(userid, movieid):
    """Return ``trained_model``'s predicted rating for a user/movie pair.

    Both arguments are the 1-based ids used in the data files; each is
    reduced by one before being passed to ``trained_model.rate``.
    """
    user_index = userid - 1
    movie_index = movieid - 1
    return trained_model.rate(user_index, movie_index)

In [119]:
# Select the test user's ratings in a single .loc step and take an explicit
# copy: adding a column to a chained-indexing slice of `ratings` can trigger
# pandas' SettingWithCopyWarning.
user_ratings = ratings.loc[ratings['userid'] == TEST_USER,
                           ['userid', 'movieid', 'rating']].copy()
# Predict a rating for every movie this user has actually rated.
user_ratings['prediction'] = user_ratings.apply(
    lambda row: predict_rating(TEST_USER, row['movieid']), axis=1)
# Sort by the actual rating (highest first), attach title / genre from
# `movies`, and display the first ten rows.
user_ratings.sort_values(by='rating',
                         ascending=False).merge(movies,
                                                on='movieid',
                                                how='inner',
                                                suffixes=['_u', '_m']).head(10)

Unnamed: 0,userid,movieid,rating,prediction,title,genre
0,3000,590,5,3.532443,Dances with Wolves (1990),Adventure|Drama|Western
1,3000,3552,5,3.498902,Caddyshack (1980),Comedy
2,3000,2858,5,3.89931,American Beauty (1999),Comedy|Drama
3,3000,3358,5,3.38424,Defending Your Life (1991),Comedy|Romance
4,3000,2968,5,3.309658,Time Bandits (1981),Adventure|Fantasy|Sci-Fi
5,3000,1307,5,3.673921,When Harry Met Sally... (1989),Comedy|Romance
6,3000,144,5,3.133694,"Brothers McMullen, The (1995)",Comedy
7,3000,1193,5,3.899977,One Flew Over the Cuckoo's Nest (1975),Drama
8,3000,1265,5,3.566049,Groundhog Day (1993),Comedy|Romance
9,3000,733,5,3.250357,"Rock, The (1996)",Action|Adventure|Thriller


In [120]:
# Candidate movies: every movie id present in `ratings` that the test user has
# not rated. The mask is negated with `~` rather than compared to False, and
# the de-duplicated frame is copied so the column assignment below works on an
# independent frame rather than on a slice of `ratings`.
recommendations = (ratings.loc[~ratings['movieid'].isin(user_ratings['movieid']),
                               ['movieid']]
                   .drop_duplicates()
                   .copy())
# Predict the test user's rating for each candidate movie.
recommendations['prediction'] = recommendations.apply(
    lambda row: predict_rating(TEST_USER, row['movieid']), axis=1)
# Sort by predicted rating (highest first), attach title / genre from
# `movies`, and display the first ten rows.
recommendations.sort_values(by='prediction',
                            ascending=False).merge(movies,
                                                   on='movieid',
                                                   how='inner',
                                                   suffixes=['_u', '_m']).head(10)

Unnamed: 0,movieid,prediction,title,genre
0,2905,4.133867,Sanjuro (1962),Action|Adventure
1,318,4.098945,"Shawshank Redemption, The (1994)",Drama
2,858,4.08676,"Godfather, The (1972)",Action|Crime|Drama
3,668,4.066997,Pather Panchali (1955),Drama
4,3022,4.058833,"General, The (1927)",Comedy
5,3030,4.055278,Yojimbo (1961),Comedy|Drama|Western
6,1178,4.045434,Paths of Glory (1957),Drama|War
7,922,4.045026,Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),Film-Noir
8,1148,4.04131,"Wrong Trousers, The (1993)",Animation|Comedy
9,50,4.040413,"Usual Suspects, The (1995)",Crime|Thriller
