# Movie recommendation on Amazon SageMaker with Neural Networks and MXNet

### Step SM1: Download ml-100k data  
***The data sets are needed to train our Neural Network. We use the 100,000 movie ratings given by users from MovieLens data sets.***

#####  The data sets are needed to train our Factorization Machine. We use the 100,000 movie ratings given by users from MovieLens data sets.

In [24]:
!pip install pydot



In [2]:
import os
import tensorflow as tf
#from mxnet import gluon, nd, ndarray

import pandas as pd
import numpy as np




In [3]:
%%time
import sagemaker
from sagemaker import get_execution_role

role = get_execution_role()
print(role)
sess = sagemaker.Session()
s3_bucket = sess.default_bucket()
s3_prefix = 'movielens'

arn:aws:iam::547292691214:role/service-role/AmazonSageMaker-ExecutionRole-endtoendml
CPU times: user 575 ms, sys: 56.1 ms, total: 631 ms
Wall time: 2.7 s


In [4]:
s3_bucket = sess.default_bucket()
s3_prefix = 'movielens'

### Data Information
*ua.base : data for training*  
*ua.test : data for test/validation*  
*Headers/columns :* ***user id | item id | rating (1-5) | timestamp***

In [6]:
!wget -N http://files.grouplens.org/datasets/movielens/ml-100k.zip

import shutil
shutil.unpack_archive('ml-100k.zip', '.')
data = pd.read_csv('./ml-100k/u.data', sep='\t', names=['USER_ID', 'ITEM_ID', 'RATING', 'TIMESTAMP'])
pd.set_option('display.max_rows', 5)
data

--2021-04-14 13:52:50--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 304 Not Modified
File ‘ml-100k.zip’ not modified on server. Omitting download.



Unnamed: 0,USER_ID,ITEM_ID,RATING,TIMESTAMP
0,196,242,3,881250949
1,186,302,3,891717742
...,...,...,...,...
99998,13,225,2,882399156
99999,12,203,3,879959583


In [7]:
train_df = pd.read_csv('./ml-100k/ua.base', sep='\t', names=['USER_ID', 'ITEM_ID', 'RATING', 'TIMESTAMP'])
test_df = pd.read_csv('./ml-100k/ua.test', sep='\t', names=['USER_ID', 'ITEM_ID', 'RATING', 'TIMESTAMP'])
pd.set_option('display.max_rows', 5)
train_df

Unnamed: 0,USER_ID,ITEM_ID,RATING,TIMESTAMP
0,1,1,5,874965758
1,1,2,3,876893171
...,...,...,...,...
90568,943,1228,3,888640275
90569,943,1330,3,888692465


In [8]:
movies_df=pd.read_csv('./ml-100k/u.item', sep='|', names=['item_id','title','release_date','video_release_date','imdb_url','UNKOWN','Action','Adventure','Animation','Children','Comedy','Crime','Documentary','Drama','Fantasy','Noir','Horror','Musical','Mystery','Romance','SciFi','Thriller','War','Western'],encoding='latin-1')
movies_df

Unnamed: 0,item_id,title,release_date,video_release_date,imdb_url,UNKOWN,Action,Adventure,Animation,Children,...,Fantasy,Noir,Horror,Musical,Mystery,Romance,SciFi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1681,1682,Scream of Stone (Schrei aus Stein) (1991),08-Mar-1996,,http://us.imdb.com/M/title-exact?Schrei%20aus%...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
def max_id(fname):
    mu = 0
    mi = 0
    with open(fname) as f:
        for line in f:
            tks = line.strip().split('\t')
            if len(tks) != 4:
                continue
            mu = max(mu, int(tks[0]))
            mi = max(mi, int(tks[1]))
    return mu + 1, mi + 1
max_users, max_items = max_id('./ml-100k/ua.base')
(max_users, max_items)

(944, 1683)

## Prepare data to push to S3

In [10]:
X_train=train_df[['USER_ID','ITEM_ID']].values
y_train=train_df[['RATING']].values

In [11]:
X_test=test_df[['USER_ID','ITEM_ID']].values
y_test=test_df[['RATING']].values

In [12]:
!mkdir ./data ./data/train ./data/test

In [13]:
np.save('./data/train/train_X.npy', X_train)
np.save('./data/train/train_Y.npy', y_train)
np.save('./data/test/test_X.npy', X_test)
np.save('./data/test/test_Y.npy', y_test)

In [14]:
traindata_s3_prefix = '{}/data/train'.format(s3_prefix)
testdata_s3_prefix = '{}/data/test'.format(s3_prefix)
output_s3 = 's3://{}/{}/models/'.format(s3_bucket, s3_prefix)
code_location_s3 = 's3://{}/{}/codes'.format(s3_bucket, s3_prefix)

In [15]:
train_s3 = sess.upload_data(path='./data/train/', bucket=s3_bucket, key_prefix=traindata_s3_prefix)
test_s3 = sess.upload_data(path='./data/test/', bucket=s3_bucket, key_prefix=testdata_s3_prefix)

In [16]:
inputs = {'train':train_s3, 'test': test_s3}

## Network Architecture

In [17]:
import keras
from IPython.display import SVG
from keras.optimizers import Adam
from keras.utils.vis_utils import model_to_dot
n_latent_factors = 32

Using TensorFlow backend.


We start with a linear model architecture. This is essentially the same exercise as learning two matrices U,I such that UxI = R and we are back to the original Matrix Facotization problem. We have 3 components, user and item embeddings and a final dot product to evaluate our predicted ratings.

In [18]:
def lin_net():
    movie_input = keras.layers.Input(shape=[1],name='Item')
    movie_embedding = keras.layers.Embedding(max_items, n_latent_factors, name='Movie-Embedding')(movie_input)
    movie_vec = keras.layers.Flatten(name='FlattenMovies')(movie_embedding)

    user_input = keras.layers.Input(shape=[1],name='User')
    user_vec = keras.layers.Flatten(name='FlattenUsers')(keras.layers.Embedding(max_users, n_latent_factors,name='User-Embedding')(user_input))

    prod = keras.layers.dot([movie_vec, user_vec],axes=1,name='DotProduct')
    model = keras.Model([user_input, movie_input], prod)
    model.compile('adam', 'mean_squared_error')
    return model

In [25]:
net=lin_net()
SVG(model_to_dot(net,  show_shapes=True, show_layer_names=True, rankdir='HB').create(prog='dot', format='svg'))

ImportError: Failed to import `pydot`. Please install `pydot`. For example with `pip install pydot`.

In [26]:
model=lin_net()
model.summary()
model.fit([X_train[:,0], X_train[:,1]], y_train, epochs=10, verbose=1)

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Item (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
User (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
Movie-Embedding (Embedding)     (None, 1, 32)        53856       Item[0][0]                       
__________________________________________________________________________________________________
User-Embedding (Embedding)      (None, 1, 32)        30208       User[0][0]                       
____________________________________________________________________________________________

<keras.callbacks.callbacks.History at 0x7eff6865f518>

## Evaluate

In [27]:
predictions = model.predict([X_test[:,0],X_test[:,1]])

In [28]:
predictions

array([[3.7902477],
       [3.4289477],
       [4.22429  ],
       ...,
       [2.740484 ],
       [2.8332167],
       [3.89367  ]], dtype=float32)

In [29]:
model.predict([X_test[0:1,0],X_test[0:1,1]])

array([[3.7902477]], dtype=float32)

In [30]:
user_nb=3
movie_nb=45
model.predict([np.array([user_nb]),np.array([movie_nb])])

array([[3.958324]], dtype=float32)

## Display recommendations

In [31]:
movies={}
for index, row in movies_df.iterrows():
    movies[int(row['item_id'])]= row['title']

In [32]:
user_nb=3
score_threshold=2
maximum_recommendations=20

In [33]:
recommended_movies=[]
for movieId in range(max_items):
    result_score=model.predict([np.array([user_nb]),np.array([movieId])])
    if result_score > score_threshold:
        recommended_movies.append([int(movieId),result_score])

In [34]:
def getVal(item):
    return item[1]
recommended_movies=sorted(recommended_movies,key=getVal,reverse=True)

In [35]:
!pip install tabulate
import tabulate
from IPython.display import HTML, display



In [36]:
output_table = [['<strong>Movie Title</strong>','<strong>Score</strong>']]
for i in range(min(maximum_recommendations,len(recommended_movies))):
    output_table.append([movies[int(recommended_movies[i][0])],recommended_movies[i][1][0][0]])

display(HTML(tabulate.tabulate(output_table, tablefmt='html')))

0,1
<strong>Movie Title</strong>,<strong>Score</strong>
Santa with Muscles (1996),4.19937801361084
"Boys, Les (1997)",4.090014457702637
"Bridge on the River Kwai, The (1957)",4.011457443237305
Pather Panchali (1955),3.960461378097534
Eat Drink Man Woman (1994),3.9583239555358887
Nikita (La Femme Nikita) (1990),3.8878262042999268
"Grand Day Out, A (1992)",3.8774654865264893
Cinema Paradiso (1988),3.867865562438965
Raise the Red Lantern (1991),3.85410737991333


## Train model on Sagemaker

In [44]:
from sagemaker.tensorflow import TensorFlow as TensorFlowEstimator

In [49]:
estimator = TensorFlowEstimator(
    framework_version="1.14",
    py_version="py3",
    instance_count=1,
    instance_type="ml.p3.2xlarge",
    role=role,
    entry_point="keras_rec.py",
    source_dir=".",
    distributions={ "parameter_server": { "enabled": True } },
    hyperparameters={'batch-size': 32,'epochs': 10,'learning-rate': 0.1,'embedding-size':32,'max-users':max_users,'max-items':max_items},
    base_job_name="rec-keras",
    max_run=20*60,  # Maximum allowed active runtime
    use_spot_instances=True,  # Use spot instances to reduce cost
    max_wait=30*60,  # Maximum clock time (including spot delays)
)

estimator.fit(inputs)


distributions has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


2021-04-14 14:19:27 Starting - Starting the training job...
2021-04-14 14:19:50 Starting - Launching requested ML instancesProfilerReport-1618409960: InProgress
......
2021-04-14 14:20:54 Starting - Preparing the instances for training.........
2021-04-14 14:22:10 Downloading - Downloading input data...
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])[0m
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])[0m
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])[0m
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])[0m
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])[0m
  np_resource = np.dtype([("resource", np.ubyte, 1)])[0m
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])[0m
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])[0m
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])[0m
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])[0m
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])[0m
  np_resource = np.dtype([("resource", np.ubyte, 1)])[0m
  _np_qint8 = np.dtype([("qin

UnexpectedStatusException: Error for Training job rec-keras-2021-04-14-14-19-19-642: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
Command "/usr/local/bin/python3.6 keras_rec.py --batch-size 32 --embedding-size 32 --epochs 10 --learning-rate 0.1 --max-items 1683 --max-users 944 --model_dir s3://sagemaker-ap-southeast-1-547292691214/rec-keras-2021-04-14-14-19-19-642/model"

## Non-linear Architecture

In [37]:
def nl_net():
    movie_input = keras.layers.Input(shape=[1],name='Item')
    movie_embedding = keras.layers.Embedding(max_items, n_latent_factors, name='Movie-Embedding')(movie_input)
    movie_vec = keras.layers.Flatten(name='FlattenMovies')(movie_embedding)
    

    user_input = keras.layers.Input(shape=[1],name='User')
    user_vec = keras.layers.Flatten(name='FlattenUsers')(keras.layers.Embedding(max_users, n_latent_factors,name='User-Embedding')(user_input))

    concat = keras.layers.concatenate([movie_vec, user_vec],name='Concat')
    concat_dropout = keras.layers.Dropout(0.2)(concat)
    dense = keras.layers.Dense(200,name='FullyConnected', activation='relu')(concat)
    dropout_1 = keras.layers.Dropout(0.2,name='Dropout')(dense)
    dense_2 = keras.layers.Dense(100,name='FullyConnected-1',activation='relu')(concat)
    dropout_2 = keras.layers.Dropout(0.2,name='Dropout')(dense_2)
    dense_3 = keras.layers.Dense(50,name='FullyConnected-2', activation='relu')(dense_2)
    dropout_3 = keras.layers.Dropout(0.2,name='Dropout')(dense_3)
    dense_4 = keras.layers.Dense(20,name='FullyConnected-3', activation='relu')(dense_3)
        
    result = keras.layers.Dense(1, activation='relu',name='Activation')(dense_4)
    adam = Adam(lr=0.005)
    model = keras.Model([user_input, movie_input], result)
    model.compile(optimizer=adam,loss= 'mean_absolute_error')
    return model

In [38]:
net=nl_net()
SVG(model_to_dot(net,  show_shapes=True, show_layer_names=True, rankdir='HB').create(prog='dot', format='svg'))

ImportError: Failed to import `pydot`. Please install `pydot`. For example with `pip install pydot`.

In [39]:
model=nl_net()
model.summary()
model.fit([X_train[:,0], X_train[:,1]], y_train, epochs=10, verbose=1)

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Item (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
User (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
Movie-Embedding (Embedding)     (None, 1, 32)        53856       Item[0][0]                       
__________________________________________________________________________________________________
User-Embedding (Embedding)      (None, 1, 32)        30208       User[0][0]                       
____________________________________________________________________________________________

<keras.callbacks.callbacks.History at 0x7eff68172e80>

In [35]:
predictions = model.predict([X_test[:,0],X_test[:,1]])

In [36]:
predictions

array([[3.9099874],
       [3.181464 ],
       [3.9850137],
       ...,
       [2.990262 ],
       [2.990262 ],
       [3.8768935]], dtype=float32)

## Train on Sagemaker

In [37]:
mxnet_estimator = MXNet('keras_rec_nl.py',
                        role=role,
                        train_instance_type='ml.c4.xlarge',
                        train_instance_count=1,
                        framework_version='1.4.1',
                        py_version='py3',
                        hyperparameters={'batch-size': 32,'epochs': 10,'learning-rate': 0.1,'embedding-size':32,'max-users':max_users,'max-items':max_items})
mxnet_estimator.fit(inputs)

2019-10-29 17:59:08 Starting - Starting the training job...
2019-10-29 17:59:12 Starting - Launching requested ML instances......
2019-10-29 18:00:14 Starting - Preparing the instances for training...
2019-10-29 18:01:06 Downloading - Downloading input data...
2019-10-29 18:01:37 Training - Training image download completed. Training in progress..[31m2019-10-29 18:01:38,218 sagemaker-containers INFO     Imported framework sagemaker_mxnet_container.training[0m
[31m2019-10-29 18:01:38,220 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-10-29 18:01:38,234 sagemaker_mxnet_container.training INFO     MXNet training environment: {'SM_HOSTS': '["algo-1"]', 'SM_NETWORK_INTERFACE_NAME': 'eth0', 'SM_HPS': '{"batch-size":32,"embedding-size":32,"epochs":10,"learning-rate":0.1,"max-items":1683,"max-users":944}', 'SM_USER_ENTRY_POINT': 'keras_rec_nl.py', 'SM_FRAMEWORK_PARAMS': '{}', 'SM_RESOURCE_CONFIG': '{"current_host":"algo-1","hosts":["algo-1"],"netw

In [53]:
fm_predictor = mxnet_estimator.deploy(instance_type='ml.t2.medium', initial_instance_count=1)

--------------------------------------------------------------------------------------------------!

## Evaluate

In [38]:
predictions = model.predict([X_test[:,0],X_test[:,1]])

In [39]:
predictions

array([[3.9099874],
       [3.181464 ],
       [3.9850137],
       ...,
       [2.990262 ],
       [2.990262 ],
       [3.8768935]], dtype=float32)

In [40]:
user_nb=3
movie_nb=45
model.predict([np.array([user_nb]),np.array([movie_nb])])

array([[3.5918612]], dtype=float32)

## Display recommendations

In [41]:
user_nb=3
score_threshold=2
maximum_recommendations=20

In [42]:
recommended_movies=[]
for movieId in range(max_items):
    result_score=model.predict([np.array([user_nb]),np.array([movieId])])
    if result_score > score_threshold:
        recommended_movies.append([int(movieId),result_score])

In [43]:
def getVal(item):
    return item[1]
recommended_movies=sorted(recommended_movies,key=getVal,reverse=True)

In [44]:
output_table = [['<strong>Movie Title</strong>','<strong>Score</strong>']]
for i in range(min(maximum_recommendations,len(recommended_movies))):
    output_table.append([movies[int(recommended_movies[i][0])],recommended_movies[i][1][0][0]])

display(HTML(tabulate.tabulate(output_table, tablefmt='html')))

0,1
Movie Title,Score
"Saint of Fort Washington, The (1993)",4.846966743469238
Anna (1996),4.432531356811523
Santa with Muscles (1996),4.310690879821777
Prefontaine (1997),4.269065856933594
Schindler's List (1993),4.258083343505859
Casablanca (1942),4.215717315673828
"Usual Suspects, The (1995)",4.198036193847656
"Princess Bride, The (1987)",4.177254676818848
Pather Panchali (1955),4.174709320068359


***Compare the recommendation with the top 20 movies that are actually rated by that particular user, sorted from the highest rating***

In [49]:
nbUsers=943
nbMovies=1682
nbFeatures=nbUsers+nbMovies

nbRatingsTrain=90570
nbRatingsTest=9430

In [51]:
import csv
moviesByUser = {}
for userId in range(nbUsers):
    moviesByUser[str(userId)]=[]
 
with open('./ml-100k/ua.base','r') as f:
    samples=csv.reader(f,delimiter='\t')
    for userId,movieId,rating,timestamp in samples:
        moviesByUser[str(int(userId)-1)].append([int(movieId)-1,rating]) 

In [52]:
def find_top_rated_movies(user_id, k):
    rated_movies = moviesByUser[str(int(user_id)-1)]
    rated_movies = sorted(rated_movies,key=getVal,reverse=True)
    results = []
    
    for movie in rated_movies:
        results.append([movies[int(movie[0]+1)],movie[1]])
    return results[0:k]

output_table = [['<strong>Movie Title</strong>','<strong>Actual Rating</strong>']]
for m in find_top_rated_movies(user_nb,20):
    output_table.append(m)

display(HTML(tabulate.tabulate(output_table, tablefmt='html')))


0,1
Movie Title,Actual Rating
Paradise Lost: The Child Murders at Robin Hood Hills (1996),5
Mother (1996),5
Boogie Nights (1997),5
Jackie Brown (1997),5
Wag the Dog (1997),5
Return of the Jedi (1983),4
Event Horizon (1997),4
Schindler's List (1993),4
Cop Land (1997),4
