# Wilson's Morning Wake Up Playlist Generator, Modeling and Learning

## AWS SageMaker

The following steps will be executed:

* Upload your data to S3.
* Define a benchmark and candidate models and training scripts
* Train models and deploy.
* Evaluate deployed estimator.

In [1]:
# Defaults
import os
import sys

import pandas as pd
import numpy as np

from scipy.spatial.distance import cdist
from tqdm.notebook import tqdm
from sklearn.externals import joblib

import torch
import torch.optim as optim

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from mpl_toolkits import mplot3d

import numpy as np
import matplotlib.pyplot as plt

In [2]:
import boto3
import sagemaker

In [3]:
# session and role
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# create an S3 bucket
bucket = sagemaker_session.default_bucket()

In [4]:
!ls -la data

total 648
drwxrwxr-x  3 ec2-user ec2-user   4096 Apr 15 05:22 .
drwxrwxr-x 10 ec2-user ec2-user   4096 Apr 15 10:36 ..
drwxrwxr-x  2 ec2-user ec2-user   4096 Apr 15 05:22 .ipynb_checkpoints
-rw-rw-r--  1 ec2-user ec2-user 159916 Apr 15 00:51 tensor_train.csv
-rw-rw-r--  1 ec2-user ec2-user  28467 Mar  4 23:01 test.csv
-rw-rw-r--  1 ec2-user ec2-user 186224 Apr 15 00:51 train.csv
-rw-rw-r--  1 ec2-user ec2-user 101182 Apr 15 00:51 wmw.csv
-rw-rw-r--  1 ec2-user ec2-user 166967 Apr 15 00:51 wmw_tracks.csv


## Upload your training data to S3

In [5]:
# should be the name of directory you created to save your features data
data_dir = 'data'

In [6]:
# set prefix, a descriptive name for a directory  
prefix = 'sagemaker/wmw_deep_playlist_generator'

# upload all data to S3
input_data = sagemaker_session.upload_data(path=data_dir, bucket=bucket, key_prefix=prefix)

In [7]:
# confirm that data is in S3 bucket
empty_check = []
for obj in boto3.resource('s3').Bucket(bucket).objects.all():
    empty_check.append(obj.key)
    print(obj.key)

assert len(empty_check) !=0, 'S3 bucket is empty.'
print('Test passed!')

pytorch-training-2020-04-15-06-10-59-220/source/sourcedir.tar.gz
pytorch-training-2020-04-15-06-24-29-105/source/sourcedir.tar.gz
pytorch-training-2020-04-15-06-50-28-851/source/sourcedir.tar.gz
pytorch-training-2020-04-15-06-56-56-010/source/sourcedir.tar.gz
pytorch-training-2020-04-15-07-02-05-077/source/sourcedir.tar.gz
pytorch-training-2020-04-15-07-11-18-087/source/sourcedir.tar.gz
pytorch-training-2020-04-15-09-14-29-568/source/sourcedir.tar.gz
pytorch-training-2020-04-15-09-19-54-968/source/sourcedir.tar.gz
pytorch-training-2020-04-15-09-20-43-042/model.tar.gz
pytorch-training-2020-04-15-09-20-43-042/source/sourcedir.tar.gz
pytorch-training-2020-04-15-09-35-00-735/source/sourcedir.tar.gz
pytorch-training-2020-04-15-09-43-29-339/model.tar.gz
pytorch-training-2020-04-15-09-43-29-339/source/sourcedir.tar.gz
pytorch-training-2020-04-15-09-58-47-352/source/sourcedir.tar.gz
pytorch-training-2020-04-15-10-09-00-811/model.tar.gz
pytorch-training-2020-04-15-10-09-00-811/source/sourcedir.

sagemaker/wmw_deep_playlist_generator/pytorch-training-2020-04-15-07-02-05-077/debug-output/events/000000034000/000000034000_worker_0.tfevents
sagemaker/wmw_deep_playlist_generator/pytorch-training-2020-04-15-07-02-05-077/debug-output/events/000000034500/000000034500_worker_0.tfevents
sagemaker/wmw_deep_playlist_generator/pytorch-training-2020-04-15-07-02-05-077/debug-output/events/000000035000/000000035000_worker_0.tfevents
sagemaker/wmw_deep_playlist_generator/pytorch-training-2020-04-15-07-02-05-077/debug-output/events/000000035500/000000035500_worker_0.tfevents
sagemaker/wmw_deep_playlist_generator/pytorch-training-2020-04-15-07-02-05-077/debug-output/events/000000036000/000000036000_worker_0.tfevents
sagemaker/wmw_deep_playlist_generator/pytorch-training-2020-04-15-07-02-05-077/debug-output/events/000000036500/000000036500_worker_0.tfevents
sagemaker/wmw_deep_playlist_generator/pytorch-training-2020-04-15-07-02-05-077/debug-output/events/000000037000/000000037000_worker_0.tfevents

sagemaker/wmw_deep_playlist_generator/pytorch-training-2020-04-15-09-20-43-042/debug-output/events/000000087000/000000087000_worker_0.tfevents
sagemaker/wmw_deep_playlist_generator/pytorch-training-2020-04-15-09-20-43-042/debug-output/events/000000087500/000000087500_worker_0.tfevents
sagemaker/wmw_deep_playlist_generator/pytorch-training-2020-04-15-09-20-43-042/debug-output/events/000000088000/000000088000_worker_0.tfevents
sagemaker/wmw_deep_playlist_generator/pytorch-training-2020-04-15-09-20-43-042/debug-output/events/000000088500/000000088500_worker_0.tfevents
sagemaker/wmw_deep_playlist_generator/pytorch-training-2020-04-15-09-20-43-042/debug-output/events/000000089000/000000089000_worker_0.tfevents
sagemaker/wmw_deep_playlist_generator/pytorch-training-2020-04-15-09-20-43-042/debug-output/events/000000089500/000000089500_worker_0.tfevents
sagemaker/wmw_deep_playlist_generator/pytorch-training-2020-04-15-09-20-43-042/debug-output/events/000000090000/000000090000_worker_0.tfevents

sagemaker/wmw_deep_playlist_generator/pytorch-training-2020-04-15-10-09-00-811/debug-output/index/000000029/000000029000_worker_0.json
sagemaker/wmw_deep_playlist_generator/pytorch-training-2020-04-15-10-09-00-811/debug-output/index/000000029/000000029500_worker_0.json
sagemaker/wmw_deep_playlist_generator/pytorch-training-2020-04-15-10-09-00-811/debug-output/index/000000030/000000030000_worker_0.json
sagemaker/wmw_deep_playlist_generator/pytorch-training-2020-04-15-10-09-00-811/debug-output/index/000000030/000000030500_worker_0.json
sagemaker/wmw_deep_playlist_generator/pytorch-training-2020-04-15-10-09-00-811/debug-output/index/000000031/000000031000_worker_0.json
sagemaker/wmw_deep_playlist_generator/pytorch-training-2020-04-15-10-09-00-811/debug-output/index/000000031/000000031500_worker_0.json
sagemaker/wmw_deep_playlist_generator/pytorch-training-2020-04-15-10-09-00-811/debug-output/index/000000032/000000032000_worker_0.json
sagemaker/wmw_deep_playlist_generator/pytorch-training-

---

# Modeling

It's time to define and train the models!

---

## Complete a training script 

To implement a custom estimator, I need to complete a `train.py` script. 

A typical training script:
* Loads training data from a specified directory
* Parses any training & model hyperparameters (ex. nodes in a neural network, training epochs, etc.)
* Instantiates a model of your design, with any specified hyperparams
* Trains that model 
* Finally, saves the model so that it can be hosted/deployed, later

### Defining and training a model

To complete a `train.py` file, you will:
1. Import any extra libraries you need
2. Define any additional model training hyperparameters using `parser.add_argument`
2. Define a model in the `if __name__ == '__main__':` section
3. Train the model in that same section


In [8]:
# Directory of train.py
!pygmentize model/LSTM_Train.py

Error: cannot read infile: [Errno 2] No such file or directory: 'model/LSTM_Train.py'


---
# Create an Estimator

When a custom model is constructed in SageMaker, an entry point must be specified. This is the Python file which will be executed when the model is trained; the `train.py` function you specified above. To run a custom training script in SageMaker, construct an estimator, and fill in the appropriate constructor arguments:

* **entry_point**: The path to the Python script SageMaker runs for training and prediction.
* **source_dir**: The path to the training script directory `source_sklearn` OR `source_pytorch`.
* **entry_point**: The path to the Python script SageMaker runs for training and prediction.
* **source_dir**: The path to the training script directory `train_sklearn` OR `train_pytorch`.
* **entry_point**: The path to the Python script SageMaker runs for training.
* **source_dir**: The path to the training script directory `train_sklearn` OR `train_pytorch`.
* **role**: Role ARN, which was specified, above.
* **train_instance_count**: The number of training instances (should be left at 1).
* **train_instance_type**: The type of SageMaker instance for training. Note: Because Scikit-learn does not natively support GPU training, Sagemaker Scikit-learn does not currently support training on GPU instance types.
* **sagemaker_session**: The session used to train on Sagemaker.
* **hyperparameters** (optional): A dictionary `{'name':value, ..}` passed to the train function as hyperparameters.

Note: For a PyTorch model, there is another optional argument **framework_version**, which you can set to the latest version of PyTorch, `1.0`.

## Define PyTorch estimators

### Test run of benchmark (RNN) and candidate model (LSTM) and train components
Here I will see if the configurations I have set work accordingly with no errors. Once it runs smoothly, I will instantiate an estimator using the Sagemaker API.

In [9]:
from model import PlaylistDataset

dataset =  PlaylistDataset.PlaylistDataset(data_dir, "tensor_train.csv")

dataloader = DataLoader(dataset, batch_size=12, shuffle=False)

In [10]:
# Training function for LSTM
def train_lstm(model, train_loader, epochs, criterion, optimizer, device):
    """
    This is the training method that is called by the PyTorch training script of the LSTM model. The parameters
    passed are as follows:
    model        - The PyTorch model that we wish to train.
    train_loader - The PyTorch DataLoader that should be used during training.
    epochs       - The total number of epochs to train for.
    criterion    - The loss function used for training. 
    optimizer    - The optimizer to use during training.
    device       - Where the model and data should be loaded (gpu or cpu).
    """
    
    model.train() # Make sure that the model is in training mode.
    
    # training loop is provided
    for epoch in range(1, epochs + 1):
        
        for i, batch in enumerate(train_loader):

            optimizer.zero_grad()

            cum_loss = 0
            
            hidden_cell = model.init_hidden()
            
            for i, track in enumerate(batch):
                
                track_x = track[0]
                track_y = track[-1]
                
                output, hidden_cell = model(track_x.unsqueeze(0), hidden_cell)
                
                loss = criterion(output.squeeze(0), track_y)
                loss.backward(retain_graph=True)
                optimizer.step()
                cum_loss += loss.data.item()

            total_loss = cum_loss / len(batch[0])
            
        if epoch % 50 == 0:
            print('Epoch: {}/{}.............'.format(epoch, epochs), end=' ')
            print("Loss: {:.4f}".format(total_loss))

In [11]:
from model.LstmEstimator import LstmEstimator

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lstm_model = LstmEstimator(9, 30, 1, 9)
optimizer = optim.Adam(lstm_model.parameters(), lr=0.001)
loss_fn = torch.nn.L1Loss()

train_lstm(lstm_model, dataloader, 50, loss_fn, optimizer, device)

Epoch: 50/50............. Loss: 0.1057


In [12]:
# Save LSTM
# torch.save(lstm_model.state_dict(), 'artefacts/lstm_model.pth')

In [13]:
# Training function for RNN
def train_rnn(model, dataloader, epochs, criterion, optimizer, device):
    
    model.train() # Make sure that the model is in training mode.
    
    for epoch in range(1, epochs + 1):

        for i, batch in enumerate(dataloader):
            
            cum_loss = 0
            
            hidden = model.initHidden()
            
            optimizer.zero_grad()
        
            for i, track in enumerate(batch):

                track_x = track[0]
                track_y = track[-1]
                
                output, hidden = model(track_x.unsqueeze(0), hidden)
            
                loss = criterion(output.squeeze(0), track_y)
                loss.backward()
                optimizer.step()
                cum_loss += loss.data.item()
                
            total_loss = cum_loss / len(batch[0])

        if epoch % 50 == 0:
            print('Epoch: {}/{}.............'.format(epoch, epochs), end=' ')
            print("Loss: {:.4f}".format(total_loss))

In [14]:
from model.RnnEstimator import RnnEstimator

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
rnn_model = RnnEstimator(9, 30, 9)
optimizer = optim.Adam(rnn_model.parameters(), lr=0.001)
loss_fn = torch.nn.L1Loss()

train_rnn(rnn_model, dataloader, 50, loss_fn, optimizer, device)

Epoch: 50/50............. Loss: 0.1746


In [15]:
# Save RNN
# torch.save(rnn_model.state_dict(), 'artefacts/rnn_model.pth')

### Build and Train the PyTorch Model with Hyperparameter Tuning

In [13]:
# Estimator code
from sagemaker.pytorch import PyTorch

output_path = 's3://{}/{}'.format(bucket, prefix)

lstm_estimator = PyTorch(entry_point="LstmTrain.py",
                    source_dir="model",
                    role=role,
                    framework_version='1.4.0',
                    train_instance_count=1,
                    output_path = output_path,
                    train_instance_type='ml.m4.xlarge',
                    hyperparameters={
                        'input_features': 9,
                        'hidden_dim': 30,
                        'hidden_layers': 1,
                        'output_dim': 9,
                        'epochs': 1500
                    })

In [17]:
# Fit estimator
lstm_estimator.fit({'train': input_data})

2020-04-15 10:09:01 Starting - Starting the training job...
2020-04-15 10:09:03 Starting - Launching requested ML instances...
2020-04-15 10:10:01 Starting - Preparing the instances for training......
2020-04-15 10:10:57 Downloading - Downloading input data......
2020-04-15 10:11:55 Training - Downloading the training image...
2020-04-15 10:12:15 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2020-04-15 10:12:16,138 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2020-04-15 10:12:16,144 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-04-15 10:12:16,168 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2020-04-15 10:12:19,187 sagemaker_pytorch_container.training INFO     Invoking user training script

[34mEpoch: 50/1500............. Loss: 0.1054[0m
[34mEpoch: 100/1500............. Loss: 0.0399[0m
[34mEpoch: 150/1500............. Loss: 0.0245[0m
[34mEpoch: 200/1500............. Loss: 0.0061[0m
[34mEpoch: 250/1500............. Loss: 0.0081[0m
[34mEpoch: 300/1500............. Loss: 0.0071[0m
[34mEpoch: 350/1500............. Loss: 0.0086[0m
[34mEpoch: 400/1500............. Loss: 0.0052[0m
[34mEpoch: 450/1500............. Loss: 0.0043[0m
[34mEpoch: 500/1500............. Loss: 0.0045[0m
[34mEpoch: 550/1500............. Loss: 0.0046[0m
[34mEpoch: 600/1500............. Loss: 0.0078[0m
[34mEpoch: 650/1500............. Loss: 0.0046[0m
[34mEpoch: 700/1500............. Loss: 0.0061[0m
[34mEpoch: 750/1500............. Loss: 0.0071[0m
[34mEpoch: 800/1500............. Loss: 0.0100[0m
[34mEpoch: 850/1500............. Loss: 0.0075[0m
[34mEpoch: 900/1500............. Loss: 0.0093[0m
[34mEpoch: 950/1500............. Loss: 0.0056[0m
[34mEpoch: 1000/1500...........

In [18]:
%%time

# deploy your model to create a predictor
# lstm_playlist_predictor = lstm_estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

-------------!CPU times: user 356 ms, sys: 20.3 ms, total: 377 ms
Wall time: 6min 32s


In [14]:
rnn_estimator = PyTorch(entry_point="RnnTrain.py",
                    source_dir="model",
                    role=role,
                    framework_version='1.4.0',
                    train_instance_count=1,
                    output_path = output_path,
                    train_instance_type='ml.m4.xlarge',
                    hyperparameters={
                        'input_features': 9,
                        'hidden_dim': 30,
                        'output_dim': 9,
                        'epochs': 1500
                    })

In [None]:
# Fit estimator
rnn_estimator.fit({'train': input_data})

2020-04-15 10:37:39 Starting - Starting the training job...
2020-04-15 10:37:41 Starting - Launching requested ML instances...
2020-04-15 10:38:39 Starting - Preparing the instances for training......
2020-04-15 10:39:18 Downloading - Downloading input data......
2020-04-15 10:40:32 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2020-04-15 10:40:33,190 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2020-04-15 10:40:33,194 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-04-15 10:40:33,207 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2020-04-15 10:40:34,622 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2020-04-15 10:40:34,920 sagemaker-containers INFO    

[34mEpoch: 200/1500............. Loss: 0.1769[0m
[34mEpoch: 250/1500............. Loss: 0.1763[0m
[34mEpoch: 300/1500............. Loss: 0.1776[0m
[34mEpoch: 350/1500............. Loss: 0.1739[0m
[34mEpoch: 400/1500............. Loss: 0.1720[0m
[34mEpoch: 450/1500............. Loss: 0.1741[0m
[34mEpoch: 500/1500............. Loss: 0.1729[0m
[34mEpoch: 550/1500............. Loss: 0.1751[0m
[34mEpoch: 600/1500............. Loss: 0.1751[0m
[34mEpoch: 650/1500............. Loss: 0.1752[0m
[34mEpoch: 700/1500............. Loss: 0.1750[0m
[34mEpoch: 750/1500............. Loss: 0.1738[0m
[34mEpoch: 800/1500............. Loss: 0.1750[0m
[34mEpoch: 850/1500............. Loss: 0.1756[0m
[34mEpoch: 900/1500............. Loss: 0.1736[0m
[34mEpoch: 950/1500............. Loss: 0.1748[0m
[34mEpoch: 1000/1500............. Loss: 0.1758[0m
[34mEpoch: 1050/1500............. Loss: 0.1731[0m
[34mEpoch: 1100/1500............. Loss: 0.1737[0m
[34mEpoch: 1150/1500.......

In [None]:
%%time

# deploy your model to create a predictor
# rnn_playlist_predictor = rnn_estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')