# Word-level language modeling RNN

In [25]:
import os
import boto3
import sagemaker
from sagemaker.pytorch import PyTorch
from sagemaker import get_execution_role

# Session object used below to upload the dataset to S3.
sagemaker_session = sagemaker.Session()

# IAM role that SageMaker uses for the training job.
# Set the SAGEMAKER_ROLE_ARN environment variable to choose a specific role
# (for example when running outside a SageMaker notebook instance); otherwise
# the execution role of the current environment is used. This avoids
# hard-coding an account-specific ARN in the notebook.
role = os.environ.get('SAGEMAKER_ROLE_ARN') or get_execution_role()

# Download training and test data
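We use raw data from the wikitext-2 dataset:
https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/

Note that the cell below does not download anything: it only resolves the local directory (`data/wikitext-2` under the notebook's working directory) that is uploaded to S3 in the next step.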


In [26]:
# Resolve the local dataset directory. Nothing is downloaded here; this cell
# only computes and reports the paths used by the upload step.
import os

# Reuse workbookDir if an earlier run already defined it, so the cell stays
# idempotent even if the working directory changes later in the session.
if 'workbookDir' not in globals():
    workbookDir = os.getcwd()
data_dir = os.path.join(workbookDir, 'data', 'wikitext-2')

# One print call, two lines of output (same text as printing each separately).
print('\n'.join(['workbookDir: ' + workbookDir, 'data_dir: ' + data_dir]))


workbookDir: /workplace/nadzeya/sagemaker-pytorch-containers/notebooks/rnn
data_dir: /workplace/nadzeya/sagemaker-pytorch-containers/notebooks/rnn/data/wikitext-2


# Uploading the data
We use the `sagemaker.Session.upload_data` function to upload our datasets to an S3 location. The return value, `inputs`, identifies that location; we will use it later when we start the training job.



In [27]:
# Upload the local dataset directory to S3 under the given key prefix and keep
# the returned location for the training step.
inputs = sagemaker_session.upload_data(
    path=data_dir,
    key_prefix='data/DEMO-pytorch-rnn',
)
print('input spec (in this case, just an S3 path): {}'.format(inputs))

input spec (in this case, just an S3 path): s3://sagemaker-us-west-2-142577830533/data/DEMO-pytorch-rnn


# Implement the training function
We need to provide a training script that can run on the SageMaker platform. The training script is essentially the same as one you would write for local training, except that you need to provide a `train` function. When SageMaker calls your function, it passes in arguments that describe the training environment. Check the script below to see how this works.

In [28]:
# Print the training script (rnn.py in the current directory) so it can be read inline.
!cat 'rnn.py'

# Based on github.com/pytorch/examples/blob/master/word_language_model
import time
import logging
import math
import os
import torch
import torch.nn as nn

import data

logger = logging.getLogger(__name__)


class RNNModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        if rnn_type in ['LSTM', 'GRU']:
            self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
        else:
            try:
                nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
            except KeyError:
                raise ValueError( """An invalid option for `--model` was supplied,
                                 options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU

# Run the training script on SageMaker
The `PyTorch` class allows us to run our training function as a distributed training job on SageMaker infrastructure. We need to configure it with our training script, an IAM role, the number of training instances, and the training instance type. In this case the instance type is set to `'local'`, so the training container runs on this machine; set it to `ml.p2.xlarge` to run the job on a hosted SageMaker instance instead.

In [31]:
# Configure the training job: rnn.py is the training script, run on a single
# instance. The instance type 'local' is used here; "ml.p2.xlarge" is the
# hosted alternative mentioned in the text above.
estimator = PyTorch(
    "rnn.py",
    role=role,
    train_instance_count=1,
    train_instance_type='local',
    hyperparameters={'batch_size': 30, 'epochs': 50},
)

After we've constructed our PyTorch object, we can fit it using the data we uploaded to S3. SageMaker makes sure our data is available in the local filesystem, so our training script can simply read the data from disk.

In [32]:
# Start training, passing the uploaded S3 location under the 'training' key.
estimator.fit(inputs={'training': inputs})

INFO:sagemaker:Creating training-job with name: sagemaker-pytorch-2018-04-26-23-46-02-655


0.3-cpu-py3: Pulling from sagemaker-pytorch
Digest: sha256:d56e728d4820056a473aed0391e096a6dff874b889147d94674435676059ee14
Status: Downloaded newer image for 520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-pytorch:0.3-cpu-py3
Attaching to tmpdgqcgf_algo-1-LI79P_1
[36malgo-1-LI79P_1  |[0m 2018-04-26 23:46:20,048 INFO - root - running container entrypoint
[36malgo-1-LI79P_1  |[0m 2018-04-26 23:46:20,048 INFO - root - starting train task
[36malgo-1-LI79P_1  |[0m 2018-04-26 23:46:20,065 INFO - container_support.app - started training: {'train_fn': <function train at 0x7f0d4ca67b70>}
[36malgo-1-LI79P_1  |[0m Downloading s3://sagemaker-us-west-2-142577830533/sagemaker-pytorch-2018-04-26-23-46-02-655/source/sourcedir.tar.gz to /tmp/script.tar.gz
[36malgo-1-LI79P_1  |[0m 2018-04-26 23:46:20,112 INFO - botocore.credentials - Found credentials in environment variables.
[36malgo-1-LI79P_1  |[0m 2018-04-26 23:46:20,177 INFO - botocore.vendored.requests.packages.urllib3.connecti