# Word-level language modeling RNN

In [1]:
import os
import boto3
import sagemaker
from sagemaker.pytorch import PyTorch
from sagemaker import get_execution_role

# SageMaker session used by the upload cell below.
sagemaker_session = sagemaker.Session()

# IAM role the training job will run under. Do not commit an account-specific
# ARN here: take it from the SAGEMAKER_ROLE_ARN environment variable when set
# (useful when running outside a SageMaker notebook instance), otherwise fall
# back to the execution role of the current notebook.
role = os.environ.get('SAGEMAKER_ROLE_ARN') or get_execution_role()

# Download training and test data
We use raw data from the wikitext-2 dataset:
https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/


In [2]:
# Locate the local copy of the wikitext-2 dataset (this cell does not download anything).
import os

# Only compute workbookDir when it is not already defined in the notebook's
# globals, so a value set by an earlier cell or run is kept.
if 'workbookDir' not in globals():
    workbookDir = os.getcwd()
print('workbookDir: ' + workbookDir)

# The dataset is expected under <workbookDir>/data/wikitext-2.
data_dir = os.path.join(workbookDir, 'data', 'wikitext-2')
print('data_dir: ' + data_dir)


workbookDir: /workplace/nadzeya/sagemaker-pytorch-containers/notebooks/rnn
data_dir: /workplace/nadzeya/sagemaker-pytorch-containers/notebooks/rnn/data/wikitext-2


# Uploading the data
We use the `sagemaker.Session.upload_data` function to upload our datasets to an S3 location. The return value, `inputs`, identifies that S3 location -- we will use it later when we start the training job.



In [3]:
# Upload the local dataset directory to S3 and keep the returned S3 location
# so it can be passed to the training job later.
inputs = sagemaker_session.upload_data(
    path=data_dir,
    key_prefix='data/DEMO-pytorch-rnn',
)
print('input spec (in this case, just an S3 path): {}'.format(inputs))

input spec (in this case, just an S3 path): s3://sagemaker-us-west-2-142577830533/data/DEMO-pytorch-rnn


In [4]:
# Display the training script that the estimator below uses as its entry point.
!cat 'source/rnn.py'

# Based on github.com/pytorch/examples/blob/master/word_language_model
import time
import logging
import math
import os
import torch
import torch.nn as nn

import data

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
print("user script!!!")
print("user script!!!")
print("user script!!!")
print("user script!!!")
print("user script!!!")
print("user script!!!")


class RNNModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        if rnn_type in ['LSTM', 'GRU']:
            self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
        else:
            try:
                nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
            exce

# Run the training script on SageMaker
The PyTorch class allows us to run our training function as a distributed training job on SageMaker infrastructure. We need to configure it with our training script, an IAM role, the number of training instances, and the training instance type. In this case we will run our training job on a single ml.p2.8xlarge instance.

In [5]:
# Configure the SageMaker PyTorch estimator for the RNN training job:
# the rnn.py script from the local 'source' directory, run on one
# ml.p2.8xlarge instance with the hyperparameters given below.
estimator = PyTorch(
    entry_point="rnn.py",
    source_dir='source',
    role=role,
    framework_version='0.4.0',
    train_instance_count=1,
    train_instance_type='ml.p2.8xlarge',
    hyperparameters={'batch_size': 30, 'epochs': 50},
)

After we've constructed our PyTorch object, we can fit it using the data we uploaded to S3. SageMaker makes sure our data is available in the local filesystem, so our training script can simply read the data from disk.

In [7]:
# Start the training job; the uploaded S3 data is passed as the 'wikitext-2' channel.
estimator.fit(inputs={'wikitext-2': inputs})

INFO:sagemaker:Creating training-job with name: sagemaker-pytorch-2018-04-30-20-15-06-933


...............................................................
[31m2018-04-30 20:20:10,345 INFO - root - running container entrypoint[0m
[31m2018-04-30 20:20:10,345 INFO - root - starting train task[0m
[31m2018-04-30 20:20:10,408 INFO - container_support.app - started training: {'train_fn': <function train at 0x7fea4538b488>}[0m
[31mDownloading s3://sagemaker-us-west-2-142577830533/sagemaker-pytorch-2018-04-30-20-15-06-933/source/sourcedir.tar.gz to /tmp/script.tar.gz[0m
[31m2018-04-30 20:20:10,555 INFO - botocore.vendored.requests.packages.urllib3.connectionpool - Starting new HTTP connection (1): 169.254.170.2[0m
[31m2018-04-30 20:20:10,658 INFO - botocore.vendored.requests.packages.urllib3.connectionpool - Starting new HTTPS connection (1): sagemaker-us-west-2-142577830533.s3.amazonaws.com[0m
[31m2018-04-30 20:20:10,711 INFO - botocore.vendored.requests.packages.urllib3.connectionpool - Starting new HTTPS connection (2): sagemaker-us-west-2-142577830533.s3.amazonaws.co

[31m-----------------------------------------------------------------------------------------[0m
[31m| end of epoch  14 | time: 12.74s | valid loss  5.71 | valid ppl   303.21[0m
[31m-----------------------------------------------------------------------------------------[0m
[31m| epoch  15 |   200/  233 batches | lr 0.31 | ms/batch 39.00 | loss  4.34 | ppl    76.47[0m
[31m-----------------------------------------------------------------------------------------[0m
[31m| end of epoch  15 | time: 12.72s | valid loss  5.71 | valid ppl   303.35[0m
[31m-----------------------------------------------------------------------------------------[0m
[31m| epoch  16 |   200/  233 batches | lr 0.08 | ms/batch 39.00 | loss  4.33 | ppl    76.09[0m
[31m-----------------------------------------------------------------------------------------[0m
[31m| end of epoch  16 | time: 12.75s | valid loss  5.71 | valid ppl   303.02[0m
[31m-------------------------------------------------------

[31m| epoch  36 |   200/  233 batches | lr 0.00 | ms/batch 39.07 | loss  4.32 | ppl    75.53[0m
[31m-----------------------------------------------------------------------------------------[0m
[31m| end of epoch  36 | time: 12.75s | valid loss  5.71 | valid ppl   303.00[0m
[31m-----------------------------------------------------------------------------------------[0m
[31m| epoch  37 |   200/  233 batches | lr 0.00 | ms/batch 39.08 | loss  4.32 | ppl    75.53[0m
[31m-----------------------------------------------------------------------------------------[0m
[31m| end of epoch  37 | time: 12.77s | valid loss  5.71 | valid ppl   303.00[0m
[31m-----------------------------------------------------------------------------------------[0m
[31m| epoch  38 |   200/  233 batches | lr 0.00 | ms/batch 39.13 | loss  4.32 | ppl    75.56[0m
[31m-----------------------------------------------------------------------------------------[0m
[31m| end of epoch  38 | time: 12.79s | vali

# Implement the training function
We need to provide a training script that can run on the SageMaker platform. The training script is essentially the same as one you would write for local training, except that you need to provide a `train` function. When SageMaker calls your function, it will pass in arguments that describe the training environment. Check the script below to see how this works.

In [None]:
from sagemaker.fw_utils import (
    create_image_uri,
    framework_name_from_image,
    framework_version_from_tag,
)

# NOTE(review): the values in this cell look like placeholders/scratch data
# (e.g. region 'mars-south-3', framework 'mlfw') -- confirm whether this cell
# is still needed before publishing the notebook.
DATA_DIR = 'data_dir'
BUCKET_NAME = 'mybucket'
ROLE = 'Sagemaker'
REGION = 'us-west-2'
SCRIPT_PATH = 'script.py'

# Build a container image URI from the given arguments and show it.
image_uri = create_image_uri(
    'mars-south-3',
    'mlfw',
    'ml.c4.large',
    '1.0rc',
    'py2',
)
print(image_uri)