# Intializing Notebook

In [2]:
import boto3
import re
from sagemaker import get_execution_role
from time import gmtime, strftime
import time
import numpy as np
import os
import json

role = get_execution_role()
bucket = 'slytherins-test'
prefix = 'seq2seq-E2G'
region_name = boto3.Session().region_name

# Downloading Dataset

In [4]:
! wget http://data.statmt.org/wmt17/translation-task/preprocessed/de-en/corpus.tc.de.gz

! wget http://data.statmt.org/wmt17/translation-task/preprocessed/de-en/corpus.tc.en.gz

! gunzip corpus.tc.de.gz

! gunzip corpus.tc.en.gz

! mkdir validation

! curl http://data.statmt.org/wmt17/translation-task/preprocessed/de-en/dev.tgz | tar xvzf - -C validation

newstest2014-deen-ref.de.sgm
newstest2014-deen-ref.en.sgm
newstest2014-deen-src.de.sgm
newstest2014-deen-src.en.sgm
newstest2014.tc.de
newstest2014.tc.en
newstest2015-deen-ref.en.sgm
newstest2015-deen-src.de.sgm
newstest2015-ende-ref.de.sgm
newstest2015-ende-src.en.sgm
newstest2015.tc.de
newstest2015.tc.en
newstest2016-deen-ref.en.sgm
newstest2016-deen-src.de.sgm
newstest2016-ende-ref.de.sgm
newstest2016-ende-src.en.sgm
newstest2016.tc.de
newstest2016.tc.en


--2020-05-24 06:55:50--  http://data.statmt.org/wmt17/translation-task/preprocessed/de-en/corpus.tc.de.gz
--2020-05-24 06:55:50--  http://data.statmt.org/wmt17/translation-task/preprocessed/de-en/corpus.tc.en.gz
Resolving data.statmt.org (data.statmt.org)... Resolving data.statmt.org (data.statmt.org)... 129.215.197.184129.215.197.184
Connecting to data.statmt.org (data.statmt.org)|129.215.197.184|:80... 
Connecting to data.statmt.org (data.statmt.org)|129.215.197.184|:80... connected.
HTTP request sent, awaiting response... connected.
HTTP request sent, awaiting response... 200 OK
Length: 263407937 (251M) [application/x-gzip]
Saving to: ‘corpus.tc.en.gz’

     0K .......... ...200 OK
Length: 299680879 (286M) [application/x-gzip]
Saving to: ‘corpus.tc.de.gz’

     0K .......... .......... .......... .......... ......... .......... .......... ..........  0%  233K 18m24s
    50K .......... .......... .......... .......... ................  0%  234K 20m51s
    50K .......... .......... ..

# Creating a small Dataset

In [5]:
! head -n 10000 corpus.tc.en > corpus.tc.en.small
! head -n 10000 corpus.tc.de > corpus.tc.de.small

# Performing Basic NLP Tasks to make Vocabulary

In [6]:
%%bash
python3 create_vocab_proto.py \
        --train-source corpus.tc.en.small \
        --train-target corpus.tc.de.small \
        --val-source validation/newstest2014.tc.en \
        --val-target validation/newstest2014.tc.de

INFO:__main__:Building vocabulary from dataset: corpus.tc.en.small and corpus.tc.de.small
INFO:__main__:Final vocabulary: 10661 types (min frequency 1, top 50000 types)
INFO:__main__:Final vocabulary: 15980 types (min frequency 1, top 50000 types)
INFO:__main__:Source vocabulary size: 10661 
INFO:__main__:Vocabulary saved to "vocab.src.json"
INFO:__main__:Target vocabulary size: 15980 
INFO:__main__:Vocabulary saved to "vocab.trg.json"
INFO:__main__:Spawning 1 encoding worker(s) for encoding train datasets!
INFO:__main__:Processed 10000 lines for encoding to protobuf. 0 lines were ignored as they didn't have
                any content in either the source or the target file!
INFO:__main__:Completed writing the encoding queue!
INFO:__main__:Encoding finished! Writing records to "train.rec"
INFO:__main__:Processed input and saved to "train.rec"
INFO:__main__:Spawning 1 encoding worker(s) for encoding validation datasets!
INFO:__main__:Processed 3003 lines for encoding to protobuf. 0 lin

CPU times: user 7.29 ms, sys: 0 ns, total: 7.29 ms
Wall time: 2.75 s


# Uploading Files to S3

In [7]:
def upload_to_s3(bucket, prefix, channel, file):
    s3 = boto3.resource('s3')
    data = open(file, "rb")
    key = prefix + "/" + channel + '/' + file
    s3.Bucket(bucket).put_object(Key=key, Body=data)

upload_to_s3(bucket, prefix, 'train', 'train.rec')
upload_to_s3(bucket, prefix, 'validation', 'val.rec')
upload_to_s3(bucket, prefix, 'vocab', 'vocab.src.json')
upload_to_s3(bucket, prefix, 'vocab', 'vocab.trg.json')

# Initializing Container

In [9]:
from sagemaker.amazon.amazon_estimator import get_image_uri
container = get_image_uri(region_name, 'seq2seq')

Using SageMaker Seq2Seq container: 825641698319.dkr.ecr.us-east-2.amazonaws.com/seq2seq:1 (us-east-2)


# Training the Model

In [10]:
job_name = 'seq2seq-E2G'
print("Training job", job_name)

create_training_params = \
{
    "AlgorithmSpecification": {
        "TrainingImage": container,
        "TrainingInputMode": "File"
    },
    "RoleArn": role,
    "OutputDataConfig": {
        "S3OutputPath": "s3://{}/{}/".format(bucket, prefix)
    },
    "ResourceConfig": {
        # Seq2Seq does not support multiple machines. Currently, it only supports single machine, multiple GPUs
        "InstanceCount": 1,
        "InstanceType": "ml.m4.xlarge", # We suggest one of ["ml.p2.16xlarge", "ml.p2.8xlarge", "ml.p2.xlarge"]
        "VolumeSizeInGB": 5
    },
    "TrainingJobName": job_name,
    "HyperParameters": {
        # Please refer to the documentation for complete list of parameters
        "max_seq_len_source": "60",
        "max_seq_len_target": "60",
        "optimized_metric": "bleu",
        "batch_size": "64", # Please use a larger batch size (256 or 512) if using ml.p2.8xlarge or ml.p2.16xlarge
        "checkpoint_frequency_num_batches": "1000",
        "rnn_num_hidden": "512",
        "num_layers_encoder": "1",
        "num_layers_decoder": "1",
        "num_embed_source": "512",
        "num_embed_target": "512"
    },
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 48 * 3600
    },
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/train/".format(bucket, prefix),
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
        },
        {
            "ChannelName": "vocab",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/vocab/".format(bucket, prefix),
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/validation/".format(bucket, prefix),
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
        }
    ]
}

sagemaker_client = boto3.Session().client(service_name='sagemaker')
sagemaker_client.create_training_job(**create_training_params)

status = sagemaker_client.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']
print(status)

Training job seq2seq-E2G


ClientError: An error occurred (ValidationException) when calling the CreateTrainingJob operation: Instance type ml.m4.xlarge is not supported by algorithm seq2seq; only GPU instances are supported.

# Getting Model Status

In [None]:
status = sagemaker_client.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']
print(status)
# if the job failed, determine why
if status == 'Failed':
    message = sagemaker_client.describe_training_job(TrainingJobName=job_name)['FailureReason']
    print('Training failed with the following error: {}'.format(message))
    raise Exception('Training job failed')

# Download Pretrained Model

In [11]:
use_pretrained_model = True
model_name = "DEMO-pretrained-en-de-model"
!curl https://s3-us-west-2.amazonaws.com/seq2seq-data/model.tar.gz > model.tar.gz
!curl https://s3-us-west-2.amazonaws.com/seq2seq-data/vocab.src.json > vocab.src.json
!curl https://s3-us-west-2.amazonaws.com/seq2seq-data/vocab.trg.json > vocab.trg.json
upload_to_s3(bucket, prefix, 'pretrained_model', 'model.tar.gz')
model_data = "s3://{}/{}/pretrained_model/model.tar.gz".format(bucket, prefix)

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  336M  100  336M    0     0  13.0M      0  0:00:25  0:00:25 --:--:-- 12.1M 9 31.5M    0     0  10.5M      0  0:00:31  0:00:02  0:00:29 10.5M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   243    0   243    0     0    716      0 --:--:-- --:--:-- --:--:--   714
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   243    0   243    0     0    776      0 --:--:-- --:--:-- --:--:--   776


# Create Model

In [12]:
%%time

sage = boto3.client('sagemaker')

if not use_pretrained_model:
    info = sage.describe_training_job(TrainingJobName=job_name)
    model_name=job_name
    model_data = info['ModelArtifacts']['S3ModelArtifacts']

print(model_name)
print(model_data)

primary_container = {
    'Image': container,
    'ModelDataUrl': model_data
}

create_model_response = sage.create_model(
    ModelName = model_name,
    ExecutionRoleArn = role,
    PrimaryContainer = primary_container)

print(create_model_response['ModelArn'])

DEMO-pretrained-en-de-model
s3://slytherins-test/seq2seq-E2G/pretrained_model/model.tar.gz
arn:aws:sagemaker:us-east-2:809912564797:model/demo-pretrained-en-de-model
CPU times: user 30.9 ms, sys: 0 ns, total: 30.9 ms
Wall time: 440 ms


# Create Endpoint Configuration

In [13]:
from time import gmtime, strftime

endpoint_config_name = 'DEMO-Seq2SeqEndpointConfig-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print(endpoint_config_name)
create_endpoint_config_response = sage.create_endpoint_config(
    EndpointConfigName = endpoint_config_name,
    ProductionVariants=[{
        'InstanceType':'ml.m4.xlarge',
        'InitialInstanceCount':1,
        'ModelName':model_name,
        'VariantName':'AllTraffic'}])

print("Endpoint Config Arn: " + create_endpoint_config_response['EndpointConfigArn'])

DEMO-Seq2SeqEndpointConfig-2020-05-24-07-00-22
Endpoint Config Arn: arn:aws:sagemaker:us-east-2:809912564797:endpoint-config/demo-seq2seqendpointconfig-2020-05-24-07-00-22


# Create Endpoint

In [14]:
%%time
import time

endpoint_name = 'DEMO-Seq2SeqEndpoint-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print(endpoint_name)
create_endpoint_response = sage.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name)
print(create_endpoint_response['EndpointArn'])

resp = sage.describe_endpoint(EndpointName=endpoint_name)
status = resp['EndpointStatus']
print("Status: " + status)

# wait until the status has changed
sage.get_waiter('endpoint_in_service').wait(EndpointName=endpoint_name)

# print the status of the endpoint
endpoint_response = sage.describe_endpoint(EndpointName=endpoint_name)
status = endpoint_response['EndpointStatus']
print('Endpoint creation ended with EndpointStatus = {}'.format(status))

if status != 'InService':
    raise Exception('Endpoint creation failed.')

DEMO-Seq2SeqEndpoint-2020-05-24-07-00-28
arn:aws:sagemaker:us-east-2:809912564797:endpoint/demo-seq2seqendpoint-2020-05-24-07-00-28
Status: Creating
Endpoint creation ended with EndpointStatus = InService
CPU times: user 215 ms, sys: 0 ns, total: 215 ms
Wall time: 7min 31s


# Lets Perform Inference

In [15]:
runtime = boto3.client(service_name='runtime.sagemaker') 

In [16]:
sentences = ["you are so good !",
             "can you drive a car ?",
             "i want to watch a movie ."
            ]

payload = {"instances" : []}
for sent in sentences:
    payload["instances"].append({"data" : sent})

response = runtime.invoke_endpoint(EndpointName=endpoint_name, 
                                   ContentType='application/json', 
                                   Body=json.dumps(payload))

response = response["Body"].read().decode("utf-8")
response = json.loads(response)
print(response)

{'predictions': [{'target': 'Sie sind so gut !'}, {'target': 'Können Sie ein Auto fahren ?'}, {'target': 'i want to watch a movie .'}]}


# Delete the Endpoint

In [18]:
sage.delete_endpoint(EndpointName=endpoint_name)

{'ResponseMetadata': {'RequestId': 'c0bff543-ac01-4471-bc04-405871d83e0e',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'c0bff543-ac01-4471-bc04-405871d83e0e',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Sun, 24 May 2020 07:09:13 GMT'},
  'RetryAttempts': 0}}