# Setup

In [2]:
import sagemaker
from sagemaker import get_execution_role
import json
import boto3
from sagemaker.predictor import csv_serializer    # Converts strings for HTTP POST requests on inference

import numpy as np                                # For performing matrix operations and numerical processing
import pandas as pd                               # For manipulating tabular data
from time import gmtime, strftime                 
import os 

# High-level SageMaker session (used below for the default bucket and for uploads).
sess = sagemaker.Session()

# One boto3 session supplies both the region name and the low-level SageMaker client.
boto_session = boto3.Session()
region = boto_session.region_name
smclient = boto_session.client('sagemaker')

# This is the role that SageMaker would use to leverage AWS resources (S3, CloudWatch) on your behalf
role = get_execution_role()
print(role)

# Replace with your own bucket name if needed
bucket = sess.default_bucket()
print(bucket)

# Replace with the prefix under which you want to store the data if needed.
# NOTE(review): the prefix mentions "xgboost" although this notebook trains BlazingText;
# the value is kept unchanged so existing S3 keys stay valid.
prefix = 'bravesouls/automatic-tuning-xgboost'

INFO:sagemaker:Created S3 bucket: sagemaker-us-east-1-023375022819


arn:aws:iam::023375022819:role/service-role/AmazonSageMaker-ExecutionRole-20181029T121824
sagemaker-us-east-1-023375022819


### Add Data to Repo

In [None]:
!mkdir Data
!aws s3 cp s3://aws-ml-chicago-team-bravesouls/amazon_review_polarity_csv.tgz Data
!tar -xvzf Data/amazon_review_polarity_csv.tgz

In [3]:
# The CSV is read with explicit column names supplied through `names`.
review_columns = ['Label', 'Title', 'Review']
df = pd.read_csv("amazon_review_polarity_csv/train.csv", names=review_columns)

In [6]:
# Concatenate title and review into one text column.
# A missing Title or Review is NaN in the frame, and NaN + str yields NaN for the
# whole Combo value, so missing parts are replaced with '' before joining.
df['Combo'] = df['Title'].fillna('') + ' ' + df['Review'].fillna('')

# Data Preprocessing

In [3]:
from random import shuffle
import multiprocessing
from multiprocessing import Pool
import csv
import nltk
# Download the 'punkt' NLTK package (presumably required by nltk.word_tokenize,
# which transform_instance calls — confirm).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Create dictionary for mapping of labels

In [4]:
# Maps the raw class id (looked up from the first field of each CSV row)
# to the human-readable label name.
index_to_label = {
    '1': 'negative',
    '2': 'positive',
}

In [5]:
def transform_instance(row):
    """Convert one CSV row into a labelled list of tokens.

    Args:
        row: Sequence whose first element is a key of ``index_to_label`` and
            whose second and third elements are strings.

    Returns:
        A list whose first item is ``"__label__"`` followed by the mapped
        label name, and whose remaining items are the tokens produced by
        ``nltk.word_tokenize`` on the lower-cased second and third elements
        joined with a single space.
    """
    # Resolve the label first so an unknown class id fails before any tokenizing.
    tagged_label = "__label__" + index_to_label[row[0]]
    tokens = nltk.word_tokenize(row[1].lower() + ' ' + row[2].lower())
    return [tagged_label, *tokens]

The `transform_instance` function will be applied to each data instance in parallel using Python's `multiprocessing` module.

In [6]:
def preprocess(input_file, output_file, keep=1, seed=None):
    """Shuffle, subsample and transform a CSV file into a space-delimited text file.

    All rows of ``input_file`` are read into memory and shuffled, and the
    first ``keep`` fraction of them is retained. Each retained row is passed
    through ``transform_instance`` in a pool with one worker process per CPU.
    The transformed rows are written to ``output_file``, one row per line,
    with fields separated by a single space.

    Args:
        input_file: Path of the comma-delimited CSV file to read.
        output_file: Path of the text file to write.
        keep: Fraction of the shuffled rows to retain; the default of 1 keeps
            every row.
        seed: Optional seed for the shuffle. ``None`` (the default) shuffles
            with the module-level ``shuffle`` exactly as before; any other
            value makes the row order, and therefore the retained subset,
            reproducible.
    """
    # newline='' lets the csv module handle line endings itself, as its
    # documentation requires; the explicit encoding avoids depending on the
    # platform default.
    with open(input_file, 'r', newline='', encoding='utf-8') as csvinfile:
        all_rows = list(csv.reader(csvinfile, delimiter=','))

    if seed is None:
        shuffle(all_rows)
    else:
        import random
        random.Random(seed).shuffle(all_rows)
    all_rows = all_rows[:int(keep * len(all_rows))]

    # The context manager tears the workers down even if a row fails to
    # transform; map() has already collected every result by the time it returns.
    with Pool(processes=multiprocessing.cpu_count()) as pool:
        transformed_rows = pool.map(transform_instance, all_rows)

    with open(output_file, 'w', newline='', encoding='utf-8') as csvoutfile:
        csv_writer = csv.writer(csvoutfile, delimiter=' ', lineterminator='\n')
        csv_writer.writerows(transformed_rows)

The `preprocess` function will give us the training and validation sets of data.

In [42]:
%%time
# Preparing the training dataset

# Since preprocessing the whole dataset might take a couple of minutes,
# we keep 50% of the training dataset for this demo (keep=.5).
# Set keep to 1 if you want to use the complete dataset
preprocess('amazon_review_polarity_csv/train.csv', 'amazon_review_polarity_combo3.train', keep=.5)
        
# Preparing the validation dataset (keep is left at its default of 1, so every row is used)
preprocess('amazon_review_polarity_csv/test.csv', 'amazon_review_polarity_combo3.validation')

CPU times: user 1min 30s, sys: 13.6 s, total: 1min 43s
Wall time: 6min 3s


In [7]:
%%time

train_channel = prefix + '/train'
validation_channel = prefix + '/validation'

sess.upload_data(path='amazon_review_polarity_combo3.train', bucket=bucket, key_prefix=train_channel)
sess.upload_data(path='amazon_review_polarity_combo3.validation', bucket=bucket, key_prefix=validation_channel)

s3_train_data = 's3://{}/{}'.format(bucket, train_channel)
s3_validation_data = 's3://{}/{}'.format(bucket, validation_channel)

CPU times: user 6 s, sys: 5.76 s, total: 11.8 s
Wall time: 5.07 s


Next we need to set up an output location in S3, where the model artifacts will be stored. These artifacts are the output of the algorithm's training job.

In [8]:
# S3 URI under the bucket/prefix where output artifacts are to be written.
s3_output_location = 's3://' + bucket + '/' + prefix + '/output'

# Training

In [11]:
# Look up the "latest" BlazingText image URI for the current region.
container = sagemaker.amazon.amazon_estimator.get_image_uri(region, "blazingtext", "latest")
container_message = 'Using SageMaker BlazingText container: {} ({})'.format(container, region)
print(container_message)

Using SageMaker BlazingText container: 811284229777.dkr.ecr.us-east-1.amazonaws.com/blazingtext:latest (us-east-1)


Hyperparameter tuning configurations:

In [25]:
def _param_range(name, min_value, max_value):
    """Build one tunable-parameter range entry; both bounds are given as strings."""
    return {"MaxValue": max_value, "MinValue": min_value, "Name": name}


# Search space, resource limits, strategy and objective for the tuning job.
tuning_job_config = {
    "ParameterRanges": {
        "CategoricalParameterRanges": [],
        "ContinuousParameterRanges": [
            _param_range("learning_rate", "0.005", "0.1"),
        ],
        "IntegerParameterRanges": [
            _param_range("min_count", "0", "100"),
            _param_range("vector_dim", "32", "300"),
            _param_range("word_ngrams", "1", "3"),
            _param_range("epochs", "5", "15"),
        ],
    },
    # At most 20 training jobs in total, 3 of them running at the same time.
    "ResourceLimits": {
        "MaxNumberOfTrainingJobs": 20,
        "MaxParallelTrainingJobs": 3,
    },
    "Strategy": "Bayesian",
    # Maximize the validation:accuracy metric.
    "HyperParameterTuningJobObjective": {
        "MetricName": "validation:accuracy",
        "Type": "Maximize",
    },
}

Setup for Training Jobs

In [26]:
from sagemaker.amazon.amazon_estimator import get_image_uri

# "latest" BlazingText image for the region of the current boto3 session.
training_image = get_image_uri(boto3.Session().region_name, 'blazingtext', 'latest')

# S3 locations of the two input channels (the validation URI ends with a slash).
s3_input_train = 's3://{}/{}/train'.format(bucket, prefix)
s3_input_validation = 's3://{}/{}/validation/'.format(bucket, prefix)


def _text_channel(channel_name, s3_uri):
    """Describe one uncompressed text/plain input channel read from an S3 prefix."""
    return {
        "ChannelName": channel_name,
        "CompressionType": "None",
        "ContentType": "text/plain",
        "DataSource": {
            "S3DataSource": {
                "S3DataDistributionType": "FullyReplicated",
                "S3DataType": "S3Prefix",
                "S3Uri": s3_uri,
            }
        },
    }


# Definition shared by every training job that the tuning job launches.
training_job_definition = {
    "AlgorithmSpecification": {
        "TrainingImage": training_image,
        "TrainingInputMode": "File",
    },
    "InputDataConfig": [
        _text_channel("train", s3_input_train),
        _text_channel("validation", s3_input_validation),
    ],
    "OutputDataConfig": {
        "S3OutputPath": "s3://{}/{}/output".format(bucket, prefix),
    },
    "ResourceConfig": {
        "InstanceCount": 1,
        "InstanceType": "ml.c4.2xlarge",
        "VolumeSizeInGB": 10,
    },
    "RoleArn": role,
    # Hyperparameters held fixed across all tuning trials.
    "StaticHyperParameters": {
        "mode": "supervised",
        "min_epochs": "5",
        "patience": "4",
        "early_stopping": "True",
    },
    # 43200 seconds = 12 hours.
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 43200,
    },
}

### Running Hyperparameter tuning

In [27]:
# Fixed name under which the tuning job is created.
tuning_job_name = "tm-bravesouls-tuning"

# Submit the tuning job, combining the search configuration with the
# per-training-job definition; the API response is the cell output.
tuning_request = dict(
    HyperParameterTuningJobName=tuning_job_name,
    HyperParameterTuningJobConfig=tuning_job_config,
    TrainingJobDefinition=training_job_definition,
)
smclient.create_hyper_parameter_tuning_job(**tuning_request)

{'HyperParameterTuningJobArn': 'arn:aws:sagemaker:us-east-1:023375022819:hyper-parameter-tuning-job/tm-bravesouls-tuning',
 'ResponseMetadata': {'RequestId': 'f6a1548d-19e0-4ca3-bf1f-0fda3f2daf67',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'f6a1548d-19e0-4ca3-bf1f-0fda3f2daf67',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '121',
   'date': 'Thu, 10 Jan 2019 17:23:00 GMT'},
  'RetryAttempts': 0}}