# Setup

In [1]:
import sagemaker
from sagemaker import get_execution_role
import json
import boto3
import pandas as pd


sess = sagemaker.Session()

role = get_execution_role()
print(role) # This is the role that SageMaker would use to leverage AWS resources (S3, CloudWatch) on your behalf

bucket = sess.default_bucket() # Replace with your own bucket name if needed
print(bucket)
prefix = 'bravesouls/supervised'

arn:aws:iam::023375022819:role/service-role/AmazonSageMaker-ExecutionRole-20181029T121824


INFO:sagemaker:Created S3 bucket: sagemaker-us-east-1-023375022819


sagemaker-us-east-1-023375022819


In [7]:
def prep_data():
    !mkdir Data
    !aws s3 cp s3://aws-ml-chicago-team-bravesouls/amazon_review_polarity_csv.tgz Data
    !tar -xvzf Data/amazon_review_polarity_csv.tgz
#prep_data()

In [8]:
#!head -100000 amazon_review_polarity_csv/train.csv > amazon_review_polarity_csv/train_100k.csv

# Data Processing

In [3]:
from random import shuffle
import multiprocessing
from multiprocessing import Pool
import csv
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
def transform_instance(row):
    cur_row = []
    label = "__label__" + row[0]  #Prefix the index-ed label with __label__
    cur_row.append(label)
    cur_row.extend(nltk.word_tokenize(row[2].lower()))
    return cur_row

In [5]:
def preprocess(input_file, output_file, keep=1):
    all_rows = []
    with open(input_file, 'r') as csvinfile:
        csv_reader = csv.reader(csvinfile, delimiter=',')
        for row in csv_reader:
            all_rows.append(row)
    shuffle(all_rows)
    all_rows = all_rows[:int(keep*len(all_rows))]
    pool = Pool(processes=multiprocessing.cpu_count())
    transformed_rows = pool.map(transform_instance, all_rows)
    pool.close() 
    pool.join()
    
    with open(output_file, 'w') as csvoutfile:
        csv_writer = csv.writer(csvoutfile, delimiter=' ', lineterminator='\n')
        csv_writer.writerows(transformed_rows)

In [6]:
%%time
# Preparing the training dataset

# Since preprocessing the whole dataset might take a couple of mintutes,
# we keep 20% of the training dataset for this demo.
# Set keep to 1 if you want to use the complete dataset
preprocess('amazon_review_polarity_csv/train.csv', 'amazon_review_polarity.train', keep=.2)
        
# Preparing the validation dataset        
preprocess('amazon_review_polarity_csv/test.csv', 'amazon_review_polarity.validation', keep=.2)

CPU times: user 46.3 s, sys: 5.83 s, total: 52.1 s
Wall time: 4min 35s


In [9]:
%%time

train_channel = prefix + '/train'
validation_channel = prefix + '/validation'

sess.upload_data(path='amazon_review_polarity.train', bucket=bucket, key_prefix=train_channel)
sess.upload_data(path='amazon_review_polarity.validation', bucket=bucket, key_prefix=validation_channel)

s3_train_data = 's3://{}/{}'.format(bucket, train_channel)
s3_validation_data = 's3://{}/{}'.format(bucket, validation_channel)

CPU times: user 2.32 s, sys: 1.57 s, total: 3.89 s
Wall time: 3.79 s


In [11]:
s3_output_location = 's3://{}/{}/output'.format(bucket, prefix)

In [12]:
region_name = boto3.Session().region_name

In [13]:
container = sagemaker.amazon.amazon_estimator.get_image_uri(region_name, "blazingtext", "latest")
print('Using SageMaker BlazingText container: {} ({})'.format(container, region_name))

Using SageMaker BlazingText container: 811284229777.dkr.ecr.us-east-1.amazonaws.com/blazingtext:latest (us-east-1)


In [14]:
bt_model = sagemaker.estimator.Estimator(container,
                                         role, 
                                         base_job_name= "Gaj-BraveSouls",
                                         train_instance_count=1, 
                                         train_instance_type='ml.c4.4xlarge',
                                         train_volume_size = 30,
                                         train_max_run = 360000,
                                         input_mode= 'File',
                                         output_path=s3_output_location,
                                         sagemaker_session=sess)

In [15]:
bt_model.set_hyperparameters(mode="supervised",
                            epochs=10,
                            min_count=2,
                            learning_rate=0.05,
                            vector_dim=10,
                            early_stopping=True,
                            patience=4,
                            min_epochs=5,
                            word_ngrams=2)

In [16]:
train_data = sagemaker.session.s3_input(s3_train_data, distribution='FullyReplicated', 
                        content_type='text/plain', s3_data_type='S3Prefix')
validation_data = sagemaker.session.s3_input(s3_validation_data, distribution='FullyReplicated', 
                             content_type='text/plain', s3_data_type='S3Prefix')
data_channels = {'train': train_data, 'validation': validation_data}

In [17]:
bt_model.fit(inputs=data_channels, logs=True)

INFO:sagemaker:Creating training-job with name: Gaj-BraveSouls-2019-01-09-14-46-43-374


2019-01-09 14:46:43 Starting - Starting the training job...
2019-01-09 14:46:46 Starting - Launching requested ML instances......
2019-01-09 14:47:51 Starting - Preparing the instances for training......
2019-01-09 14:49:07 Downloading - Downloading input data
2019-01-09 14:49:07 Training - Training image download completed. Training in progress..
[31mArguments: train[0m
[31m[01/09/2019 14:49:07 INFO 139969153353536] nvidia-smi took: 0.0251729488373 secs to identify 0 gpus[0m
[31m[01/09/2019 14:49:07 INFO 139969153353536] Running single machine CPU BlazingText training using supervised mode.[0m
[31m[01/09/2019 14:49:07 INFO 139969153353536] Processing /opt/ml/input/data/train/amazon_review_polarity.train . File size: 294 MB[0m
[31m[01/09/2019 14:49:07 INFO 139969153353536] Processing /opt/ml/input/data/validation/amazon_review_polarity.validation . File size: 32 MB[0m
[31mRead 10M words[0m
[31mRead 20M words[0m
[31mRead 30M words[0m
[31mRead 40M words[0m
[31mRead 50M

In [18]:
text_classifier = bt_model.deploy(initial_instance_count = 1,instance_type = 'ml.m4.xlarge')

INFO:sagemaker:Creating model with name: blazingtext-2019-01-09-14-50-42-012
INFO:sagemaker:Creating endpoint with name Gaj-BraveSouls-2019-01-09-14-46-43-374


---------------------------------------------------------------------------!

In [27]:
sentences = ["This product sucks",
            "A very good buy. I really recommend",
            "Bad design. Very hard to use",
            "I'm very satisfied. The best buy ever!!"
    ]


# using the same nltk tokenizer that we used during data preparation for training
tokenized_sentences = [' '.join(nltk.word_tokenize(sent)) for sent in sentences]

payload = {"instances" : tokenized_sentences}

response = text_classifier.predict(json.dumps(payload))

predictions = json.loads(response)
print(json.dumps(predictions, indent=2))

[
  {
    "prob": [
      1.0000100135803223
    ],
    "label": [
      "__label__1"
    ]
  },
  {
    "prob": [
      1.0000100135803223
    ],
    "label": [
      "__label__2"
    ]
  },
  {
    "prob": [
      0.9999825954437256
    ],
    "label": [
      "__label__1"
    ]
  },
  {
    "prob": [
      1.0000100135803223
    ],
    "label": [
      "__label__2"
    ]
  }
]
