# Setup

In [7]:
import sagemaker
from sagemaker import get_execution_role
import json
import boto3
import pandas as pd


sess = sagemaker.Session()

role = get_execution_role()
print(role) # This is the role that SageMaker would use to leverage AWS resources (S3, CloudWatch) on your behalf

bucket = sess.default_bucket() # Replace with your own bucket name if needed
print(bucket)
prefix = 'bravesouls/supervised/title'

INFO:sagemaker:Created S3 bucket: sagemaker-us-east-1-023375022819


arn:aws:iam::023375022819:role/service-role/AmazonSageMaker-ExecutionRole-20181029T121824
sagemaker-us-east-1-023375022819


In [8]:
def prep_data():
    !mkdir Data
    !aws s3 cp s3://aws-ml-chicago-team-bravesouls/amazon_review_polarity_csv.tgz Data
    !tar -xvzf Data/amazon_review_polarity_csv.tgz
#prep_data()

In [8]:
#!head -100000 amazon_review_polarity_csv/train.csv > amazon_review_polarity_csv/train_100k.csv

# Data Processing

In [9]:
from random import shuffle
import multiprocessing
from multiprocessing import Pool
import csv
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
def transform_instance(row):
    cur_row = []
    label = "__label__" + row[0]  #Prefix the index-ed label with __label__
    cur_row.append(label)
    cur_row.extend(nltk.word_tokenize(row[1].lower()))
    return cur_row

In [11]:
def preprocess(input_file, output_file, keep=1):
    all_rows = []
    with open(input_file, 'r') as csvinfile:
        csv_reader = csv.reader(csvinfile, delimiter=',')
        for row in csv_reader:
            all_rows.append(row)
    shuffle(all_rows)
    all_rows = all_rows[:int(keep*len(all_rows))]
    pool = Pool(processes=multiprocessing.cpu_count())
    transformed_rows = pool.map(transform_instance, all_rows)
    pool.close() 
    pool.join()
    
    with open(output_file, 'w') as csvoutfile:
        csv_writer = csv.writer(csvoutfile, delimiter=' ', lineterminator='\n')
        csv_writer.writerows(transformed_rows)

In [13]:
%%time
# Preparing the training dataset

# Since preprocessing the whole dataset might take a couple of mintutes,
# we keep 20% of the training dataset for this demo.
# Set keep to 1 if you want to use the complete dataset
preprocess('/home/ec2-user/SageMaker/blazingtext_text_classification_dbpedia_2019-01-08/amazon_review_polarity_csv/train.csv', 'amazon_review_polarity_title.train', keep=.2)
        
# Preparing the validation dataset        
preprocess('/home/ec2-user/SageMaker/blazingtext_text_classification_dbpedia_2019-01-08/amazon_review_polarity_csv/test.csv', 'amazon_review_polarity_title.validation', keep=.2)

CPU times: user 27.3 s, sys: 3.39 s, total: 30.7 s
Wall time: 3min 20s


In [14]:
%%time

train_channel = prefix + '/train'
validation_channel = prefix + '/validation'

sess.upload_data(path='amazon_review_polarity_title.train', bucket=bucket, key_prefix=train_channel)
sess.upload_data(path='amazon_review_polarity_title.validation', bucket=bucket, key_prefix=validation_channel)

s3_train_data = 's3://{}/{}'.format(bucket, train_channel)
s3_validation_data = 's3://{}/{}'.format(bucket, validation_channel)

CPU times: user 142 ms, sys: 58.7 ms, total: 201 ms
Wall time: 9.93 s


In [15]:
s3_output_location = 's3://{}/{}/output'.format(bucket, prefix)

In [16]:
region_name = boto3.Session().region_name

In [17]:
container = sagemaker.amazon.amazon_estimator.get_image_uri(region_name, "blazingtext", "latest")
print('Using SageMaker BlazingText container: {} ({})'.format(container, region_name))

Using SageMaker BlazingText container: 811284229777.dkr.ecr.us-east-1.amazonaws.com/blazingtext:latest (us-east-1)


In [18]:
bt_model = sagemaker.estimator.Estimator(container,
                                         role, 
                                         base_job_name= "SeanW-BraveSouls-Title",
                                         train_instance_count=1, 
                                         train_instance_type='ml.c4.4xlarge',
                                         train_volume_size = 30,
                                         train_max_run = 360000,
                                         input_mode= 'File',
                                         output_path=s3_output_location,
                                         sagemaker_session=sess)

In [19]:
bt_model.set_hyperparameters(mode="supervised",
                            epochs=10,
                            min_count=2,
                            learning_rate=0.05,
                            vector_dim=10,
                            early_stopping=True,
                            patience=4,
                            min_epochs=5,
                            word_ngrams=2)

In [20]:
train_data = sagemaker.session.s3_input(s3_train_data, distribution='FullyReplicated', 
                        content_type='text/plain', s3_data_type='S3Prefix')
validation_data = sagemaker.session.s3_input(s3_validation_data, distribution='FullyReplicated', 
                             content_type='text/plain', s3_data_type='S3Prefix')
data_channels = {'train': train_data, 'validation': validation_data}

In [21]:
bt_model.fit(inputs=data_channels, logs=True)

INFO:sagemaker:Creating training-job with name: SeanW-BraveSouls-Title-2019-01-09-16-26-39-941


2019-01-09 16:26:40 Starting - Starting the training job...
2019-01-09 16:26:43 Starting - Launching requested ML instances......
2019-01-09 16:27:46 Starting - Preparing the instances for training......
2019-01-09 16:29:08 Downloading - Downloading input data
2019-01-09 16:29:08 Training - Downloading the training image.
[31mArguments: train[0m
[31m[01/09/2019 16:29:13 INFO 139682759448384] nvidia-smi took: 0.025171995163 secs to identify 0 gpus[0m
[31m[01/09/2019 16:29:13 INFO 139682759448384] Running single machine CPU BlazingText training using supervised mode.[0m
[31m[01/09/2019 16:29:13 INFO 139682759448384] Processing /opt/ml/input/data/train/amazon_review_polarity_title.train . File size: 25 MB[0m
[31m[01/09/2019 16:29:13 INFO 139682759448384] Processing /opt/ml/input/data/validation/amazon_review_polarity_title.validation . File size: 2 MB[0m
[31mRead 5M words[0m
[31mNumber of words:  42807[0m
[31mLoading validation data from /opt/ml/input/data/validation/amazon

In [23]:
text_classifier = bt_model.deploy(initial_instance_count = 1,instance_type = 'ml.m4.xlarge')

INFO:sagemaker:Creating model with name: blazingtext-2019-01-09-16-30-40-448
INFO:sagemaker:Creating endpoint with name SeanW-BraveSouls-Title-2019-01-09-16-26-39-941


---------------------------------------------------------------------------!

In [24]:
sentences = ["This product sucks",
            "A very good buy. I really recommend",
            "Bad design. Very hard to use",
            "I'm very satisfied. The best buy ever!!"
    ]


# using the same nltk tokenizer that we used during data preparation for training
tokenized_sentences = [' '.join(nltk.word_tokenize(sent)) for sent in sentences]

payload = {"instances" : tokenized_sentences}

response = text_classifier.predict(json.dumps(payload))

predictions = json.loads(response)
print(json.dumps(predictions, indent=2))

[
  {
    "prob": [
      0.999594509601593
    ],
    "label": [
      "__label__1"
    ]
  },
  {
    "prob": [
      0.9968206286430359
    ],
    "label": [
      "__label__2"
    ]
  },
  {
    "prob": [
      0.9603398442268372
    ],
    "label": [
      "__label__1"
    ]
  },
  {
    "prob": [
      0.9994950294494629
    ],
    "label": [
      "__label__2"
    ]
  }
]
