# Setup

In [1]:
import sagemaker
from sagemaker import get_execution_role
import json
import boto3
import pandas as pd

sess = sagemaker.Session()

role = get_execution_role()
print(role) # This is the role that SageMaker would use to leverage AWS resources (S3, CloudWatch) on your behalf

bucket = sess.default_bucket() # Replace with your own bucket name if needed
print(bucket)
prefix = 'bravesouls/supervised' #Replace with the prefix under which you want to store the data if needed

arn:aws:iam::023375022819:role/service-role/AmazonSageMaker-ExecutionRole-20181029T121824


INFO:sagemaker:Created S3 bucket: sagemaker-us-east-1-023375022819


sagemaker-us-east-1-023375022819


### Add Data to Repo

In [None]:
!mkdir Data
!aws s3 cp s3://aws-ml-chicago-team-bravesouls/amazon_review_polarity_csv.tgz Data
!tar -xvzf Data/amazon_review_polarity_csv.tgz

In [3]:
df = pd.read_csv("amazon_review_polarity_csv/train.csv", names=['Label', 'Title', 'Review'])

In [6]:
df['Combo'] = df['Title'] + ' ' + df['Review']

# Data Preprocessing

In [38]:
from random import shuffle
import multiprocessing
from multiprocessing import Pool
import csv
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Create dictionary for mapping of labels

In [39]:
index_to_label = {}
index_to_label['1'] = 'negative'
index_to_label['2'] = 'positive'

In [40]:
def transform_instance(row):
    cur_row = []
    label = "__label__" + index_to_label[row[0]]  #Prefix the index-ed label with __label__
    cur_row.append(label)
    cur_row.extend(nltk.word_tokenize(row[1].lower() + ' ' + row[2].lower()))
    return cur_row

The `transform_instance` will be applied to each data instance in parallel using python's multiprocessing module

In [41]:
def preprocess(input_file, output_file, keep=1):
    all_rows = []
    with open(input_file, 'r') as csvinfile:
        csv_reader = csv.reader(csvinfile, delimiter=',')
        for row in csv_reader:
            all_rows.append(row)
    shuffle(all_rows)
    all_rows = all_rows[:int(keep*len(all_rows))]
    pool = Pool(processes=multiprocessing.cpu_count())
    transformed_rows = pool.map(transform_instance, all_rows)
    pool.close() 
    pool.join()
    
    with open(output_file, 'w') as csvoutfile:
        csv_writer = csv.writer(csvoutfile, delimiter=' ', lineterminator='\n')
        csv_writer.writerows(transformed_rows)

The `preprocess` function will give us the test and validation sets of data.

In [42]:
%%time
# Preparing the training dataset

# Since preprocessing the whole dataset might take a couple of mintutes,
# we keep 20% of the training dataset for this demo.
# Set keep to 1 if you want to use the complete dataset
preprocess('amazon_review_polarity_csv/train.csv', 'amazon_review_polarity_combo3.train', keep=.5)
        
# Preparing the validation dataset        
preprocess('amazon_review_polarity_csv/test.csv', 'amazon_review_polarity_combo3.validation')

CPU times: user 1min 30s, sys: 13.6 s, total: 1min 43s
Wall time: 6min 3s


In [9]:
%%time

train_channel = prefix + '/train'
validation_channel = prefix + '/validation'

sess.upload_data(path='amazon_review_polarity_combo3.train', bucket=bucket, key_prefix=train_channel)
sess.upload_data(path='amazon_review_polarity_combo3.validation', bucket=bucket, key_prefix=validation_channel)

s3_train_data = 's3://{}/{}'.format(bucket, train_channel)
s3_validation_data = 's3://{}/{}'.format(bucket, validation_channel)

CPU times: user 5.12 s, sys: 3.71 s, total: 8.82 s
Wall time: 8.59 s


Next we need to setup an output location at S3, where the model artifact will be dumped. These artifacts are also the output of the algorithm's traning job.

In [2]:
s3_output_location = 's3://{}/{}/output'.format(bucket, prefix)

# Training

In [3]:
region_name = boto3.Session().region_name

In [4]:
container = sagemaker.amazon.amazon_estimator.get_image_uri(region_name, "blazingtext", "latest")
print('Using SageMaker BlazingText container: {} ({})'.format(container, region_name))

Using SageMaker BlazingText container: 811284229777.dkr.ecr.us-east-1.amazonaws.com/blazingtext:latest (us-east-1)


In [13]:
bt_model = sagemaker.estimator.Estimator(container,
                                         role, 
                                         base_job_name= "TM-Bravesouls-half",
                                         train_instance_count=1, 
                                         train_instance_type='ml.c4.4xlarge',
                                         train_volume_size = 30,
                                         train_max_run = 360000,
                                         input_mode= 'File',
                                         output_path=s3_output_location,
                                         sagemaker_session=sess)

In [15]:
bt_model.set_hyperparameters(mode="supervised",
                            epochs=10,
                            min_count=2,
                            learning_rate=0.05,
                            vector_dim=10,
                            early_stopping=True,
                            patience=4,
                            min_epochs=5,
                            word_ngrams=2)

In [10]:
train_data = sagemaker.session.s3_input(s3_train_data, distribution='FullyReplicated', 
                        content_type='text/plain', s3_data_type='S3Prefix')

validation_data = sagemaker.session.s3_input(s3_validation_data, distribution='FullyReplicated', 
                              content_type='text/plain', s3_data_type='S3Prefix')

data_channels = {'train': train_data, 'validation': validation_data}

In [16]:
bt_model.fit(inputs=data_channels, logs=True)

INFO:sagemaker:Creating training-job with name: TM-Bravesouls-half-2019-01-09-22-54-10-108


2019-01-09 22:54:10 Starting - Starting the training job...
2019-01-09 22:54:22 Starting - Launching requested ML instances......
2019-01-09 22:55:27 Starting - Preparing the instances for training...
2019-01-09 22:56:06 Downloading - Downloading input data.....
[31mArguments: train[0m
[31m[01/09/2019 22:56:45 INFO 140572560639808] nvidia-smi took: 0.0251631736755 secs to identify 0 gpus[0m
[31m[01/09/2019 22:56:45 INFO 140572560639808] Running single machine CPU BlazingText training using supervised mode.[0m
[31m[01/09/2019 22:56:45 INFO 140572560639808] 5 files found in train channel. Using /opt/ml/input/data/train/amazon_review_polarity_combo3.train for training...[0m
[31m[01/09/2019 22:56:45 INFO 140572560639808] Processing /opt/ml/input/data/train/amazon_review_polarity_combo3.train . File size: 793 MB[0m
[31m[01/09/2019 22:56:45 INFO 140572560639808] 5 files found in validation channel. Using /opt/ml/input/data/validation/amazon_review_polarity_combo1.validation for tr

In [35]:
text_classifier = bt_model.deploy(initial_instance_count = 1,instance_type = 'ml.m4.xlarge')

INFO:sagemaker:Creating model with name: blazingtext-2019-01-09-17-29-00-848
INFO:sagemaker:Creating endpoint with name TM-Bravesouls-2019-01-09-17-24-09-597


--------------------------------------------------------------!

In [47]:
sentences = [
            "TCL Roku TVs are great! But... "
    ]


# using the same nltk tokenizer that we used during data preparation for training
tokenized_sentences = [' '.join(nltk.word_tokenize(sent)) for sent in sentences]

payload = {"instances" : tokenized_sentences}

response = text_classifier.predict(json.dumps(payload))

predictions = json.loads(response)
print(json.dumps(predictions, indent=2))

[
  {
    "prob": [
      1.0000097751617432
    ],
    "label": [
      "__label__positive"
    ]
  }
]
