In [1]:
import sagemaker
from sagemaker import get_execution_role
import json
import boto3

sess = sagemaker.Session()
role = get_execution_role()

bucket = "mastering-ml-aws"

prefix = "chapter2/blazingtext"


In [27]:
SRC_PATH = '/home/ec2-user/SageMaker/mastering-ml-on-aws/chapter2/'

with open(SRC_PATH + 'dem.txt', 'r') as file:
    dem_text = ["__label__0 " + line.strip('\n') for line in file]

with open(SRC_PATH + 'gop.txt', 'r') as file:
    gop_text = ["__label__1 " + line.strip('\n') for line in file]
    
corpus = dem_text + gop_text
    
from sklearn.model_selection import train_test_split
corpus_train, corpus_test = train_test_split(corpus, test_size=0.25, random_state=42)    

In [28]:
corpus_train_txt = "\n".join(corpus_train)
corpus_test_txt = "\n".join(corpus_test)
with open('tweets.train', 'w') as file:
    file.write(corpus_train_txt)    
with open('tweets.test', 'w') as file:
    file.write(corpus_test_txt)    


In [29]:
corpus_train_txt

'__label__1 “We are forever grateful for your service.” -@FLOTUS https://t.co/22vFTZguAQ\n__label__0 RT @CecileRichards: When your strategy relies on fewer people being able to vote, you’re on the wrong side of history. https://t.co/ncthe2W…\n__label__0 RT @AFLCIO: Scott Walker. Forever a national disgrace. #1u https://t.co/Hii42QMq3w\n__label__0 Democrats will hold this administration accountable for its attacks on human rights. #HumanRightsDay https://t.co/IGzOyzxzEi\n__label__0 RT @TomPerez: Congratulations to @CheriBustos on becoming the new chair of the @dccc. The Democratic Party is back, &amp; we\'re ready to build…\n__label__0 Donald Trump\'s temper tantrum could hurt the economy and force hundreds of thousands of federal employees to work without pay or be furloughed right before the holidays. The president will own any potential government shutdown. https://t.co/dYrbFLA9j9\n__label__0 RT @HealthCareGov: The recent federal court decision is still moving through the courts, and

In [30]:

train_path = prefix + '/train'
validation_path = prefix + '/validation'

sess.upload_data(path='tweets.train', bucket=bucket, key_prefix=train_path)
sess.upload_data(path='tweets.test', bucket=bucket, key_prefix=validation_path)

s3_train_data = 's3://{}/{}'.format(bucket, train_path)
s3_validation_data = 's3://{}/{}'.format(bucket, validation_path)

In [60]:
container = sagemaker.amazon.amazon_estimator.get_image_uri('us-east-1', "blazingtext", "latest")

s3_output_location = 's3://{}/{}/output'.format(bucket, prefix)


In [61]:
bt_model = sagemaker.estimator.Estimator(container,
                                         role, 
                                         train_instance_count=1, 
                                         train_instance_type='ml.c4.4xlarge',
                                         train_volume_size = 30,
                                         train_max_run = 360000,
                                         input_mode= 'File',
                                         output_path=s3_output_location,
                                         sagemaker_session=sess)

In [62]:
bt_model.set_hyperparameters(mode="supervised",
                            epochs=10,
                            min_count=3,
                            learning_rate=0.05,
                            vector_dim=10,
                            early_stopping=False,
                            patience=5,
                            min_epochs=5,
                            word_ngrams=2)

train_data = sagemaker.session.s3_input(s3_train_data, distribution='FullyReplicated', 
                        content_type='text/plain', s3_data_type='S3Prefix')
validation_data = sagemaker.session.s3_input(s3_validation_data, distribution='FullyReplicated', 
                             content_type='text/plain', s3_data_type='S3Prefix')
data_channels = {'train': train_data, 'validation': validation_data}

In [63]:
bt_model.fit(inputs=data_channels, logs=True)


INFO:sagemaker:Creating training-job with name: blazingtext-2018-12-22-15-40-41-862


2018-12-22 15:40:42 Starting - Starting the training job...
2018-12-22 15:40:43 Starting - Launching requested ML instances......
2018-12-22 15:41:46 Starting - Preparing the instances for training...
2018-12-22 15:42:42 Downloading - Downloading input data..
[31mArguments: train[0m
[31m[12/22/2018 15:42:49 INFO 139992524269376] nvidia-smi took: 0.0251660346985 secs to identify 0 gpus[0m
[31m[12/22/2018 15:42:49 INFO 139992524269376] Running single machine CPU BlazingText training using supervised mode.[0m
[31m[12/22/2018 15:42:49 INFO 139992524269376] Processing /opt/ml/input/data/train/tweets.train . File size: 0 MB[0m
[31m[12/22/2018 15:42:49 INFO 139992524269376] Processing /opt/ml/input/data/validation/tweets.test . File size: 0 MB[0m
[31mRead 0M words[0m
[31mNumber of words:  407[0m
[31m##### Alpha: -0.0003  Progress: 100.54%  Million Words/sec: 0.72 #####[0m
[31m##### Alpha: 0.0000  Progress: 100.00%  Million Words/sec: 0.72 #####
[0m
[31mTraining finished.[0

In [20]:
bt_model

<sagemaker.estimator.Estimator at 0x7f8e8c04e1d0>

'model'

In [44]:
transformer = bt_model.transformer(instance_count=1, instance_type='ml.m4.xlarge')

INFO:sagemaker:Creating model with name: blazingtext-2018-12-22-14-51-28-246


In [45]:
transformer.transform('s3://mastering-ml-aws/chapter2/blazingtext/validation/')

INFO:sagemaker:Creating transform job with name: blazingtext-2018-12-22-15-08-35-039


In [51]:
transformer.wait()

<sagemaker.estimator.Estimator at 0x7f8e877c91d0>

In [64]:
predictor = bt_model.deploy(initial_instance_count = 1,instance_type = 'ml.m4.xlarge')


INFO:sagemaker:Creating model with name: blazingtext-2018-12-22-15-44-01-592
INFO:sagemaker:Creating endpoint with name blazingtext-2018-12-22-15-40-41-862


---------------------------------------------------------------------------!

In [None]:
!aws s3 ls --recursive s3://mastering-ml-aws/chapter2/blazingtext
    
    


In [65]:
corpus_test_no_labels = [x[11:] for x in corpus_test]

payload = {"instances" : corpus_test_no_labels}

response = predictor.predict(json.dumps(payload))

predictions = json.loads(response)
print(json.dumps(predictions, indent=2))

[
  {
    "prob": [
      0.5003929734230042
    ],
    "label": [
      "__label__0"
    ]
  },
  {
    "prob": [
      0.500934362411499
    ],
    "label": [
      "__label__1"
    ]
  },
  {
    "prob": [
      0.5003452301025391
    ],
    "label": [
      "__label__0"
    ]
  },
  {
    "prob": [
      0.5022704601287842
    ],
    "label": [
      "__label__0"
    ]
  },
  {
    "prob": [
      0.5004196763038635
    ],
    "label": [
      "__label__0"
    ]
  },
  {
    "prob": [
      0.5000465512275696
    ],
    "label": [
      "__label__0"
    ]
  },
  {
    "prob": [
      0.5001659989356995
    ],
    "label": [
      "__label__1"
    ]
  },
  {
    "prob": [
      0.5006721019744873
    ],
    "label": [
      "__label__0"
    ]
  },
  {
    "prob": [
      0.5006396770477295
    ],
    "label": [
      "__label__0"
    ]
  },
  {
    "prob": [
      0.5001750588417053
    ],
    "label": [
      "__label__1"
    ]
  },
  {
    "prob": [
      0.5005156397819519
    ],

In [74]:
predicted_labels = [prediction['label'][0] for prediction in predictions]


In [71]:
predicted_labels[:4]

['__label__0', '__label__1', '__label__0', '__label__0']

In [84]:
actual_labels = [x[:10] for x in corpus_test]
actual_labels[:4]

['__label__1', '__label__1', '__label__0', '__label__1']

In [86]:
matches = [(actual_label == predicted_label) for (actual_label, predicted_label) in zip(actual_labels, predicted_labels)]
matches[:4]

[False, True, True, False]

In [91]:
matches.count(True) / len(matches)

0.61