In [7]:
import os

import sagemaker
from sagemaker.pytorch import PyTorch

import numpy as np

## Upload Data

In [8]:
session = sagemaker.Session()

bucket = session.default_bucket()
s3_prefix = 'capstone/v1'

role = sagemaker.get_execution_role()

In [3]:
data_dir = 'processed'
input_data = session.upload_data(path=data_dir, bucket=bucket, key_prefix=s3_prefix)

## Train the Model

In [14]:
output_path = f's3://{bucket}/{s3_prefix}/output'

estimator = PyTorch(
    source_dir='src',
    entry_point='train.py',
    role=role,
    framework_version='1.1.0',
    py_version='py3',
    train_instance_count=1,
    train_instance_type='ml.p2.xlarge',
    output_path=output_path,
    hyperparameters={
        'seed': 1,
        'batch-size': 512,
        'epochs': 10,
        'embedding-dim': 32,
        'num-lstm-layers': 1,
        'hidden-dims': 100,
        # vocab size from previous step + 2 for
        # out of vocab and empty
        'vocab-size': 10002
    }
)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [None]:
# skipped in final run in favor of the hyperparameter tuning below
estimator.fit({
    'training': os.path.join(input_data, 'train'),
    'eval': os.path.join(input_data, 'val')
})

In [None]:
from sagemaker.tuner import HyperparameterTuner, CategoricalParameter, IntegerParameter

tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name='ROC_AUC',
    objective_type='Maximize',
    max_jobs=20,
    max_parallel_jobs=2,
    metric_definitions=[{
        'Name': 'ROC_AUC',
        'Regex': 'roc_auc: (.*?);'
    }],
    early_stopping_type='Auto',
    hyperparameter_ranges={
        'embedding-dim': CategoricalParameter([32, 64]),
        'num-lstm-layers': IntegerParameter(1, 4),
        'hidden-dims': CategoricalParameter(['100', '100 64', '100 64 32', '100 64 32 16'])
    }
)

tuner.fit({
    'training': os.path.join(input_data, 'train'),
    'eval': os.path.join(input_data, 'val')
})
tuner.wait()

........................................

In [None]:
tuner.best_training_job()

## Evaluate

In [None]:
predictor = estimator.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1
)

In [10]:
# I had to pull the best model from the console because the notebook crashed 
# so I skipped the prior 2 cells
from sagemaker.pytorch.model import PyTorchModel

model_artifact_location = 's3://sagemaker-us-east-1-281832773096/capstone/v1/output/sagemaker-pytorch-210327-1846-005-c22ae97e/output/model.tar.gz'

model = PyTorchModel(
    model_data=model_artifact_location,
    role=role,
    source_dir='src',
    entry_point='train.py',
    framework_version='1.1.0',
    py_version='py3',
)

predictor = model.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1
)

---------------!

In [9]:
! pip install symspellpy



In [9]:
from importlib import reload  
import src.utils
reload(src.utils)
import joblib
with open('processed/vocab.joblib', 'rb') as f:
    vocab = joblib.load(f)

sentence = 'You bad, you are the worst person alive!!!'
input_vec = src.utils.encode_single_input(sentence, vocab)
response = predictor.predict(input_vec)

In [11]:
response

array([[0.88455814, 0.00102099, 0.01581585, 0.01653206, 0.08217296,
        0.01339703]], dtype=float32)

## Evaluate with Final Test Set

This section was only used on the final run.

In [11]:
import joblib
with open('processed/test/data.joblib', 'rb') as f:
    test_data = joblib.load(f)

y_true = test_data[:, :6]
X = test_data[:, 6:]

In [None]:
y_pred = np.array([predictor.predict(chunk) for chunk in np.split(X, range(1000, X.shape[0], 1000))])
y_pred = np.concatenate(y_pred, axis=0)
y_pred_class = y_pred.round()

In [8]:
y_pred.shape, y_true.shape

((40239, 6), (22355, 6))

In [7]:
with open('processed/y_pred.joblib', 'wb') as f:
    joblib.dump(y_pred, f)

  if __name__ == '__main__':


In [54]:
predictor.delete_endpoint()

## Evaluate Examples

In [19]:
import pandas as pd
import src.utils

full_df = pd.read_csv('data/raw_combined.csv')
vocab, encoded_text = src.utils.encode_text(
    full_df, 
    use_cache=True,
    max_length=500,
    vocab_length=10000
)

In [20]:
# why is class 6, identity hate, so poorly performing?
# pick a few false positives and false negatives

from importlib import reload  
import src.utils
reload(src.utils)

mask = (y_pred[:, 5] < 0.5) & (y_true[:, 5] == 1)
false_neg = X[mask]
false_neg_pred = y_pred[mask]
false_neg_true = y_true[mask]

idx = 10
for i, row in enumerate(encoded_text):
    if np.equal(row, false_neg[idx]).all():
        print(full_df.iloc[i]['comment_text'])
        break
print()
print(' '.join(src.utils.decode_text(false_neg[idx], vocab)))
print(false_neg_pred[idx])
print(false_neg_true[idx])

FUCK YOU Ckatz you are german Cock sucker and FUCKER MOTHER FUCKER

fuck <mvt> german cock sucker fucker mother fucker
[0.997106   0.5659401  0.9982394  0.01774021 0.9745939  0.15446553]
[1 0 1 0 1 1]


Wow, these comments are awful. But importantly there are lots of misspelled words here. Using a spelling correction tool in the preprocessing would likely help a lot.