### Load Packages

In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.optim as optim
import torch.utils.data
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve

from train.model import LSTMClassifier

In [2]:
import boto3
import sagemaker
from sagemaker.pytorch import PyTorch
from sagemaker.pytorch import PyTorchModel
from sagemaker.predictor import RealTimePredictor
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner

In [3]:
from enhanced.utils import download_data
from enhanced.preparation import read_imdb_data, prepare_imdb_data, split_train_validation_data
from enhanced.processing import review_to_words, preprocess_data, build_dict
from enhanced.processing import update_save_dict, convert_and_pad_data
from enhanced.training import train
from enhanced.prediction import predict, test_reviews

### Configurations

In [4]:
# Raw data parameters
data_dir = "data"        # local root folder for the downloaded data
raw_folder = "aclImdb"   # folder name handed to read_imdb_data together with data_dir
data_url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
filename = "aclImdb_v1.tar.gz"

# Data processing parameters
vocab_size = 5000        # passed to build_dict
sample_idx = 42          # index of the review printed in the "check" cells
cache_dir = os.path.join("cache", "sentiment_analysis")
# Derived from data_dir (instead of a hand-written 'data/pytorch') so that
# changing data_dir moves the exported CSV files along with it.
pytorch_dir = os.path.join(data_dir, "pytorch")

# SageMaker parameters
region = "us-east-1"
bucket_prefix = 'sagemaker/sentiment_rnn'
# The execution role is specific to one AWS account. Allow it to be supplied
# through the SAGEMAKER_ROLE_ARN environment variable so the notebook can run
# in another account without editing this cell; the original value remains
# the default when the variable is not set.
account_role = os.environ.get(
    "SAGEMAKER_ROLE_ARN",
    "arn:aws:iam::977053370764:role/service-role/AmazonSageMaker-ExecutionRole-20201202T141643",
)

# NOTE(review): size_pred is not referenced by any cell visible in this notebook.
size_pred = 512

### Step 1: Downloading the data

In [5]:
# Fetch the archive `filename` from `data_url` into `data_dir`.
# NOTE(review): the recorded output ("data already downloaded") suggests the
# helper skips the download when the data is present — confirm in enhanced.utils.
download_data(data_dir, data_url, filename)

data already downloaded


### Step 2: Processing raw data

In [6]:
# Load the raw reviews from data_dir/raw_folder. The next cell indexes `data`
# by split ('train' / 'test') and then by sentiment ('pos' / 'neg').
data, labels = read_imdb_data(data_dir, raw_folder)

In [7]:
# Count the reviews per split and sentiment, in the order the message expects:
# train/pos, train/neg, test/pos, test/neg.
split_sizes = [
    len(data[split][sentiment])
    for split in ('train', 'test')
    for sentiment in ('pos', 'neg')
]
print("IMDB reviews: train = {} pos / {} neg, test = {} pos / {} neg".format(*split_sizes))

IMDB reviews: train = 12500 pos / 12500 neg, test = 12500 pos / 12500 neg


### Step 3: Split train-validation-test data

In [8]:
# Turn the per-split / per-sentiment structure into flat review and label
# collections: one pool for training + validation and one for testing.
train_valid_X, test_X, train_valid_y, test_y = prepare_imdb_data(data, labels)

In [9]:
# Hold out part of the training pool for validation (valid_size=0.05; the
# recorded output below shows 1250 of the 25000 reviews ending up in validation).
train_X, valid_X, train_y, valid_y = split_train_validation_data(train_valid_X, train_valid_y, valid_size=0.05)

In [10]:
# Report how many reviews landed in each of the three sets.
print("IMDb reviews (combined): train = {}, valid = {}, test = {}".format(
    *map(len, (train_X, valid_X, test_X))))

IMDb reviews (combined): train = 23750, valid = 1250, test = 25000


In [11]:
# Summing the labels counts the positives (the message reports them as "pos");
# the negatives are whatever remains of each set.
n_train_pos = sum(train_y)
n_valid_pos = sum(valid_y)
print("IMDB reviews: train = {} pos / {} neg, validation = {} pos / {} neg".format(
    n_train_pos, len(train_y) - n_train_pos,
    n_valid_pos, len(valid_y) - n_valid_pos))

IMDB reviews: train = 11869 pos / 11881 neg, validation = 631 pos / 619 neg


#### Check reviews

In [12]:
# Show the label and raw text of the sample training review.
print(
    f"sentiment: {train_y[sample_idx]}",
    f"review: {train_X[sample_idx]}",
    sep="\n",
)

sentiment: 1
review: Absolutely wonderful drama and Ros is top notch...I highly recommend this movie. Her performance, in my opinion, was Academy Award material! The only real sad fact here is that Universal hasn't seen to it that this movie was ever available on any video format, whether it be tape or DVD. They are ignoring a VERY good movie. But Universal has little regard for its library on DVD, which is sad. If you get the chance to see this somewhere (not sure why it is rarely even run on cable), see it! I won't go into the story because I think most people would rather have an opinion on the film, and too many "reviewers" spend hours writing about the story, which is available anywhere.<br /><br />a 10!


In [13]:
# Show the label and raw text of the sample validation review.
print(
    f"sentiment: {valid_y[sample_idx]}",
    f"review: {valid_X[sample_idx]}",
    sep="\n",
)

sentiment: 0
review: This is your typical Priyadarshan movie--a bunch of loony characters out on some silly mission. His signature climax has the entire cast of the film coming together and fighting each other in some crazy moshpit over hidden money. Whether it is a winning lottery ticket in Malamaal Weekly, black money in Hera Pheri, "kodokoo" in Phir Hera Pheri, etc., etc., the director is becoming ridiculously predictable. Don't get me wrong; as clichéd and preposterous his movies may be, I usually end up enjoying the comedy. However, in most his previous movies there has actually been some good humor, (Hungama and Hera Pheri being noteworthy ones). Now, the hilarity of his films is fading as he is using the same formula over and over again.<br /><br />Songs are good. Tanushree Datta looks awesome. Rajpal Yadav is irritating, and Tusshar is not a whole lot better. Kunal Khemu is OK, and Sharman Joshi is the best.


In [14]:
# Show the label and raw text of the sample test review.
print(
    f"sentiment: {test_y[sample_idx]}",
    f"review: {test_X[sample_idx]}",
    sep="\n",
)

sentiment: 1
review: Final Solution is a powerful christian film that shows the hate between the black and whites that was present in the days of apartheid. It shows how this hate was contrived and was groomed from generation to generation. Jan Ellis was taught that a black man was a plague. He was raised to be that way. <br /><br />Then he meets a man who is on the opposite side of his beliefs, Pastor Lekota. will he change his ways?. The film is a powerful movie that shows the perceptions the different races had for one another, it shows these perceptions with quite a lot of accuracy. The movie shows the world of how apartheid affected the psyche of blacks and whites.<br /><br />This is a great film that everyone should watch.


### Step 4: Processing data

In [15]:
# Preprocess all three sets in one call. Both the arguments and the results
# are ordered train, test, valid (not train, valid, test). cache_dir is handed
# to the helper; the recorded output shows it reading a cached pickle.
(train_pp_X, test_pp_X, valid_pp_X,
 train_pp_y, test_pp_y, valid_pp_y) = preprocess_data(
    train_X, test_X, valid_X,
    train_y, test_y, valid_y,
    cache_dir,
)

Read preprocessed data from cache file: preprocessed_data.pkl


#### Check tokenization

In [None]:
# Show what review_to_words produces for the sample training review printed earlier.
print(review_to_words(train_X[sample_idx]))

In [None]:
# Show what review_to_words produces for the sample validation review printed earlier.
print(review_to_words(valid_X[sample_idx]))

In [None]:
# Show what review_to_words produces for the sample test review printed earlier.
print(review_to_words(test_X[sample_idx]))

### Step 5: Build dictionary

In [None]:
# Build the word dictionary from the preprocessed *training* reviews only,
# passing the configured vocab_size through to the helper.
# NOTE(review): the exact meaning of the three return values is defined in
# enhanced.processing.build_dict; word_count is used below as a word -> count mapping.
word_dict, sorted_words, word_count = build_dict(train_pp_X, vocab_size=vocab_size)

#### Check word dictionary

In [None]:
# Put the (word, count) pairs into a table and display the most frequent ones.
word_freq = pd.DataFrame(word_count.items(), columns=["word", "count"])
word_freq.sort_values(by="count", ascending=False).head()

#### Update word dictionary

In [None]:
# Words handed to update_save_dict in the next cell; an empty list is passed by default.
# Example of a non-empty setting: ["movi", "film"]
words_to_remove = []

In [None]:
# Produces the dictionary used for the conversions below and the vocabulary
# size later given to the model and to the SageMaker hyperparameters.
# NOTE(review): presumably also persists the dictionary under pytorch_dir —
# confirm in enhanced.processing.update_save_dict.
updated_dict, new_vocab_size = update_save_dict(word_dict, pytorch_dir, words_to_remove)

#### Convert and pad data

In [None]:
# Convert the preprocessed training reviews with updated_dict. Both results
# are written to train.csv in Step 6 (second value as the second column).
train_tf_X, train_X_tf_len = convert_and_pad_data(updated_dict, train_pp_X)

In [None]:
# Same conversion for the preprocessed test reviews.
test_tf_X, test_X_tf_len = convert_and_pad_data(updated_dict, test_pp_X)

In [None]:
# Same conversion for the preprocessed validation reviews; both results are
# written to valid.csv in Step 6.
valid_tf_X, valid_X_tf_len = convert_and_pad_data(updated_dict, valid_pp_X)

### Step 6: Local Training

In [None]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing. This keeps the cell runnable on a
# fresh checkout regardless of what earlier helpers created.
os.makedirs(pytorch_dir, exist_ok=True)

# Column layout of both files: label first, then the second return value of
# convert_and_pad_data, then the converted review itself. Files are written
# without header or index, so columns are addressed by position when read back.
pd.concat([pd.DataFrame(train_y), pd.DataFrame(train_X_tf_len), pd.DataFrame(train_tf_X)], axis=1) \
        .to_csv(os.path.join(pytorch_dir, 'train.csv'), header=False, index=False)

pd.concat([pd.DataFrame(valid_y), pd.DataFrame(valid_X_tf_len), pd.DataFrame(valid_tf_X)], axis=1) \
        .to_csv(os.path.join(pytorch_dir, 'valid.csv'), header=False, index=False)

In [None]:
# Read in only the first 250 rows
train_sample = pd.read_csv(os.path.join(pytorch_dir, 'train.csv'), header=None, names=None, nrows=250)
valid_sample = pd.read_csv(os.path.join(pytorch_dir, 'valid.csv'), header=None, names=None, nrows=250)

# Turn the input pandas dataframe into tensors
train_sample_y = torch.from_numpy(train_sample[[0]].values).float().squeeze()
train_sample_X = torch.from_numpy(train_sample.drop([0], axis=1).values).long()

valid_sample_y = torch.from_numpy(valid_sample[[0]].values).float().squeeze()
valid_sample_X = torch.from_numpy(valid_sample.drop([0], axis=1).values).long()

# Build the dataset
train_sample_ds = torch.utils.data.TensorDataset(train_sample_X, train_sample_y)
valid_sample_ds = torch.utils.data.TensorDataset(valid_sample_X, valid_sample_y)

# Build the dataloader
train_sample_dl = torch.utils.data.DataLoader(train_sample_ds, batch_size=50)
valid_sample_dl = torch.utils.data.DataLoader(valid_sample_ds, batch_size=50)

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Binary cross-entropy loss and an Adam optimizer with default settings.
# NOTE(review): the meaning of the positional arguments 32 and 100 is defined
# by train.model.LSTMClassifier and is not visible from this notebook.
loss_fn = torch.nn.BCELoss()
model = LSTMClassifier(32, 100, new_vocab_size)
model = model.to(device)
optimizer = optim.Adam(model.parameters())

# NOTE(review): 15 is presumably the number of epochs — confirm in enhanced.training.train.
report_train, trained_model = train(
    model, train_sample_dl, valid_sample_dl, 15, optimizer, loss_fn, device
)

### Step 7: Sagemaker Training

#### Initial setup

In [None]:
# Pin the underlying boto3 session to the configured region, then wrap it in
# a SageMaker session.
boto_session = boto3.session.Session(region_name=region)
sagemaker_session = sagemaker.Session(boto_session=boto_session)

# Bucket and role used by the upload, training and deployment cells below
bucket = sagemaker_session.default_bucket()
role = account_role

#### Upload data

In [None]:
# Upload the contents of pytorch_dir to the bucket under bucket_prefix; the
# returned location is what the tuning job receives as its training input.
input_data = sagemaker_session.upload_data(
    path=pytorch_dir,
    key_prefix=bucket_prefix,
    bucket=bucket,
)

#### Create estimator

In [None]:
# Training job definition: runs enhanced/train.py on a single ml.p2.xlarge
# instance with the fixed hyperparameters below (the tuner adds 'epochs').
estimator = PyTorch(entry_point="train.py",
                    source_dir="enhanced",
                    py_version="py3",
                    role=role,
                    framework_version='0.4.0',
                    train_instance_count=1,
                    train_instance_type='ml.p2.xlarge',
                    # Reuse the region-pinned session created in the setup cell.
                    # Without it the estimator builds a default session, which
                    # may point at a different region than the uploaded data.
                    sagemaker_session=sagemaker_session,
                    hyperparameters={"vocab_size": new_vocab_size,
                                     "hidden_dim": 200})

In [None]:
# Tuning configuration. The objective is the validation loss, which is
# extracted from the training log by the "BCELoss: (.*?);" regex and minimised.
estimator_tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name='validation:loss',
    objective_type='Minimize',
    metric_definitions=[{"Name": "validation:loss", "Regex": "BCELoss: (.*?);"}],
    hyperparameter_ranges={'epochs': IntegerParameter(10, 11)},
    max_jobs=1,           # The total number of models to train
    max_parallel_jobs=1,  # The number of models to train in parallel
)

#### Tuning job

In [None]:
# Start the tuning job, exposing the uploaded data as the 'training' channel.
estimator_tuner.fit({'training': input_data})

In [None]:
# Block until the tuning job has finished, so the best training job can be
# looked up in the next cell.
estimator_tuner.wait()

#### Retrieve the best model

In [None]:
# Despite the variable name, this value is used as the training job
# identifier passed to Estimator.attach in the next cell.
best_model = estimator_tuner.best_training_job()

In [None]:
# Re-create an estimator object for the best training job so its model
# artifacts (model_data) can be used for deployment. Pass the region-pinned
# session explicitly; otherwise attach() falls back to a default session,
# which may look for the job in a different region.
best_estimator = sagemaker.estimator.Estimator.attach(best_model,
                                                      sagemaker_session=sagemaker_session)

### Step 8: Sagemaker Deploy

In [None]:
class StringPredictor(RealTimePredictor):
    """Predictor that sends its requests to the endpoint as ``text/plain``."""

    def __init__(self, endpoint_name, sagemaker_session):
        # Fix the content type here so callers only supply the endpoint and session.
        super().__init__(endpoint_name, sagemaker_session, content_type='text/plain')

# Wrap the artifacts of the best training job in a deployable model that is
# served by serve/predict.py and queried through StringPredictor.
# NOTE(review): this rebinds `model`, which earlier held the local LSTMClassifier.
model = PyTorchModel(model_data=best_estimator.model_data,
                     role = role,
                     framework_version='0.4.0',
                     entry_point='predict.py',
                     py_version="py3",
                     source_dir='serve',
                     predictor_cls=StringPredictor,
                     # Same region-pinned session as upload and training, so
                     # the endpoint is not created through a default session
                     # that may target another region.
                     sagemaker_session=sagemaker_session)

In [None]:
# Deploy the model behind a real-time endpoint on one ml.m4.xlarge instance.
endpoint = model.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)

In [None]:
# Display the predictor's `endpoint` attribute (the same value is passed to
# test_reviews in the next section).
endpoint.endpoint

#### Predict data

In [None]:
# Query the deployed endpoint through the test_reviews helper. `ground` and
# `results` are compared with ROC metrics in the evaluation cells.
# NOTE(review): stop=250 presumably caps the number of reviews sent — confirm
# in enhanced.prediction.test_reviews.
ground, results = test_reviews(endpoint.endpoint, stop=250)

#### Evaluate results

In [None]:
# roc_auc_score / roc_curve are imported in the notebook's top import cell so
# that all dependencies are declared in one place.
# Area under the ROC curve, plus the curve points used by the plot below
# (the thresholds returned by roc_curve are not needed).
roc_auc = roc_auc_score(ground, results)
fpr, tpr, _ = roc_curve(ground, results)

In [None]:
# Draw the ROC curve on an explicit figure/axes pair.
fig, ax = plt.subplots()
lw = 2

# Model curve, with the AUC shown in the legend
ax.plot(fpr, tpr, color='darkorange',
        lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic example')
ax.legend(loc="lower right")
plt.show()

### Finish

In [None]:
# Delete the deployed endpoint now that the evaluation is finished.
endpoint.delete_endpoint()