### Load Packages

In [1]:
import os
import pandas as pd

In [62]:
from enhanced.utils import download_data
from enhanced.preparation import read_imdb_data, prepare_imdb_data, split_train_validation_data
from enhanced.processing import review_to_words, preprocess_data, build_dict, update_dict

ImportError: cannot import name 'update_dict' from 'enhanced.processing' (/home/aadelmo/projetos/udacity/UdacitySentimentAnalysis/enhanced/processing.py)

### Configurations

In [26]:
# Raw IMDb archive: remote location, local file name and the folders used by
# the download / read helpers below.
data_url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
filename = "aclImdb_v1.tar.gz"
data_dir = "data"
raw_folder = "aclImdb"

# Working directories: preprocessing cache and the folder that receives the
# pickled dictionary and the CSV files.
cache_dir = os.path.join("cache", "sentiment_analysis")
pytorch_dir = "data/pytorch"

# Dictionary size passed to build_dict, and the index of the review that the
# "check" cells print.
vocab_size = 5000
sample_idx = 42

### Step 1: Downloading the data

In [4]:
# Fetch the raw archive with the project helper from enhanced.utils.
# The recorded run printed "data already downloaded", so the helper appears
# to skip the transfer when the archive is already present.
download_data(data_dir, data_url, filename)

data already downloaded


### Step 2: Processing raw data

In [5]:
# Load the raw reviews. `data` is indexed as data[split][sentiment] with
# splits 'train'/'test' and sentiments 'pos'/'neg' (see the count cell below).
data, labels = read_imdb_data(data_dir, raw_folder)

In [6]:
# Report how many reviews each split / sentiment pair holds, in the order
# train-pos, train-neg, test-pos, test-neg.
print(
    "IMDB reviews: train = {} pos / {} neg, test = {} pos / {} neg".format(
        *(
            len(data[split][sentiment])
            for split in ("train", "test")
            for sentiment in ("pos", "neg")
        )
    )
)

IMDB reviews: train = 12500 pos / 12500 neg, test = 12500 pos / 12500 neg


### Step 3: Split train-validation-test data

In [7]:
# Merge the per-sentiment reviews into one train(+validation) set and one
# test set; the recorded counts further down show 25000 reviews in each.
train_valid_X, test_X, train_valid_y, test_y = prepare_imdb_data(data, labels)

In [8]:
# Hold out 30% of the training reviews for validation
# (recorded result: 17500 train / 7500 valid).
train_X, valid_X, train_y, valid_y = split_train_validation_data(train_valid_X, train_valid_y, valid_size=0.3)

In [9]:
# Sizes of the three splits after the train/validation split.
print(
    "IMDb reviews (combined): train = {}, valid = {}, test = {}".format(
        *map(len, (train_X, valid_X, test_X))
    )
)

IMDb reviews (combined): train = 17500, valid = 7500, test = 25000


#### Check reviews

In [10]:
# Show the label and raw text of one training review.
print(f"sentiment: {train_y[sample_idx]}", f"review: {train_X[sample_idx]}", sep="\n")

sentiment: 0
review: I went to see it in hopes of some good old fashioned Alice Entertainment.Once I realized I would not be getting that,I watched it for a pretty well made movie (in terms of filming,and yeah..that was it).But aside from it having a good film quality,considering I had been watching grainy movies all day long,there was nothing good about that movie.<br /><br />He killed 42.Why were Tweedle Dee and Dum played by Mudler and Scully?Serisouly,Who can answer that for me?Who can answer anything awful about this movie for me.<br /><br />I agree with whoever said it was just one big long inside joke for the staff.That's all it seemed to be.<br /><br />Poor Mr.Carroll.I'm so sorry somebody did that to his wonderful tales.


In [11]:
# Show the label and raw text of one validation review.
print(f"sentiment: {valid_y[sample_idx]}", f"review: {valid_X[sample_idx]}", sep="\n")

sentiment: 0
review: Friday the 13th step over! There is officially a worse movie than your hateful series out there. I won this movie in a contest at college, and it was a waste of money even if it was free. Jack Jones stars as a truly awful singer whose trying to find some murderers or something. At least Friday the Thirteenth never bored me. I'd rather have my fingernails pulled than see this again.


In [12]:
# Show the label and raw text of one test review.
print(f"sentiment: {test_y[sample_idx]}", f"review: {test_X[sample_idx]}", sep="\n")

sentiment: 1
review: By many accounts, Stu Ungar was not a very nice guy. He spat on dealers, stiffed people he owed money to, and was verbally abusive. <br /><br />Many filmmakers might choose to sugarcoat the man, making him into some sports hero that would triumph despite adversity. But High Roller doesn't do that. And that's a tough row to hoe.<br /><br />Instead, we have to look VERY closely to see a man that never matured passed the frightened little boy from the streets of New York, despite all his successes. And the only real approval he ever gets is from death himself. Very brave (because people won't get it) and very touching (when you do).<br /><br />What is also brave is the use of a Scorsese feel. "Aha! How derivative," people will say. Really? But there's virtually no violence. And Stuey LOVED gangster movies. Maybe the feel reflects the man Stu and not the director Marty? And if it really is a low budget film and looks that good, bravo!<br /><br />Finally, the linear fla

### Step 4: Processing data

In [16]:
# Tokenise all three splits. The recorded run read the result from a cache
# file (preprocessed_data.pkl) rather than recomputing it.
(
    train_pp_X, test_pp_X, valid_pp_X,
    train_pp_y, test_pp_y, valid_pp_y,
) = preprocess_data(
    train_X, test_X, valid_X,
    train_y, test_y, valid_y,
    cache_dir,
)

Read preprocessed data from cache file: preprocessed_data.pkl


#### Check tokenization

In [18]:
# Tokenise the sample training review. The recorded output is a list of
# lower-cased, stem-like tokens with the HTML breaks removed.
print(review_to_words(train_X[sample_idx]))

['went', 'see', 'hope', 'good', 'old', 'fashion', 'alic', 'entertain', 'realiz', 'would', 'get', 'watch', 'pretti', 'well', 'made', 'movi', 'term', 'film', 'yeah', 'asid', 'good', 'film', 'qualiti', 'consid', 'watch', 'graini', 'movi', 'day', 'long', 'noth', 'good', 'movi', 'kill', '42', 'tweedl', 'dee', 'dum', 'play', 'mudler', 'sculli', 'serisouli', 'answer', 'answer', 'anyth', 'aw', 'movi', 'agre', 'whoever', 'said', 'one', 'big', 'long', 'insid', 'joke', 'staff', 'seem', 'poor', 'mr', 'carrol', 'sorri', 'somebodi', 'wonder', 'tale']


In [19]:
# Same tokenisation check for the sample validation review.
print(review_to_words(valid_X[sample_idx]))

['friday', '13th', 'step', 'offici', 'wors', 'movi', 'hate', 'seri', 'movi', 'contest', 'colleg', 'wast', 'money', 'even', 'free', 'jack', 'jone', 'star', 'truli', 'aw', 'singer', 'whose', 'tri', 'find', 'murder', 'someth', 'least', 'friday', 'thirteenth', 'never', 'bore', 'rather', 'fingernail', 'pull', 'see']


In [20]:
# Same tokenisation check for the sample test review.
print(review_to_words(test_X[sample_idx]))

['mani', 'account', 'stu', 'ungar', 'nice', 'guy', 'spat', 'dealer', 'stif', 'peopl', 'owe', 'money', 'verbal', 'abus', 'mani', 'filmmak', 'might', 'choos', 'sugarcoat', 'man', 'make', 'sport', 'hero', 'would', 'triumph', 'despit', 'advers', 'high', 'roller', 'tough', 'row', 'hoe', 'instead', 'look', 'close', 'see', 'man', 'never', 'matur', 'pass', 'frighten', 'littl', 'boy', 'street', 'new', 'york', 'despit', 'success', 'real', 'approv', 'ever', 'get', 'death', 'brave', 'peopl', 'get', 'touch', 'also', 'brave', 'use', 'scorses', 'feel', 'aha', 'deriv', 'peopl', 'say', 'realli', 'virtual', 'violenc', 'stuey', 'love', 'gangster', 'movi', 'mayb', 'feel', 'reflect', 'man', 'stu', 'director', 'marti', 'realli', 'low', 'budget', 'film', 'look', 'good', 'bravo', 'final', 'linear', 'flashback', 'structur', 'wow', 'get', 'hammer', 'yet', 'work', 'work', 'except', 'well', 'even', 'see', 'connect', 'seventh', 'seal', 'proof', 'ss', 'knight', 'play', 'game', 'chess', 'death', 'hr', 'stuey', 'say'

### Step 5: Build dictionary

In [34]:
# Build the vocabulary from the preprocessed training reviews only.
# `word_count` is consumed below through .items() as (word, count) pairs;
# `word_dict` is what the later encoding cells use.
word_dict, sorted_words, word_count = build_dict(train_pp_X, vocab_size=vocab_size)

#### Check word dictionary

In [28]:
# Five most frequent words with their counts.
(
    pd.DataFrame(word_count.items(), columns=["word", "count"])
    .sort_values(by="count", ascending=False)
    .head()
)

Unnamed: 0,word,count
26298,movi,36020
14025,film,33312
28101,one,19170
22800,like,16026
39527,time,11449


#### Update word dictionary

In [57]:
# The two most frequent stems in the table above; they are passed to
# update_dict in the next cell.
words_to_remove = ["movi", "film"]

In [59]:
# Drop the words above from the dictionary; `new_vocab_size` is later passed
# to the SageMaker estimator as the `vocab_size` hyperparameter.
# NOTE(review): the recorded import cell failed with "cannot import name
# 'update_dict' from 'enhanced.processing'" -- confirm the helper exists
# before relying on this cell.
updated_dict, new_vocab_size = update_dict(word_dict, words_to_remove)

In [None]:
# Make sure the output folder exists. exist_ok=True replaces the separate
# os.path.exists() check (no check-then-create race) and still raises if the
# path exists but is not a directory.
os.makedirs(pytorch_dir, exist_ok=True)

In [None]:
# `pickle` is not imported by any earlier cell, so bring it into scope here;
# without this the cell raises NameError on a fresh kernel.
import pickle

# Persist the word dictionary next to the CSV files.
# NOTE(review): this writes `word_dict`, not the `updated_dict` produced by
# update_dict above -- confirm which one the training/serving code expects.
with open(os.path.join(pytorch_dir, 'word_dict_enhanced.pkl'), "wb") as f:
    pickle.dump(word_dict, f)

In [None]:
# Encode the preprocessed training reviews with `word_dict`; the helper
# returns the encoded reviews and a companion array of lengths.
# NOTE(review): `convert_and_pad_data` is not imported in any cell visible
# here -- confirm where it comes from.
train_tf_X, train_X_tf_len = convert_and_pad_data(word_dict, train_pp_X)

In [None]:
# Encode the preprocessed test reviews with the same dictionary.
# NOTE(review): `convert_and_pad_data` is not imported in any cell visible
# here -- confirm where it comes from.
test_tf_X, test_X_tf_len = convert_and_pad_data(word_dict, test_pp_X)

In [None]:
# Encode the preprocessed validation reviews with the same dictionary.
# NOTE(review): `convert_and_pad_data` is not imported in any cell visible
# here -- confirm where it comes from.
valid_tf_X, valid_X_tf_len = convert_and_pad_data(word_dict, valid_pp_X)

In [None]:
import pandas as pd
    
# Write the training and validation sets as header-less CSV files. Column
# order in each row: label, then the length array, then the encoded review
# returned by convert_and_pad_data.
pd.concat(
    [pd.DataFrame(part) for part in (train_y, train_X_tf_len, train_tf_X)],
    axis=1,
).to_csv(os.path.join(pytorch_dir, 'train.csv'), header=False, index=False)

pd.concat(
    [pd.DataFrame(part) for part in (valid_y, valid_X_tf_len, valid_tf_X)],
    axis=1,
).to_csv(os.path.join(pytorch_dir, 'valid.csv'), header=False, index=False)

In [None]:
import boto3
import sagemaker

# SageMaker session pinned to us-east-1, plus the S3 location used for uploads.
sagemaker_session = sagemaker.Session(boto_session=boto3.session.Session(region_name="us-east-1"))

bucket = sagemaker_session.default_bucket()
prefix = 'sagemaker/sentiment_rnn'

# Resolve the IAM role used for training and hosting. When the execution role
# cannot be looked up, fall back to the SAGEMAKER_EXECUTION_ROLE_ARN
# environment variable, and only then to the original account-specific ARN,
# so the notebook can run in another account without editing this cell.
try:
    role = sagemaker.get_execution_role()
except Exception:
    role = os.environ.get(
        "SAGEMAKER_EXECUTION_ROLE_ARN",
        "arn:aws:iam::977053370764:role/service-role/AmazonSageMaker-ExecutionRole-20201202T141643",
    )

In [None]:
# Upload everything under `pytorch_dir` to the session bucket under `prefix`
# and display the returned value, which is later passed to fit() as the
# 'training' channel.
input_data = sagemaker_session.upload_data(path=pytorch_dir, bucket=bucket, key_prefix=prefix)
input_data

In [None]:
import torch
import torch.utils.data

def _frame_to_tensors(frame):
    """Split a header-less frame into (features, labels) tensors.

    Column 0 holds the label (float, squeezed); every other column is kept
    as a long-typed feature column.
    """
    labels = torch.from_numpy(frame[[0]].values).float().squeeze()
    features = torch.from_numpy(frame.drop([0], axis=1).values).long()
    return features, labels


# Only the first 250 rows of each file are loaded for this local smoke test.
train_sample = pd.read_csv(os.path.join(pytorch_dir, 'train.csv'), header=None, names=None, nrows=250)
valid_sample = pd.read_csv(os.path.join(pytorch_dir, 'valid.csv'), header=None, names=None, nrows=250)

# DataFrame -> tensors
train_sample_X, train_sample_y = _frame_to_tensors(train_sample)
valid_sample_X, valid_sample_y = _frame_to_tensors(valid_sample)

# Tensors -> datasets
train_sample_ds = torch.utils.data.TensorDataset(train_sample_X, train_sample_y)
valid_sample_ds = torch.utils.data.TensorDataset(valid_sample_X, valid_sample_y)

# Datasets -> loaders yielding batches of 50 rows
train_sample_dl = torch.utils.data.DataLoader(train_sample_ds, batch_size=50)
valid_sample_dl = torch.utils.data.DataLoader(valid_sample_ds, batch_size=50)

In [None]:
%matplotlib inline
import time
import pylab as pl
from IPython import display

def train(model, train_loader, valid_loader, epochs, optimizer, loss_fn, device):
    """Train ``model`` and live-plot the per-epoch training / validation loss.

    Every epoch runs one optimisation pass over ``train_loader`` followed by
    one evaluation pass over ``valid_loader``. The mean batch loss of each
    pass is recorded, printed and drawn in the cell output.

    Args:
        model: Module called as ``model(batch_X)``; its weights are updated
            in place.
        train_loader: Iterable of ``(batch_X, batch_y)`` batches supporting
            ``len()``.
        valid_loader: Iterable of ``(batch_X, batch_y)`` batches supporting
            ``len()``.
        epochs: Number of epochs to run.
        optimizer: Object whose ``zero_grad()`` / ``step()`` are called once
            per training batch.
        loss_fn: Callable ``loss_fn(output, target)`` returning a tensor that
            supports ``.backward()`` and ``.item()``.
        device: Target passed to ``Tensor.to`` for every batch.

    Returns:
        ``(report_train, model)``: ``report_train`` is a DataFrame with one
        row per epoch and the columns ``epoch``, ``trainError`` and
        ``validError``; ``model`` is the trained model, left in eval mode.
    """
    # Imported here because no earlier cell brings tqdm into scope.
    from tqdm import tqdm

    history = []  # one [epoch, train loss, valid loss] row per epoch
    best_valid_BCELoss = float("inf")
    BCELoss_list = []
    valid_BCELoss_list = []
    for epoch in tqdm(range(1, epochs + 1)):
        # ---- optimisation pass ----
        model.train()
        total_loss = 0
        for batch_X, batch_y in train_loader:
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)

            optimizer.zero_grad()
            output = model(batch_X)
            loss = loss_fn(output, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # ---- evaluation pass ----
        # eval() switches layers such as dropout to inference behaviour and
        # no_grad() avoids building an autograd graph for these batches.
        model.eval()
        total_valid_loss = 0
        with torch.no_grad():
            for block_X, block_y in valid_loader:
                block_X = block_X.to(device)
                block_y = block_y.to(device)
                total_valid_loss += loss_fn(model(block_X), block_y).item()

        BCELoss = total_loss / len(train_loader)
        BCELoss_list.append(BCELoss)
        valid_BCELoss = total_valid_loss / len(valid_loader)
        valid_BCELoss_list.append(valid_BCELoss)

        if valid_BCELoss < best_valid_BCELoss:
            # Track the best *validation* loss (the original stored the
            # training loss here, which broke the comparison above). A
            # checkpoint/export of the best model would go here.
            best_valid_BCELoss = valid_BCELoss

        history.append([epoch, BCELoss, valid_BCELoss])

        # Clear the previous epoch's output first so the summary line printed
        # next stays visible alongside the refreshed figure.
        display.clear_output(wait=True)
        print(f'Epoch: {epoch}, train_loss: {BCELoss}, valid_loss: {valid_BCELoss}')

        # Plot against epoch numbers so the curves line up with xlim(1, epochs).
        epochs_so_far = list(range(1, epoch + 1))
        if epoch == 1:
            # Labels are attached once so the legend has a single entry per curve.
            pl.plot(epochs_so_far, BCELoss_list, '-b', label="TrainError")
            pl.plot(epochs_so_far, valid_BCELoss_list, '-r', label="ValidationError")
            pl.legend(loc='upper right')
        else:
            pl.plot(epochs_so_far, BCELoss_list, '-b')
            pl.plot(epochs_so_far, valid_BCELoss_list, '-r')
        pl.xlim(1, epochs)
        pl.ylim(0, 2)
        display.display(pl.gcf())
        time.sleep(1.0)

    # Build the report once instead of growing a DataFrame row by row.
    report_train = pd.DataFrame(history, columns=["epoch", "trainError", "validError"])
    return report_train, model

In [None]:
import torch.optim as optim
from train.model import LSTMClassifier

# Local smoke test of the training loop: 5 epochs over the 250-row samples,
# on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): 5000 equals `vocab_size` from the configuration cell, while
# the SageMaker estimator below is given `new_vocab_size` -- confirm which
# LSTMClassifier argument is the vocabulary size and whether these must agree.
model = LSTMClassifier(32, 100, 5000).to(device)
optimizer = optim.Adam(model.parameters())
loss_fn = torch.nn.BCELoss()

report_train, trained_model = train(model, train_sample_dl, valid_sample_dl, 5, optimizer, loss_fn, device)

In [None]:
from sagemaker.pytorch import PyTorch

# SageMaker training job definition: runs train.py from the `enhanced`
# folder on a single ml.p2.xlarge instance with the hyperparameters below.
estimator = PyTorch(
    entry_point="train.py",
    source_dir="enhanced",
    role=role,
    framework_version="0.4.0",
    py_version="py3",
    train_instance_count=1,
    train_instance_type="ml.p2.xlarge",
    hyperparameters={
        "epochs": 20,
        "hidden_dim": 200,
        "vocab_size": new_vocab_size,
    },
)

In [None]:
# Start the training job, feeding the uploaded data as the 'training' channel.
estimator.fit({'training': input_data})

In [None]:
# Deploy the trained model to one ml.m4.xlarge instance. `predictor` is the
# global that predict() below sends its requests through.
predictor = estimator.deploy(initial_instance_count=1,instance_type="ml.m4.xlarge")

In [None]:
# Endpoint input for the test set: the length array first, then the encoded
# review -- the same column order as the CSV files, minus the label column.
test_X_pred = pd.concat([pd.DataFrame(test_X_tf_len), pd.DataFrame(test_tf_X)], axis=1)

In [None]:
# We split the data into chunks and send each chunk separately, accumulating the results.

def predict(data, rows=512):
    """Score ``data`` through the global ``predictor`` in chunks.

    Args:
        data: Array-like with a ``shape`` attribute; it is split along its
            first axis.
        rows: Maximum number of rows sent per request.

    Returns:
        1-D numpy array holding the flattened predictions of every chunk, in
        input order. An empty input yields an empty array without contacting
        the endpoint.
    """
    # Local imports: numpy and tqdm are only imported by cells further down,
    # so a fresh top-to-bottom run would otherwise fail with NameError here.
    import math
    import numpy as np

    n_rows = data.shape[0]
    if n_rows == 0:
        # Nothing to send; avoids posting an empty payload to the endpoint.
        return np.array([])

    # The progress bar is only needed once there is work to do.
    from tqdm import tqdm

    # ceil() gives the smallest chunk count that keeps every chunk <= rows
    # (the previous int(n / rows + 1) produced one extra chunk on exact multiples).
    n_chunks = math.ceil(n_rows / rows)
    predictions = np.array([])
    for array in tqdm(np.array_split(data, n_chunks)):
        predictions = np.append(predictions, predictor.predict(array))

    return predictions

In [None]:
# Score the whole test set and round every score to its nearest integer label.
predictions = [round(score) for score in predict(test_X_pred.values)]

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
# Accuracy of the rounded endpoint predictions against the test labels.
accuracy_score(test_y, predictions)

In [None]:
# Confusion matrix for the same labels and predictions.
confusion_matrix(test_y, predictions)

In [None]:
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner

In [None]:
# Hyperparameter tuning job built on the PyTorch estimator defined above.
# NOTE(review): the `xgb_` prefix looks like a leftover name -- the base
# estimator here is PyTorch; the name is kept because later cells use it.
xgb_hyperparameter_tuner = HyperparameterTuner(estimator = estimator, # The estimator object to use as the basis for the training jobs.
                                               objective_metric_name = 'validation:loss', # The metric used to compare trained models.
                                               metric_definitions = [{"Name": "validation:loss","Regex": "BCELoss: (.*?);"}],# How the metric is extracted from the training log; presumably train.py logs "BCELoss: <value>;" -- TODO confirm.
                                               objective_type = 'Minimize', # Whether we wish to minimize or maximize the metric.
                                               max_jobs = 4, # The total number of models to train
                                               max_parallel_jobs = 1, # The number of models to train in parallel
                                               hyperparameter_ranges = {
                                                    'epochs': IntegerParameter(10,11),
                                                    'hidden_dim': IntegerParameter(200,201),
                                                    #'embedding_dim': IntegerParameter(32,40)
                                               })

In [None]:
# Launch the tuning job on the same 'training' channel used by estimator.fit().
xgb_hyperparameter_tuner.fit({'training': input_data})

In [None]:
# Block until the tuning job has finished.
xgb_hyperparameter_tuner.wait()

In [None]:
# Best training job reported by the tuner; despite the variable name, the
# value is passed to Estimator.attach() in the next cell, not used as a model.
best_model = xgb_hyperparameter_tuner.best_training_job()

In [None]:
# Re-create an estimator object bound to the best training job.
best_estimator = sagemaker.estimator.Estimator.attach(best_model)

In [None]:
# Deploy the best model. This rebinds the global `predictor`, so predict()
# will talk to this endpoint from here on.
predictor = best_estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

In [None]:
from sagemaker.predictor import RealTimePredictor
from sagemaker.pytorch import PyTorchModel

class StringPredictor(RealTimePredictor):
    """Predictor that sends its request payloads as ``text/plain``."""

    def __init__(self, endpoint_name, sagemaker_session):
        # Same arguments as the parent constructor, with the content type fixed.
        super().__init__(endpoint_name, sagemaker_session, content_type='text/plain')

# Serving model built from the best training job's artifacts: it uses
# serve/predict.py as entry point and StringPredictor as the client class.
# Note that this rebinds the name `model`.
model = PyTorchModel(
    model_data=best_estimator.model_data,
    role=role,
    entry_point="predict.py",
    source_dir="serve",
    framework_version="0.4.0",
    py_version="py3",
    predictor_cls=StringPredictor,
)

In [None]:
# Deploy the text-input serving model; `predictor` is rebound once more and
# is the endpoint client that test_reviews() below calls.
predictor = model.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

In [None]:
# Endpoint input for the test set (length array first, then the encoded
# review). The previous version of this cell read `test_X_len`, which no cell
# defines, and assigned the result to `test_X`, which would have replaced the
# raw review texts; it now uses the arrays returned by convert_and_pad_data
# and the dedicated `test_X_pred` name.
test_X_pred = pd.concat([pd.DataFrame(test_X_tf_len), pd.DataFrame(test_tf_X)], axis=1)

In [None]:
# We split the data into chunks and send each chunk separately, accumulating the results.

def predict(data, rows=512):
    """Score ``data`` through the global ``predictor`` in chunks.

    Args:
        data: Array-like with a ``shape`` attribute; it is split along its
            first axis.
        rows: Maximum number of rows sent per request.

    Returns:
        1-D numpy array holding the flattened predictions of every chunk, in
        input order. An empty input yields an empty array without contacting
        the endpoint.
    """
    # Local imports: numpy and tqdm are only imported by cells further down,
    # so a fresh top-to-bottom run would otherwise fail with NameError here.
    import math
    import numpy as np

    n_rows = data.shape[0]
    if n_rows == 0:
        # Nothing to send; avoids posting an empty payload to the endpoint.
        return np.array([])

    # The progress bar is only needed once there is work to do.
    from tqdm import tqdm

    # ceil() gives the smallest chunk count that keeps every chunk <= rows
    # (the previous int(n / rows + 1) produced one extra chunk on exact multiples).
    n_chunks = math.ceil(n_rows / rows)
    predictions = np.array([])
    for array in tqdm(np.array_split(data, n_chunks)):
        predictions = np.append(predictions, predictor.predict(array))

    return predictions

In [None]:
import glob
import os
from tqdm import tqdm

def test_reviews(data_dir='data/aclImdb', stop=5000):
    """Send raw test reviews to the global ``predictor`` and collect its scores.

    For each sentiment folder (``pos`` then ``neg``) under ``<data_dir>/test``
    the ``*.txt`` files are read, UTF-8 encoded and sent one at a time.
    Reviews that cannot be read or scored are skipped (best effort) and a
    summary of the skipped files is printed at the end.

    Args:
        data_dir: Root folder that contains ``test/pos`` and ``test/neg``.
        stop: Maximum number of files attempted per sentiment. ``None`` or a
            non-positive value means no limit.

    Returns:
        ``(ground, results)``: two parallel lists. ``ground`` holds 1 for a
        positive and 0 for a negative review; ``results`` holds the endpoint
        score as ``float``.
    """
    results = []
    ground = []
    skipped = 0
    first_error = None

    # We make sure to test both positive and negative reviews
    for sentiment in ['pos', 'neg']:
        label = 1 if sentiment == 'pos' else 0

        pattern = os.path.join(data_dir, 'test', sentiment, '*.txt')
        # glob returns files in arbitrary order; sorting makes the evaluated
        # subset the same on every run.
        files = sorted(glob.glob(pattern))

        # Sending reviews to our endpoint one at a time takes a while, so we
        # only send a limited number of them. Skipped files still count
        # towards the limit, as before.
        if stop is not None and stop > 0:
            files = files[:stop]

        print('Starting ', sentiment, ' files')

        # Iterate through the files and send them to the predictor
        for f in tqdm(files):
            try:
                # Explicit encoding so the read does not depend on the
                # platform default; the payload is sent as UTF-8 bytes.
                with open(f, encoding='utf-8') as review:
                    review_input = review.read().encode('utf-8')
                score = float(predictor.predict(review_input))
            except Exception as err:
                # Best effort: skip this review but remember that it failed.
                # Catching Exception (not a bare except) keeps Ctrl-C working.
                skipped += 1
                if first_error is None:
                    first_error = err
                continue

            # Prediction and ground truth are stored together so the two
            # lists always stay aligned.
            results.append(score)
            ground.append(label)

    if skipped:
        print(f'Skipped {skipped} review(s) that could not be scored; first error: {first_error!r}')

    return ground, results

In [None]:
# Score the raw test reviews through the deployed endpoint; `ground` and
# `results` feed the ROC / accuracy cells below.
ground, results = test_reviews()

# Local inference over the encoded test set, one review at a time.
# NOTE(review): `n_test_sample` and `predicted_values` are not defined in any
# cell visible here, so this loop raises NameError on a fresh run -- confirm
# whether it is a leftover.
# NOTE(review): an earlier cell rebinds `model` to a PyTorchModel; the locally
# trained network is kept in `trained_model` -- confirm which one is intended.
for idx in tqdm(range(n_test_sample)):
    pr_data_X = test_tf_X[idx]
    pr_data_len = test_X_tf_len[idx]
    # Prepend the length to the encoded review (same layout as the CSV rows
    # without the label), then shape it as a single-row batch.
    pr_data_pack = np.hstack((pr_data_len, pr_data_X))
    pr_data_pack = pr_data_pack.reshape(1, -1)
    pr_data = torch.from_numpy(pr_data_pack)
    pr_data = pr_data.to(device)
    # eval() is re-applied on every iteration; it could be hoisted above the loop.
    model.eval()
    # presumably needs .cpu() before .numpy() when `device` is CUDA -- TODO confirm
    result = model(pr_data).detach().numpy()
    predicted_values.append(result)

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve
# ROC AUC and ROC curve points for the endpoint scores (`results`) against
# the ground-truth labels (`ground`) returned by test_reviews().
roc_auc = roc_auc_score(ground, results)
fpr, tpr, _ = roc_curve(ground, results)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# ROC curve of the endpoint scores, with the diagonal as reference line.
fig, ax = plt.subplots()
lw = 2
ax.plot(fpr, tpr, color='darkorange', lw=lw,
        label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic example')
ax.legend(loc="lower right")
plt.show()

In [None]:
import numpy as np

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
# Accuracy of the endpoint scores after rounding each one to the nearest integer.
accuracy_score(ground, np.rint(results))

In [None]:
# Confusion matrix for the same rounded scores.
confusion_matrix(ground, np.rint(results))