## Install huggingface transformers

In [1]:
!pip install transformers
!pip install pytorch-nlp


Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
[K    100% |████████████████████████████████| 573kB 16.5MB/s ta 0:00:01
Collecting regex!=2019.12.17 (from transformers)
[?25l  Downloading https://files.pythonhosted.org/packages/1d/c1/c90beb2dbbfbf19f3634e16a441d5f11fa787bdf0748a35b8b88452c0e78/regex-2020.4.4-cp36-cp36m-manylinux1_x86_64.whl (679kB)
[K    100% |████████████████████████████████| 686kB 24.6MB/s ta 0:00:01
[?25hCollecting sentencepiece (from transformers)
[?25l  Downloading https://files.pythonhosted.org/packages/98/2c/8df20f3ac6c22ac224fff307ebc102818206c53fc454ecd37d8ac2060df5/sentencepiece-0.1.86-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K    100% |████████████████████████████████| 1.0MB 22.7MB/s ta 0:00:01
[?25hCollecting sacremoses (from transformers)
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/

In [2]:
import torch                                        # root package
from torch.utils.data import Dataset, DataLoader    # dataset representation and loading


In [3]:
import os
import boto3
import sagemaker
from sagemaker import get_execution_role

# Session and execution role are reused by later cells: `sess` for the default
# bucket lookup and the S3 uploads, `role` for the estimator.
sess = sagemaker.Session()
role = get_execution_role()
# Show the resolved role. NOTE(review): the printed ARN contains the AWS account
# id; consider clearing this output before sharing the notebook.
print(role)

arn:aws:iam::733425554560:role/service-role/AmazonSageMaker-ExecutionRole-20200504T094270


## 1. Obtain dataset

### We don't want to use the prepared dataset as is, so we convert the sample dataset into CSV

In [4]:
import csv
import os
from torchnlp.datasets import imdb_dataset


def _write_reviews_csv(reviews, path):
    """Write review records to ``path`` as a two-column CSV (``text``, ``sentiment``).

    Args:
        reviews: Iterable of mappings that provide ``'text'`` and ``'sentiment'`` entries.
        path: Destination file; it is created or overwritten.
    """
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires; an explicit utf-8 encoding keeps non-ASCII review
    # text independent of the platform's default locale.
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',')
        csvwriter.writerow(['text', 'sentiment'])
        csvwriter.writerows([review['text'], review['sentiment']] for review in reviews)


train, test = imdb_dataset(train=True, test=True)

# open(..., 'w') does not create parent directories, so make sure data/ exists
# before writing; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs('data', exist_ok=True)
_write_reviews_csv(train, 'data/train.csv')
_write_reviews_csv(test, 'data/test.csv')

In [5]:
import pandas as pd
# Numeric ids that replace the sentiment strings stored in the CSV files.
mapping = {'neg': 0, 'pos': 1}


def _load_split(csv_path):
    """Read one split from CSV and replace its sentiment strings using ``mapping``."""
    frame = pd.read_csv(csv_path)
    return frame.replace({'sentiment': mapping})


train_df = _load_split("data/train.csv")
test_df = _load_split("data/test.csv")

In [6]:
# Spot-check 10 randomly drawn training rows; no seed is passed, so the rows
# shown differ from run to run.
train_df.sample(10)

Unnamed: 0,text,sentiment
3460,"""Sasquatch Hunters"" actually wasn't as bad as ...",1
17539,"I don't understand how ""2 of us"" receive such ...",0
13966,Neat premise. Very unrealistic. What I learned...,0
9461,Does anyone else cry tears of joy when they wa...,1
19198,There is absolutely nothing in this movie that...,0
18,"From the mind of Robert Bloch, of ""Psycho"" fam...",1
10375,"I second the motion to make this into a movie,...",1
15459,I caught this stink bomb of a movie recently o...,0
9437,"Visconti's first feature, Ossessione is an ada...",1
22616,Nightmare Weekend stars a cast of ridiculous a...,0


In [7]:
# Spot-check 10 randomly drawn test rows; no seed is passed, so the rows shown
# differ from run to run.
test_df.sample(10)

Unnamed: 0,text,sentiment
9419,"Ben (a fine Charles Bateman), his young daught...",1
15019,This is only the fourth effort I’ve watched fr...,0
3862,While some of the things in Haggard are dumb a...,1
8740,"First, a word of caution. The DVD box describe...",1
9547,"Multiply named and strangely casted, ""One Dark...",1
993,I must admit that I didn't get around to seein...,1
24200,"I love comedies and I love independent films, ...",0
22979,"I'm usually quite tolerant of movies, and very...",0
8928,I remember this movie getting a lot of flak fr...,1
24856,The plot was not good.<br /><br />The special ...,0


In [8]:
# Keep a random 10% of each split. A fixed random_state makes the draw
# repeatable; without it every run selects (and later uploads) different rows.
# NOTE: this cell rebinds train_df/test_df, so running it a second time without
# re-running the loading cell shrinks the data again.
SAMPLE_FRACTION = 0.1
SAMPLE_SEED = 4711  # same value as the "seed" hyperparameter given to the training job

train_df = train_df.sample(n=int(len(train_df) * SAMPLE_FRACTION), random_state=SAMPLE_SEED)
test_df = test_df.sample(n=int(len(test_df) * SAMPLE_FRACTION), random_state=SAMPLE_SEED)

In [9]:
# Pull the text and label columns out of each frame as plain arrays.
train_sentences, train_labels = train_df["text"].values, train_df["sentiment"].values
test_sentences, test_labels = test_df["text"].values, test_df["sentiment"].values

In [10]:
import numpy as np
# Create one local folder per split (a no-op when it already exists).
for split_dir in ("./datasets/train", "./datasets/test"):
    os.makedirs(split_dir, exist_ok=True)

# Destination file -> array to store there.
# NOTE(review): the sentence arrays are presumably object-dtype, in which case
# np.save stores them via pickle and the loader needs allow_pickle=True — confirm
# against the training script.
arrays_by_file = {
    "./datasets/train/train_sentences.npy": train_sentences,
    "./datasets/train/train_labels.npy": train_labels,
    "./datasets/test/test_sentences.npy": test_sentences,
    "./datasets/test/test_labels.npy": test_labels,
}
for npy_path, array in arrays_by_file.items():
    np.save(npy_path, array)


In [11]:
# Everything this notebook stores in S3 lives under PREFIX inside the session's
# default bucket.
PREFIX = 'bert-classification-janossch'
BUCKET_NAME = sess.default_bucket()

# Object-key prefixes (no scheme, no bucket) used when uploading the two splits.
traindata_s3_prefix = f"{PREFIX}/datasets/train"
testdata_s3_prefix = f"{PREFIX}/datasets/test"

# Full S3 URI of the models/ folder under PREFIX.
output_s3 = f"s3://{BUCKET_NAME}/{PREFIX}/models/"

In [12]:
def _upload_dir(local_dir, key_prefix):
    """Upload ``local_dir`` to BUCKET_NAME under ``key_prefix`` and return what ``upload_data`` returns."""
    return sess.upload_data(path=local_dir, bucket=BUCKET_NAME, key_prefix=key_prefix)


train_s3 = _upload_dir("./datasets/train/", traindata_s3_prefix)
test_s3 = _upload_dir("./datasets/test/", testdata_s3_prefix)

### Tokenize Dataset (moved to the training script)

In [13]:
# S3 URIs handed to the training job, built from the same key prefixes that the
# upload cell used so the two cannot drift apart.
train_channel = f"s3://{BUCKET_NAME}/{traindata_s3_prefix}/"
test_channel = f"s3://{BUCKET_NAME}/{testdata_s3_prefix}/"

In [25]:
from sagemaker.pytorch.estimator import PyTorch as PyTorchEstimator
# Values passed through to the entry-point script. The keys are kept exactly as
# written because the script is what interprets them.
training_hyperparameters = {
    "seed": 4711,
    "log_level": "DEBUG",
    'batch-size': 32,
}

# Both time limits below are one hour, expressed in seconds.
ONE_HOUR_SECONDS = 60 * 60

estimator_settings = dict(
    # Training code: src/janossch-train.py
    entry_point="janossch-train.py",
    source_dir="src",
    # Job naming and output location
    base_job_name="bert-classification",
    output_path=f"s3://{BUCKET_NAME}/{PREFIX}/",
    # Framework container
    framework_version="1.4.0",
    py_version="py3",
    # Compute: a single "local_gpu" instance
    # NOTE(review): the saved log shows a local container being started; confirm
    # before switching to a managed instance type.
    role=role,
    train_instance_count=1,
    train_instance_type="local_gpu",
    train_max_run=ONE_HOUR_SECONDS,
    train_max_wait=ONE_HOUR_SECONDS,
    hyperparameters=training_hyperparameters,
)
estimator = PyTorchEstimator(**estimator_settings)

In [26]:
# Channel name -> S3 location of that channel's data.
data_channels = {
    "train": train_channel,
    "test": test_channel,
}
# Start the training job; its log is streamed into this cell's output.
estimator.fit(data_channels)

Creating tmp8f6bj_il_algo-1-tnt5x_1 ... 
[1BAttaching to tmp8f6bj_il_algo-1-tnt5x_12mdone[0m
[36malgo-1-tnt5x_1  |[0m 2020-05-06 06:31:14,852 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training
[36malgo-1-tnt5x_1  |[0m 2020-05-06 06:31:14,879 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
[36malgo-1-tnt5x_1  |[0m 2020-05-06 06:31:14,882 sagemaker_pytorch_container.training INFO     Invoking user training script.
[36malgo-1-tnt5x_1  |[0m 2020-05-06 06:31:15,002 sagemaker-containers INFO     Module default_user_module_name does not provide a setup.py. 
[36malgo-1-tnt5x_1  |[0m Generating setup.py
[36malgo-1-tnt5x_1  |[0m 2020-05-06 06:31:15,002 sagemaker-containers INFO     Generating setup.cfg
[36malgo-1-tnt5x_1  |[0m 2020-05-06 06:31:15,002 sagemaker-containers INFO     Generating MANIFEST.in
[36malgo-1-tnt5x_1  |[0m 2020-05-06 06:31:15,003 sagemaker-containers INFO     Installing module with t

[36malgo-1-tnt5x_1  |[0m There are 1 GPU(s) available.
[36malgo-1-tnt5x_1  |[0m We will use the GPU: Tesla K80
[36malgo-1-tnt5x_1  |[0m 
[36malgo-1-tnt5x_1  |[0m Training...
[36malgo-1-tnt5x_1  |[0m   Batch    40  of     79.    Elapsed: 0:00:23.
[36malgo-1-tnt5x_1  |[0m 
[36malgo-1-tnt5x_1  |[0m   Average training loss: 0.59
[36malgo-1-tnt5x_1  |[0m   Training epoch took: 0:00:45
[36malgo-1-tnt5x_1  |[0m 
[36malgo-1-tnt5x_1  |[0m Running Validation...
[36malgo-1-tnt5x_1  |[0m   Accuracy: 0.74
[36malgo-1-tnt5x_1  |[0m   Validation Loss: 0.02
[36malgo-1-tnt5x_1  |[0m   Validation took: 0:00:14
[36malgo-1-tnt5x_1  |[0m 
[36malgo-1-tnt5x_1  |[0m Training...
[36malgo-1-tnt5x_1  |[0m   Batch    40  of     79.    Elapsed: 0:00:23.
[36malgo-1-tnt5x_1  |[0m 
[36malgo-1-tnt5x_1  |[0m   Average training loss: 0.42
[36malgo-1-tnt5x_1  |[0m   Training epoch took: 0:00:45
[36malgo-1-tnt5x_1  |[0m 
[36malgo-1-tnt5x_1  |[0m Running Validation...
[36malgo-1-tn