## Install Hugging Face transformers and pytorch-nlp

In [1]:
# Install the two third-party packages this notebook needs on top of the
# kernel image: Hugging Face `transformers`, and `pytorch-nlp`, which provides
# the `torchnlp` package imported below for the IMDB dataset.
# NOTE(review): neither version is pinned, so a later re-run may pull different
# releases than the ones the recorded outputs were produced with.
!pip install transformers
!pip install pytorch-nlp


[31mfastai 1.0.60 requires nvidia-ml-py3, which is not installed.[0m
[33mYou are using pip version 10.0.1, however version 20.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[31mfastai 1.0.60 requires nvidia-ml-py3, which is not installed.[0m
[33mYou are using pip version 10.0.1, however version 20.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
import torch                                        # root package
from torch.utils.data import Dataset, DataLoader    # dataset representation and loading


In [3]:
import os
import boto3
import sagemaker
from sagemaker import get_execution_role

# SageMaker session (used later for the default bucket and the S3 uploads) and
# the IAM execution role that is handed to the estimator.
sess = sagemaker.Session()
role = get_execution_role()
# NOTE(review): this prints the full role ARN, including the AWS account id,
# into the saved cell output -- consider clearing the output before sharing.
print(role)

arn:aws:iam::733425554560:role/service-role/AmazonSageMaker-ExecutionRole-20200504T094270


## 1. Obtain dataset

### We don't want to use the prepared dataset as is, so we convert the sample dataset into CSV

In [18]:
import csv
import os

from torchnlp.datasets import imdb_dataset


def _write_reviews_csv(path, reviews):
    """Write review records to ``path`` as a two-column CSV (text, sentiment).

    Args:
        path: Destination file. Missing parent directories are created so the
            cell also works on a fresh checkout / fresh kernel.
        reviews: Iterable of mappings that expose ``'text'`` and
            ``'sentiment'`` keys.
    """
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # newline='' is what the csv module asks for (otherwise line endings get
    # translated a second time on some platforms); the explicit encoding keeps
    # non-ASCII review text independent of the machine's locale.
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',')
        csvwriter.writerow(['text', 'sentiment'])
        csvwriter.writerows([row['text'], row['sentiment']] for row in reviews)


# Load both IMDB splits, then dump each one to its own CSV file.
train, test = imdb_dataset(train=True, test=True)

_write_reviews_csv('data/train.csv', train)
_write_reviews_csv('data/test.csv', test)

In [19]:
import pandas as pd

# String label -> integer class id.
mapping = {'neg': 0, 'pos': 1}


def _load_labeled_reviews(path, label_map):
    """Read a review CSV and encode its ``sentiment`` column as integers.

    Args:
        path: CSV file with ``text`` and ``sentiment`` columns.
        label_map: Dict translating each raw sentiment value to an int.

    Returns:
        A new DataFrame whose ``sentiment`` column has dtype int64.

    Raises:
        ValueError: If the file contains a sentiment value that is missing
            from ``label_map`` (previously such rows passed through unchanged).
    """
    frame = pd.read_csv(path)
    encoded = frame['sentiment'].map(label_map)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(set(frame.loc[unmapped, 'sentiment'].astype(str)))
        raise ValueError(f"unexpected sentiment labels in {path}: {unknown}")
    # Explicit cast: do not rely on pandas inferring an integer dtype.
    return frame.assign(sentiment=encoded.astype('int64'))


train_df = _load_labeled_reviews("data/train.csv", mapping)
test_df = _load_labeled_reviews("data/test.csv", mapping)

In [20]:
# Spot-check ten random training rows; sentiment should now be 0 or 1.
train_df.sample(10)

Unnamed: 0,text,sentiment
7082,The only reason I'm giving this a 9 is that th...,1
4755,This movie is to Halloween what the hilarious ...,1
17922,"I had some expectation for the movie, since it...",0
20369,Having read during many years about how great ...,0
14677,Ludicrous violations of the most basic securit...,0
13590,"Don't get me wrong, the movie is beautiful, th...",0
9905,<br /><br />Arriving by boxcar in New York Cit...,1
3211,"Even though it has one of the standard ""Reveng...",1
16520,Tim (Gary Daniels) wants desperately to break ...,0
8308,I can remember this movie from when i was a sm...,1


In [21]:
# Same spot-check for the test split.
test_df.sample(10)

Unnamed: 0,text,sentiment
18740,May be I don't get it right. I mean the movie....,0
6598,"Bullets may not have bounced off his chest, bu...",1
19165,"When Family Guy first premiered, I was not in ...",0
17192,Pretty visuals and a lot of fights make not a ...,0
15029,I'm thinking of some things for this movie: Fi...,0
19574,"When The Spirits Within was released, all you ...",0
15814,Being a fan of the game and watching this film...,0
22423,Ghost Story has an interesting feminist reveng...,0
16956,Is this movie as bad as some claim? In my opin...,0
4501,The movie is a starter to what really happened...,1


In [8]:
# Optional down-sampling to a fraction of each split for faster iteration.
# The draw is seeded so the same rows are selected on every fresh run.
# Caveat: the cell rebinds train_df/test_df, so re-running it without
# re-loading the CSVs shrinks the frames again.
# NOTE(review): the recorded training log reports 782 batches at batch size 32
# (about 25,000 rows) and this cell's execution count is out of order, so it
# was probably skipped in that run -- confirm whether the subsample is wanted.
SUBSAMPLE_FRACTION = 0.1
SUBSAMPLE_SEED = 4711  # same value as the "seed" hyperparameter sent to the training job

train_df = train_df.sample(
    int(len(train_df) * SUBSAMPLE_FRACTION), random_state=SUBSAMPLE_SEED
)
test_df = test_df.sample(
    int(len(test_df) * SUBSAMPLE_FRACTION), random_state=SUBSAMPLE_SEED
)

In [22]:
# Pull the review texts and their labels out of each frame as arrays,
# one (sentences, labels) pair per split.
train_sentences, train_labels = (
    train_df[column].values for column in ("text", "sentiment")
)
test_sentences, test_labels = (
    test_df[column].values for column in ("text", "sentiment")
)

In [23]:
import numpy as np

# Persist every array as ./datasets/<split>/<name>.npy, creating each split
# directory right before its files are written.
arrays_by_split = (
    ("train", (("train_sentences", train_sentences), ("train_labels", train_labels))),
    ("test", (("test_sentences", test_sentences), ("test_labels", test_labels))),
)
for split_name, named_arrays in arrays_by_split:
    split_dir = f"./datasets/{split_name}"
    os.makedirs(split_dir, exist_ok=True)
    for array_name, array in named_arrays:
        np.save(f"{split_dir}/{array_name}.npy", array)


In [24]:
# S3 layout for this experiment: everything lives in the session's default
# bucket under a single project prefix.
BUCKET_NAME = sess.default_bucket()
PREFIX = 'bert-classification-janossch'

# Key prefixes (no scheme, no bucket) under which the train/test arrays are uploaded.
traindata_s3_prefix = f"{PREFIX}/datasets/train"
testdata_s3_prefix = f"{PREFIX}/datasets/test"
# NOTE(review): output_s3 is not referenced again in the visible part of the
# notebook; the estimator builds its own output_path without the "models/"
# suffix -- confirm which location is intended.
output_s3 = f"s3://{BUCKET_NAME}/{PREFIX}/models/"

In [25]:
# Upload the local ./datasets/<split>/ directories to the bucket under the
# key prefixes defined above.
# train_s3 / test_s3 keep whatever upload_data returns (presumably the S3 URI
# of the uploaded data -- confirm); they are not used in the visible cells.
train_s3 = sess.upload_data(path="./datasets/train/", bucket=BUCKET_NAME, key_prefix=traindata_s3_prefix)
test_s3 = sess.upload_data(path="./datasets/test/", bucket=BUCKET_NAME, key_prefix=testdata_s3_prefix)

In [26]:
# Input-channel URIs for the training job, assembled from the same bucket and
# key prefixes the upload cell used so the two cannot drift apart.
train_channel = "s3://{}/{}/".format(BUCKET_NAME, traindata_s3_prefix)
test_channel = "s3://{}/{}/".format(BUCKET_NAME, testdata_s3_prefix)

In [27]:
from sagemaker.pytorch.estimator import PyTorch as PyTorchEstimator

# Upper bound, in seconds, used for both the run and the wait limit below.
ONE_HOUR_SECONDS = 60 * 60

# Values handed to the entry-point script as hyperparameters.
training_hyperparameters = {
    "seed": 4711,
    "log_level": "DEBUG",
    "batch-size": 32,
}

# PyTorch training job definition: script location, container version,
# execution role, compute and limits.
# NOTE(review): train_max_wait is normally paired with spot training, which is
# not enabled here -- confirm it is intended before switching away from the
# local_gpu instance type.
estimator = PyTorchEstimator(
    # Training code
    source_dir="src",
    entry_point="janossch-train.py",
    hyperparameters=training_hyperparameters,

    # Job naming and artifact location
    base_job_name="bert-classification",
    output_path=f"s3://{BUCKET_NAME}/{PREFIX}/",

    # Container
    framework_version="1.4.0",
    py_version="py3",

    # Permissions, compute and time limits
    role=role,
    train_instance_type="local_gpu",
    train_instance_count=1,
    train_max_run=ONE_HOUR_SECONDS,
    train_max_wait=ONE_HOUR_SECONDS,
)

In [28]:
# Start training with two named input channels, "train" and "test", pointing
# at the S3 locations of the uploaded arrays.
estimator.fit({ "train": train_channel, "test": test_channel })

Creating tmpot1c5y9f_algo-1-np25x_1 ... 
[1BAttaching to tmpot1c5y9f_algo-1-np25x_12mdone[0m
[36malgo-1-np25x_1  |[0m 2020-05-06 08:50:37,750 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training
[36malgo-1-np25x_1  |[0m 2020-05-06 08:50:37,774 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
[36malgo-1-np25x_1  |[0m 2020-05-06 08:50:37,777 sagemaker_pytorch_container.training INFO     Invoking user training script.
[36malgo-1-np25x_1  |[0m 2020-05-06 08:50:37,911 sagemaker-containers INFO     Module default_user_module_name does not provide a setup.py. 
[36malgo-1-np25x_1  |[0m Generating setup.py
[36malgo-1-np25x_1  |[0m 2020-05-06 08:50:37,911 sagemaker-containers INFO     Generating setup.cfg
[36malgo-1-np25x_1  |[0m 2020-05-06 08:50:37,912 sagemaker-containers INFO     Generating MANIFEST.in
[36malgo-1-np25x_1  |[0m 2020-05-06 08:50:37,912 sagemaker-containers INFO     Installing module with t

[36malgo-1-np25x_1  |[0m There are 1 GPU(s) available.
[36malgo-1-np25x_1  |[0m We will use the GPU: Tesla K80
[36malgo-1-np25x_1  |[0m 
[36malgo-1-np25x_1  |[0m Training...
[36malgo-1-np25x_1  |[0m   Batch    40  of    782.    Elapsed: 0:00:22.
[36malgo-1-np25x_1  |[0m   Batch    80  of    782.    Elapsed: 0:00:43.
[36malgo-1-np25x_1  |[0m   Batch   120  of    782.    Elapsed: 0:01:05.
[36malgo-1-np25x_1  |[0m   Batch   160  of    782.    Elapsed: 0:01:27.
[36malgo-1-np25x_1  |[0m   Batch   200  of    782.    Elapsed: 0:01:48.
[36malgo-1-np25x_1  |[0m   Batch   240  of    782.    Elapsed: 0:02:10.
[36malgo-1-np25x_1  |[0m   Batch   280  of    782.    Elapsed: 0:02:32.
[36malgo-1-np25x_1  |[0m   Batch   320  of    782.    Elapsed: 0:02:54.
[36malgo-1-np25x_1  |[0m   Batch   360  of    782.    Elapsed: 0:03:15.
[36malgo-1-np25x_1  |[0m   Batch   400  of    782.    Elapsed: 0:03:37.
[36malgo-1-np25x_1  |[0m   Batch   440  of    782.    Elapsed: 0:03:59.
[36