## Setup environment

In [2]:
# For easier dev of local modules: load IPython's autoreload extension and use
# mode 2, so edits to imported local modules are picked up without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

# Python Built-Ins:

# External Dependencies:
import boto3
import sagemaker
from sagemaker.pytorch.estimator import PyTorch as PyTorchEstimator


In [3]:
# S3 bucket used for the uploaded datasets and the training job output.
BUCKET_NAME = "2020-05-gym-bert"
# Persist the value with IPython's %store so other kernels can restore it
# with `%store -r BUCKET_NAME`.
%store BUCKET_NAME

Stored 'BUCKET_NAME' (str)


In [218]:
# --- boto3 side: a plain session plus the handles derived from it ---
botosess = boto3.session.Session()
region = botosess.region_name
s3 = botosess.resource("s3")
bucket = s3.Bucket(BUCKET_NAME)
smclient = botosess.client("sagemaker")

# --- SageMaker SDK side: session used for uploads, and the execution role
# that is handed to the estimator ---
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

In [9]:
# Download the public CoLA 1.1 archive into the notebook's working directory.
!wget 'https://nyu-mll.github.io/CoLA/cola_public_1.1.zip'

--2020-05-05 04:42:34--  https://nyu-mll.github.io/CoLA/cola_public_1.1.zip
Resolving nyu-mll.github.io (nyu-mll.github.io)... 185.199.110.153, 185.199.111.153, 185.199.108.153, ...
Connecting to nyu-mll.github.io (nyu-mll.github.io)|185.199.110.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 255330 (249K) [application/zip]
Saving to: ‘cola_public_1.1.zip’


2020-05-05 04:42:34 (24.1 MB/s) - ‘cola_public_1.1.zip’ saved [255330/255330]



In [10]:
# Extract the archive; it unpacks into ./cola_public/ with raw/ and tokenized/ subfolders.
!unzip cola_public_1.1.zip

Archive:  cola_public_1.1.zip
   creating: cola_public/
  inflating: cola_public/README      
   creating: cola_public/tokenized/
  inflating: cola_public/tokenized/in_domain_dev.tsv  
  inflating: cola_public/tokenized/in_domain_train.tsv  
  inflating: cola_public/tokenized/out_of_domain_dev.tsv  
   creating: cola_public/raw/
  inflating: cola_public/raw/in_domain_dev.tsv  
  inflating: cola_public/raw/in_domain_train.tsv  
  inflating: cola_public/raw/out_of_domain_dev.tsv  


In [144]:
import pandas as pd

# Read the raw in-domain training split. The file is tab-separated and has no
# header row, so the four column names are supplied explicitly.
df = pd.read_csv(
    "./cola_public/raw/in_domain_train.tsv",
    sep='\t',
    header=None,
    names=['sentence_source', 'label', 'label_notes', 'sentence'],
)

# How many sentences were loaded?
print(f'Number of training sentences: {len(df):,}\n')

# Peek at ten randomly chosen rows (no seed is set, so the sample varies per run).
df.sample(10)


Number of training sentences: 8,551



Unnamed: 0,sentence_source,label,label_notes,sentence
8455,ad03,1,,Frieda closed the door
7486,sks13,1,,John thinks that Bill left.
2151,l-93,0,*,Door frames hit easily.
3134,l-93,1,,Paul laughed.
758,bc01,1,,We proved Smith conclusively to be the thief.
5143,ks08,1,,It was the director that she said she wants to...
4393,ks08,1,,Mary was solving the problem.
1740,r-67,1,,"They said that Tom would pay up, and pay up I ..."
2626,l-93,0,*,The king banished the general from the palace ...
1665,r-67,0,*,"Fluffy is sick, as nobody knows."


In [145]:
# Keep only the label, its annotation note and the sentence text;
# the sentence_source column is left behind.
data = df.loc[:, ['label', 'label_notes', 'sentence']]

In [146]:
# Renumber the rows from 0; the previous index is kept as a regular 'index' column.
data = data.reset_index(drop=False)

In [147]:
# Remove the 'index' column again, leaving just the selected data columns.
data = data.drop(columns=['index'])

In [149]:
# 80/20 split: draw 80% of the rows for training with a fixed seed (200) so the
# split is repeatable; every row that was not drawn becomes validation data.
train = data.sample(frac=0.8, random_state=200)
val = data.drop(index=train.index)

In [209]:
!mkdir 'data'

In [211]:
!mkdir './data/train/' './data/val/'

In [212]:
# Persist both splits as tab-separated files without a header row; the row
# index is written as the first column.
train.to_csv(
    './data/train/train.tsv',
    sep='\t',
    index=True,
    header=None,
)
val.to_csv(
    './data/val/dev.tsv',
    sep='\t',
    index=True,
    header=None,
)

In [214]:
# Local paths of the two TSV splits written by the to_csv cell.
train_filename = './data/train/train.tsv'
# Must be relative (leading '.'): the validation split is written to
# ./data/val/dev.tsv, whereas '/data/val/dev.tsv' would point at the
# filesystem root where no such file is created.
val_filename = './data/val/dev.tsv'

In [242]:
# S3 key prefixes for the two channels; both live under "classification_data/".
traindata_s3_prefix, valdata_s3_prefix = (
    f"classification_data/{split}" for split in ("train", "val")
)

In [243]:
# Upload each local split folder to the bucket under its prefix; the returned
# S3 URIs are what the training job receives as input channels.
train_channel = sess.upload_data(
    path="./data/train/",
    bucket=BUCKET_NAME,
    key_prefix=traindata_s3_prefix,
)
val_channel = sess.upload_data(
    path="./data/val/",
    bucket=BUCKET_NAME,
    key_prefix=valdata_s3_prefix,
)


In [253]:
# Show the S3 URI returned for the validation channel as a sanity check.
val_channel

's3://2020-05-gym-bert/classification_data/val'

In [292]:
# Configure the SageMaker PyTorch training job that runs src/train2.py on two
# ml.p3.2xlarge instances and writes its output to the project bucket.
# NOTE(review): the logs captured from this job report a total train batch size
# of 16 and 6841 examples on *each* host, which suggests the two instances may
# be training independently rather than as one distributed job — confirm in
# train2.py.
estimator = PyTorchEstimator(
    entry_point="train2.py",
    source_dir="src",

    # Typo fixed: the job-name prefix previously read "calssification".
    base_job_name="distilbert-classification-distributed",
    #checkpoint_s3_uri=f"s3://{BUCKET_NAME}/checkpoints",
    output_path=f"s3://{BUCKET_NAME}/",

    framework_version="1.4.0",
    py_version="py3",

    role=role,
    train_instance_count=2,
    train_instance_type="ml.p3.2xlarge", # "ml.p2.xlarge",
    train_volume_size=50,
    # Spot training is left disabled for now.
    # Checkpoint saving might be part-working but resume definitely isn't yet:
    #train_max_run=60*60,
    #train_max_wait=60*60,
    #train_use_spot_instances=True,
    # Passed to the entry-point script as hyperparameters.
    hyperparameters={
        "seed": 1337,
        "per_gpu_train_batch_size": 16,
        "per_gpu_eval_batch_size": 16,
        "num_train_epochs": 10,
        "logging_steps": 250,
        "save_steps": 250,
        "model_type": "distilbert",
        "model_name": "distilbert-base-uncased",
        "config_name": "distilbert-base-uncased",
        "tokenizer_name": "distilbert-base-uncased",
        #"log-level": "ERROR",
    },
)

In [293]:
# Launch the training job, wiring the uploaded S3 locations to the
# "train" and "validation" input channels.
estimator.fit(
    {
        "train": train_channel,
        "validation": val_channel,
    }
)

2020-05-05 16:24:56 Starting - Starting the training job...
2020-05-05 16:24:57 Starting - Launching requested ML instances...
2020-05-05 16:25:56 Starting - Preparing the instances for training.........
2020-05-05 16:27:25 Downloading - Downloading input data
2020-05-05 16:27:25 Training - Downloading the training image........[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2020-05-05 16:28:34,147 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2020-05-05 16:28:34,171 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m

2020-05-05 16:28:47 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2020-05-05 16:28:48,716 sagemaker-containers INFO     Imported framework sage

[35mInstalling collected packages: filelock, sentencepiece, tokenizers, regex, sacremoses, transformers, tensorboardX, default-user-module-name[0m
[35mSuccessfully installed default-user-module-name-1.0.0 filelock-3.0.12 regex-2020.4.4 sacremoses-0.0.43 sentencepiece-0.1.86 tensorboardX-2.0 tokenizers-0.5.2 transformers-2.8.0[0m
[35m2020-05-05 16:29:05,555 sagemaker-containers INFO     Invoking user script
[0m
[35mTraining Env:
[0m
[35m{
    "additional_framework_parameters": {},
    "channel_input_dirs": {
        "validation": "/opt/ml/input/data/validation",
        "train": "/opt/ml/input/data/train"
    },
    "current_host": "algo-2",
    "framework_module": "sagemaker_pytorch_container.training:main",
    "hosts": [
        "algo-1",
        "algo-2"
    ],
    "hyperparameters": {
        "config_name": "distilbert-base-uncased",
        "seed": 1337,
        "model_type": "distilbert",
        "per_gpu_train_batch_size": 16,
        "num_train_epochs": 10,
        "mo

[34mFalse[0m
[34m['train.tsv'][0m
[35mFalse[0m
[35m['train.tsv'][0m
[34m***** Running training *****
  Num examples = %d 6841
  Num Epochs = %d 10.0
  Instantaneous batch size per GPU = %d 16
  Total train batch size (w. parallel, distributed & accumulation) = %d 16[0m
[34m[2020-05-05 16:30:01.072 algo-1:52 INFO json_config.py:90] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.[0m
[34m[2020-05-05 16:30:01.072 algo-1:52 INFO hook.py:170] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.[0m
[34m[2020-05-05 16:30:01.072 algo-1:52 INFO hook.py:215] Saving to /opt/ml/output/tensors[0m
[34m[2020-05-05 16:30:01.088 algo-1:52 INFO hook.py:351] Monitoring the collections: losses[0m
[35m***** Running training *****
  Num examples = %d 6841
  Num Epochs = %d 10.0
  Instantaneous batch size per GPU = %d 16
  Total train batch size (w. parallel, distributed & accumulation) = %d 16[0m
[35m[2020-05-05 

[35m***** Running evaluation checkpoint-4250 *****
  Num examples = %d 1710
  Batch size = %d 16[0m
[35m***** Running evaluation checkpoint-500 *****
  Num examples = %d 1710
  Batch size = %d 16[0m
[34m***** Running evaluation checkpoint-750 *****
  Num examples = %d 1710
  Batch size = %d 16[0m
[35m***** Running evaluation checkpoint-750 *****
  Num examples = %d 1710
  Batch size = %d 16[0m
[34m***** Running evaluation  *****
  Num examples = %d 1710
  Batch size = %d 16[0m
[35m***** Running evaluation  *****
  Num examples = %d 1710
  Batch size = %d 16[0m
[34m{'acc_1000': 0.7894736842105263, 'acc_1250': 0.7824561403508772, 'acc_1500': 0.7818713450292397, 'acc_1750': 0.7830409356725146, 'acc_2000': 0.7801169590643274, 'acc_2250': 0.7847953216374269, 'acc_250': 0.7578947368421053, 'acc_2500': 0.7871345029239766, 'acc_2750': 0.7894736842105263, 'acc_3000': 0.7912280701754386, 'acc_3250': 0.7883040935672515, 'acc_3500': 0.7912280701754386, 'acc_3750': 0.7912280701754386, 

In [50]:
from transformers import BertTokenizer

# Announce, then fetch the pretrained uncased BERT tokenizer with lower-casing on.
# NOTE(review): the training job is configured with tokenizer_name
# "distilbert-base-uncased"; confirm that the BERT tokenizer is the intended
# one for this local inspection.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

Loading BERT tokenizer...


In [155]:
# Build the training dataset from the files in the current directory.
# NOTE(review): load_and_cache_examples is not defined in any visible cell —
# presumably it comes from the local training code; confirm where it is imported.
dataset = load_and_cache_examples(
    '.',
    tokenizer,
    task='cola',
    model_type='bert',
    max_seq_length=100,
    overwrite_cache=True,
    evaluate=False,
)


In [153]:
# Parse the training TSV by hand into [guid, sentence, label] triples so the
# examples can be inspected directly.
import csv  # imported here because no earlier visible cell brings `csv` into scope

train_file='train.tsv'

examples = []
quotechar=None  # forwarded unchanged to csv.reader
# "utf-8-sig" strips a leading BOM if present; newline="" leaves line-ending
# handling to the csv module, as its documentation recommends.
with open(train_file, "r", encoding="utf-8-sig", newline="") as f:
    csv_reader=csv.reader(f, delimiter="\t", quotechar=quotechar)
    for (i, line) in enumerate(csv_reader):
        guid = "%s-%s" % ('TRAIN', i)
        # Presumably the layout written by the to_csv cell
        # (index, label, label_notes, sentence) — TODO confirm for this file.
        text_a = line[3]
        label = line[1]
        examples.append([guid, text_a, label])

In [154]:
examples

[['TRAIN-0', 'They play unusual music, and I listen to unusual music.', '1'],
 ['TRAIN-1', 'Loren was relied on by Pavarotti and Hepburn on by Bond.', '0'],
 ['TRAIN-2', 'I lent the book all the way to Tony.', '0'],
 ['TRAIN-3', 'Talked with Bill about the exam.', '0'],
 ['TRAIN-4', 'The tigers hunt prey at night.', '1'],
 ['TRAIN-5', 'Kim is resembled by the model in nearly every detail.', '0'],
 ['TRAIN-6', 'A dog lay in the comer.', '1'],
 ['TRAIN-7', 'I saw the Mona Lisa.', '1'],
 ['TRAIN-8',
  'The Gorgon is easy to believe the claim that Perseus slew.',
  '0'],
 ['TRAIN-9', 'The drooling dog kissed the big boy.', '1'],
 ['TRAIN-10', 'Objection was taken to the length of our skirts.', '1'],
 ['TRAIN-11', 'John must not do have eaten.', '0'],
 ['TRAIN-12', 'The president declared Smith press secretary.', '1'],
 ['TRAIN-13',
  'Medea thought that, after the executioner had left, Poseidon would be relieved.',
  '1'],
 ['TRAIN-14', 'Kelly reeked the onions.', '0'],
 ['TRAIN-15', 'Kare