# BERT text classification on SageMaker using PyTorch

This notebook uses the DBpedia dataset.

In [1]:
import sys, os
import logging

# Put the local "src" directory on the module search path.
sys.path.append("src")

# Root logger: INFO and above, written to stdout, each record prefixed with
# timestamp, logger name and level.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level="INFO",
    handlers=[logging.StreamHandler(sys.stdout)],
)

### Bucket and role setup

In [2]:
import sagemaker
from sagemaker import get_execution_role
# Session object used below to look up the default bucket.
sm_session = sagemaker.Session()
# Role that is later handed to the estimator via `role=`.
role = get_execution_role()

In [3]:
# All objects for this demo live under s3://<default bucket>/bert-demo/.
data_bucket = sm_session.default_bucket()
data_bucket_prefix = "bert-demo"

# Dataset location and the individual files inside it.
s3_uri_data = "/".join(("s3://" + data_bucket, data_bucket_prefix, "data"))
s3_uri_train = s3_uri_data + "/" + "train.csv"
s3_uri_val = s3_uri_data + "/" + "val.csv"
s3_uri_classes = s3_uri_data + "/" + "classes.txt"

s3_uri_test = s3_uri_data + "/" + "test.csv"

# Destinations passed to the estimator as output_path and code_location.
s3_output_path = "/".join(("s3://" + data_bucket, data_bucket_prefix, "output"))
s3_code_path = "/".join(("s3://" + data_bucket, data_bucket_prefix, "code"))

In [4]:
# Switch for the dataset preparation cells below: they only download, split
# and upload data when this is True; with False they are skipped.
prepare_dataset = False

## Prepare dataset

In [5]:
# Local scratch directory (relative path) used by the dataset preparation cells.
tmp = "tmp"

In [6]:
%%bash -s  "$prepare_dataset"  "$s3_uri_test" "$s3_uri_classes" "$tmp"
# Download the DBpedia CSV archive, unpack it under the scratch directory and
# upload test.csv and classes.txt to S3. Nothing happens unless $1 is "True".
#
# Positional arguments (taken from the notebook variables on the magic line):
#   $1 prepare_dataset   $2 s3_uri_test   $3 s3_uri_classes   $4 tmp

# Abort on the first failing command (or unset variable) instead of carrying
# on and uploading from a missing or partial download.
set -euo pipefail

prepare_dataset=$1
s3_test=$2
s3_classes=$3
tmp=$4

if [ "$prepare_dataset" == "True" ]
then
    echo "Downloading data.."
    wget https://github.com/saurabh3949/Text-Classification-Datasets/raw/master/dbpedia_csv.tar.gz -P "${tmp}"
    # Unpack straight into the scratch directory. The previous "extract into the
    # working directory, then mv" failed on a re-run once ${tmp}/dbpedia_csv existed.
    tar -xzvf "${tmp}/dbpedia_csv.tar.gz" -C "${tmp}"

    # Quick look at what was unpacked.
    ls -l "${tmp}/dbpedia_csv/"
    cat "${tmp}/dbpedia_csv/classes.txt"
    head -3 "${tmp}/dbpedia_csv/train.csv"

    echo aws s3 cp "${tmp}/dbpedia_csv/test.csv" "${s3_test}"
    aws s3 cp "${tmp}/dbpedia_csv/test.csv" "${s3_test}"

    aws s3 cp "${tmp}/dbpedia_csv/classes.txt" "${s3_classes}"
fi

#### Train/validation split

In [7]:
from sklearn.model_selection import train_test_split

def train_val_split(data_file, train_file_name = None, val_file_name = None, split_ratio=.30,
                    encoding="utf-8", random_state=42):
    """Split the lines of ``data_file`` into a train file and a validation file.

    The file is read as physical lines, partitioned with ``train_test_split``
    and each partition is written out. The split is line based, so a CSV
    record that contains an embedded newline would be broken apart.

    Args:
        data_file: Path of the text file to split.
        train_file_name: Where to write the train partition. Defaults to
            ``train.csv`` in the directory of ``data_file``; if ``data_file``
            already has that name it is overwritten with the train partition.
        val_file_name: Where to write the validation partition. Defaults to
            ``val.csv`` in the directory of ``data_file``.
        split_ratio: Passed to ``train_test_split`` as ``test_size``.
        encoding: Text encoding used for reading and writing the files.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        Tuple ``(train_file_name, val_file_name)`` with the paths written.
    """
    with open(data_file, "r", encoding=encoding) as f:
        lines = f.readlines()

    # A final line without a line terminator would be glued to whichever record
    # follows it once the lines are reordered, so terminate it first.
    if lines and not lines[-1].endswith("\n"):
        lines[-1] += "\n"

    train, val = train_test_split(lines, test_size=split_ratio, random_state=random_state)

    data_dir = os.path.dirname(data_file)
    train_file_name = train_file_name or os.path.join(data_dir, "train.csv")
    val_file_name = val_file_name or os.path.join(data_dir, "val.csv")

    # The input has been read completely above, so it is safe for an output
    # path to coincide with data_file.
    with open(train_file_name, "w", encoding=encoding) as f:
        f.writelines(train)

    with open(val_file_name, "w", encoding=encoding) as f:
        f.writelines(val)

    return train_file_name, val_file_name


In [8]:
# Local paths of the split files. They are defined even when dataset
# preparation is skipped, because the following %%bash cell names $l_train and
# $l_val on its magic line and those can only be substituted if the variables exist.
l_train, l_val = "", ""

if prepare_dataset:
    # Split the downloaded train.csv 70/30. With the default output names the
    # train partition is written back to train.csv in the same directory.
    l_data_file = os.path.join(tmp, "dbpedia_csv", "train.csv")
    l_train, l_val = train_val_split(l_data_file, split_ratio=.30)

In [9]:
%%bash -s  "$prepare_dataset" "$s3_uri_train" "$s3_uri_val" "$l_train" "$l_val" "$tmp"
# Upload the local train / validation split files to S3 and then remove the
# scratch directory. Nothing happens unless $1 is "True".
#
# Positional arguments (taken from the notebook variables on the magic line):
#   $1 prepare_dataset   $2 s3_uri_train   $3 s3_uri_val
#   $4 l_train           $5 l_val          $6 tmp

# Abort on the first failing command so the local files are not deleted after
# a failed upload.
set -euo pipefail

prepare_dataset=$1
s3_train=$2
s3_val=$3
l_train=$4
l_val=$5
tmp=$6

if [ "$prepare_dataset" == "True" ]
then
    echo "Uploading data.."
    echo "Trainlines $(wc -l "${l_train}")"
    echo "Vallines $(wc -l "${l_val}")"

    head -3 "${l_train}"
    head -3 "${l_val}"

    echo aws s3 cp "${l_train}" "${s3_train}"
    aws s3 cp "${l_train}" "${s3_train}"

    echo aws s3 cp "${l_val}" "${s3_val}"
    aws s3 cp "${l_val}" "${s3_val}"

    # Only reached when both uploads succeeded; ${tmp:?} makes the command
    # fail instead of running with an empty path.
    rm -rf -- "${tmp:?}"
fi

## Train

In [10]:
# S3 objects handed to estimator.fit, keyed by name.
# NOTE(review): the keys presumably become the SageMaker input channel names - confirm.
inputs = {
    "train": s3_uri_train,
    "val": s3_uri_val,
    "class": s3_uri_classes,
}

In [11]:
# Hyperparameters passed to the training job.
hp = {
    "epochs": 50,
    "earlystoppingpatience": 3,
    # A larger batch may run into a CUDA out-of-memory error; raise
    # "gradaccumulation" instead of "batch".
    "batch": 4,
    # File names only (last path component of each S3 URI).
    "trainfile": s3_uri_train.rsplit("/", 1)[-1],
    "valfile": s3_uri_val.rsplit("/", 1)[-1],
    "classfile": s3_uri_classes.rsplit("/", 1)[-1],
    # Number of steps over which gradients are accumulated.
    "gradaccumulation": 8,
    "log-level": "INFO",
    # Depends on the model's maximum position embedding size; a large value
    # may also end in a CUDA out-of-memory error.
    "maxseqlen": 512,
    "lr": 0.001,
    "finetune": 0,
}




{'epochs': 50,
 'earlystoppingpatience': 3,
 'batch': 4,
 'trainfile': 'train.csv',
 'valfile': 'val.csv',
 'classfile': 'classes.txt',
 'gradaccumulation': 8,
 'log-level': 'INFO',
 'maxseqlen': 512}

In [None]:
# Display the hyperparameter dict that is passed to the estimator.
hp

In [13]:
# Metrics passed to the estimator as `metric_definitions`. Each entry pairs a
# metric name with a regex whose single capture group picks the number out of
# a log line of the form "###score: <key>### <number>".
# Raw strings are used so that "\d" reaches the regex engine as written
# instead of being an invalid escape sequence in a normal string literal.
metric_definitions = [
    {"Name": "TrainLoss",
     "Regex": r"###score: train_loss### (\d*[.]?\d*)"},
    {"Name": "ValidationLoss",
     "Regex": r"###score: val_loss### (\d*[.]?\d*)"},
    {"Name": "TrainScore",
     "Regex": r"###score: train_score### (\d*[.]?\d*)"},
    {"Name": "ValidationScore",
     "Regex": r"###score: val_score### (\d*[.]?\d*)"},
]

In [14]:
# Prefix for the training job name (passed to the estimator as base_job_name).
job_type = "bert-classification"
base_name = job_type

In [None]:
from sagemaker.pytorch import PyTorch

# NOTE(review): train_instance_count / train_instance_type / train_volume_size
# look like the SageMaker Python SDK v1 parameter names - confirm against the
# installed SDK version before upgrading.
estimator = PyTorch(
    # Training script and the directory that is shipped with it
    # (alternative entry point: 'main_train_k_fold.py').
    entry_point='main.py',
    source_dir='src',
    role=role,
    # Framework container.
    framework_version="1.4.0",
    py_version='py3',
    # Compute resources.
    train_instance_count=1,
    train_instance_type="ml.p3.2xlarge",
    train_volume_size=30,
    # train_use_spot_instances=True,
    # Job configuration.
    hyperparameters=hp,
    metric_definitions=metric_definitions,
    debugger_hook_config=False,
    base_job_name=base_name,
    # S3 locations for the job output and the uploaded source code.
    output_path=s3_output_path,
    code_location=s3_code_path,
)

# Start the training job on the S3 inputs; wait=True keeps this cell running
# until the job has ended.
estimator.fit(inputs, wait=True)

2020-07-03 10:07:26,266 - sagemaker - INFO - Creating training-job with name: bert-classification-2020-07-03-10-07-25-988
2020-07-03 10:07:26 Starting - Starting the training job...
2020-07-03 10:07:28 Starting - Launching requested ML instances......
2020-07-03 10:08:31 Starting - Preparing the instances for training...
2020-07-03 10:09:20 Downloading - Downloading input data...
2020-07-03 10:09:31 Training - Downloading the training image........[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2020-07-03 10:11:07,731 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2020-07-03 10:11:07,753 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2020-07-03 10:11:08,366 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2020-07-03 10:11:08,629 sagemaker-containers INFO     Module de

In [12]:

# import sys
# import logging

# sys.path.append("src")

# logging.basicConfig(level="INFO", handlers=[logging.StreamHandler(sys.stdout)],
#                         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# from builder import Builder
# import os

# checkpoint_dir = None
# epochs = 10
# earlystoppingpatience = 1
# modeldir = "."
# batch_size = 5

# train_data_file = os.path.join("tmp/dbpedia_csv", "train.csv")
# val_data_file = os.path.join("tmp/dbpedia_csv", "val.csv")
# labels_file = os.path.join("tmp/dbpedia_csv", "classes.txt")
# b = Builder(train_data=train_data_file, val_data=val_data_file, labels_file = labels_file,
#             checkpoint_dir=checkpoint_dir, epochs=epochs,
#             early_stopping_patience=earlystoppingpatience, batch_size= batch_size)

# trainer = b.get_trainer()

# train_dataloader, val_dataloader = b.get_train_val_dataloader()
# trainer.run_train(data_iter=train_dataloader,
#                   validation_iter=val_dataloader,
#                   model_network=b.get_network(),
#                   loss_function=b.get_loss_function(),
#                   optimizer=b.get_optimiser(), model_dir=modeldir, pos_label=b.get_pos_label_index())


2020-07-03 10:04:51,355 - transformers.file_utils - INFO - PyTorch version 1.4.0 available.
2020-07-03 10:04:52,247 - transformers.tokenization_utils - INFO - loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/ec2-user/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
2020-07-03 10:04:54,810 - transformers.tokenization_utils - INFO - loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/ec2-user/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
2020-07-03 10:04:56,390 - transformers.configuration_utils - INFO - loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/ec2-user/.cache/torch

KeyboardInterrupt: 