##### Prerequisites

Make sure `PyTorch 1.6 Python 3.6 GPU Optimized` is selected as the **Kernel** above.

In [1]:
%%capture

!pip install transformers==4.6.1
!pip install datasets[s3]==1.6.2

Upgrade `ipywidgets` for the `datasets` library, then restart the kernel.

In [2]:
%%capture

import IPython
!conda install -c conda-forge ipywidgets -y
IPython.Application.instance().kernel.do_shutdown(True)

#### Imports 

In [1]:
from sagemaker import get_execution_role, Session
from datasets.filesystems import S3FileSystem
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
import pandas as pd

In [2]:
import transformers 
import sagemaker
import datasets
import logging
import torch
import sys

##### Setup Logger

In [3]:
# Notebook-wide logger. Pass the variable __name__, not the string literal
# '__name__', so the logger is named after the running module.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Attach the stream handler only once, so re-running this cell does not
# make every message print multiple times.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler())

In [4]:
# Record the versions of the main libraries, one log line per library.
version_banners = [
    f'[Using SageMaker: {sagemaker.__version__}]',
    f'[Using Transformers: {transformers.__version__}]',
    f'[Using Datasets: {datasets.__version__}]',
    f'[Using Torch: {torch.__version__}]',
]
for banner in version_banners:
    logger.info(banner)

[Using SageMaker: 2.44.0]
[Using Transformers: 4.6.1]
[Using Datasets: 1.6.2]
[Using Torch: 1.6.0]


#### Essentials

In [5]:
# SageMaker session and the default bucket it reports.
session = Session()
bucket = session.default_bucket()

# Execution role returned by get_execution_role() for this notebook.
role = get_execution_role()

# S3 filesystem handle from datasets.filesystems; passed as fs= when saving datasets.
s3 = S3FileSystem()

In [6]:
# Log the bucket, role and region of the current session.
for essentials_line in (
    f'Default Bucket = {bucket}',
    f'Sagemaker Role ARN ={role}',
    f'Sagemaker Session Region = {session.boto_region_name}',
):
    logger.info(essentials_line)

Default Bucket = sagemaker-us-east-1-119174016168
Sagemaker Role ARN =arn:aws:iam::119174016168:role/service-role/AmazonSageMaker-ExecutionRole-20211013T112826
Sagemaker Session Region = us-east-1


#### Explore data set

In [7]:
# Show up to 400 characters per column so long tweet text is not truncated.
# The fully-qualified option name is used because pandas' abbreviated
# pattern matching can break if a similarly named option is added later.
pd.set_option('display.max_colwidth', 400)

In [8]:
# Load the training split; the trailing expression renders its first five rows.
train_df = pd.read_csv('./data/train.csv')
train_df.head()

Unnamed: 0,label,text
0,1,@player112345 Good Morrow! How are you this fair morn?
1,0,Aaron just left to swap out the fridge in his mother's condo. He'll be hauling two fridges up/down 2 flights of stairs. With no help.
2,1,#ahbl princess jared almost ready for his closeup
3,0,@paigeebaby awwww come see it with me and ill watch it for you haha so ill see it twice
4,0,@ETown_Lesley OMG I KNOW! I just can't get over all the children. 8 of them. Will be so complicated... so sad


In [9]:
# Dimensions of the training frame as (rows, columns).
train_df.shape

(990000, 2)

In [10]:
# Load the test split and display its (rows, columns) dimensions.
test_df = pd.read_csv('./data/test.csv')
test_df.shape

(10000, 2)

In [11]:
# Wrap both pandas frames as `datasets.Dataset` objects.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

#### Tokenize

In [12]:
# Tokenizer for the `distilbert-base-uncased` checkpoint, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(
    'distilbert-base-uncased',
    use_fast=True,
)

In [13]:
def tokenize(batch):
    """Tokenize the ``text`` entries of ``batch`` with the notebook-level tokenizer.

    The tokenizer is called with ``padding='max_length'`` and
    ``truncation=True``; its result is returned unchanged.
    """
    texts = batch['text']
    encoded = tokenizer(texts, padding='max_length', truncation=True)
    return encoded

In [14]:
# Settings shared by both splits: batched mapping, 1000 rows per batch, 10 worker processes.
tokenize_map_options = dict(batched=True, batch_size=1000, num_proc=10)
train_dataset = train_dataset.map(tokenize, **tokenize_map_options)
test_dataset = test_dataset.map(tokenize, **tokenize_map_options)























In [15]:
# Switch both splits to the 'torch' format, exposing only the model inputs and the label.
model_columns = ('input_ids', 'attention_mask', 'label')
train_dataset.set_format('torch', columns=list(model_columns))
test_dataset.set_format('torch', columns=list(model_columns))

In [16]:
# Display the dataset summary (feature names and row count).
train_dataset

Dataset({
    features: ['attention_mask', 'input_ids', 'label', 'text'],
    num_rows: 990000
})

In [17]:
# Confirm the concrete class of the tokenized training set.
type(train_dataset)

datasets.arrow_dataset.Dataset

#### Persist tokenized train and test sets from RAM to S3

In [18]:
# S3 destinations for the two tokenized splits.
training_input_path = f's3://{bucket}/tweets/train'
test_input_path = f's3://{bucket}/tweets/test'

# Write each dataset to its S3 prefix through the S3 filesystem handle.
train_dataset.save_to_disk(training_input_path, fs=s3)
test_dataset.save_to_disk(test_input_path, fs=s3)