##### Prerequisites

Make sure you choose `PyTorch 1.6 Python 3.6 GPU Optimized` for **Kernel** above.

In [None]:
%%capture

!pip install transformers==4.6.1
!pip install datasets[s3]==1.6.2

Upgrade `ipywidgets` for the `datasets` library and restart the kernel.

In [None]:
%%capture

import IPython
# Upgrade ipywidgets from the conda-forge channel; -y answers the confirmation prompt.
!conda install -c conda-forge ipywidgets -y
# Ask the running kernel to shut down, passing True (presumably the restart flag —
# confirm against ipykernel), so the upgraded package is picked up.
# Kernel state is lost, so later cells must be run after this one completes.
IPython.Application.instance().kernel.do_shutdown(True)

#### Imports 

In [9]:
from sagemaker import get_execution_role, Session
from datasets.filesystems import S3FileSystem
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
import pandas as pd

In [2]:
import transformers 
import sagemaker
import datasets
import logging
import torch
import sys

##### Setup Logger

In [3]:
# Notebook-wide logger used for the status messages in the cells below.
# Use the module's real name (`__name__`), not the literal string '__name__'.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Attach the stream handler only once: re-running this cell would otherwise
# stack another handler on the same logger and emit every message repeatedly.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler())

In [4]:
# Record the library versions this notebook is running against.
version_banners = (
    f'[Using SageMaker: {sagemaker.__version__}]',
    f'[Using Transformers: {transformers.__version__}]',
    f'[Using Datasets: {datasets.__version__}]',
    f'[Using Torch: {torch.__version__}]',
)
for banner in version_banners:
    logger.info(banner)

[Using SageMaker: 2.44.0]
[Using Transformers: 4.6.1]
[Using Datasets: 1.6.2]
[Using Torch: 1.6.0]


#### Essentials

In [5]:
# SageMaker session; the default bucket and region are read from it below.
session = Session()
# Execution role as returned by sagemaker.get_execution_role().
role = get_execution_role()
# The session's default S3 bucket; the tokenized datasets are saved under it later.
bucket = session.default_bucket()
# S3 filesystem handle from `datasets.filesystems`, passed to save_to_disk later.
s3 = S3FileSystem() 

In [7]:
# Report the bucket, role and region resolved from the SageMaker session.
essentials_summary = (
    f'Default Bucket = {bucket}',
    f'Sagemaker Role ARN ={role}',
    f'Sagemaker Session Region = {session.boto_region_name}',
)
for summary_line in essentials_summary:
    logger.info(summary_line)

Default Bucket = sagemaker-us-east-1-119174016168
Sagemaker Role ARN =arn:aws:iam::119174016168:role/service-role/AmazonSageMaker-ExecutionRole-20211013T112826
Sagemaker Session Region = us-east-1


#### Explore data set

In [10]:
# Widen the column display so long review texts are not cut off too early.
# The fully qualified option key is used instead of the abbreviated
# 'max_colwidth', which only works while the pattern matches a single option.
pd.set_option('display.max_colwidth', 400)

In [11]:
# Read the training split from the local data directory and preview the
# first five rows (head() defaults to 5).
train_df = pd.read_csv('./data/train.csv')
train_df.head()

Unnamed: 0,label,text
0,1,"Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as ""Teachers"". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is ""Teachers"". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whol..."
1,1,"Homelessness (or Houselessness as George Carlin stated) has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to school, work, or vote for the matter. Most people think of the homeless as just a lost cause while worrying about things such as racism, the war on Iraq, pressuring kids to succeed, technology, the elec..."
2,1,"Brilliant over-acting by Lesley Ann Warren. Best dramatic hobo lady I have ever seen, and love scenes in clothes warehouse are second to none. The corn on face is a classic, as good as anything in Blazing Saddles. The take on lawyers is also superb. After being accused of being a turncoat, selling out his boss, and being dishonest the lawyer of Pepto Bolt shrugs indifferently ""I'm a lawyer"" he..."
3,1,"This is easily the most underrated film inn the Brooks cannon. Sure, its flawed. It does not give a realistic view of homelessness (unlike, say, how Citizen Kane gave a realistic view of lounge singers, or Titanic gave a realistic view of Italians YOU IDIOTS). Many of the jokes fall flat. But still, this film is very lovable in a way many comedies are not, and to pull that off in a story about..."
4,1,"This is not the typical Mel Brooks film. It was much less slapstick than most of his movies and actually had a plot that was followable. Leslie Ann Warren made the movie, she is such a fantastic, under-rated actress. There were some moments that could have been fleshed out a bit more, and some scenes that could probably have been cut to make the room to do so, but all in all, this is worth the..."


In [12]:
# Read the test split from the local data directory.
test_df = pd.read_csv('./data/test.csv')

In [13]:
# Convert both pandas DataFrames into `datasets.Dataset` objects.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [14]:
# Display the (rows, columns) shape of the training dataset.
train_dataset.shape

(25000, 2)

In [15]:
# Display the (rows, columns) shape of the test dataset.
test_dataset.shape

(1000, 2)

##### Tokenize data

In [16]:
# Load the tokenizer for the 'distilbert-base-uncased' checkpoint; it is used by
# the `tokenize` helper defined below.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=483.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




In [17]:
def tokenize(batch):
    """Encode the ``text`` entries of a batch with the notebook-level ``tokenizer``.

    The tokenizer is called with ``padding='max_length'`` and
    ``truncation=True``, so every encoded sequence is expected to come back
    with the same fixed length (presumably the model maximum; confirm against
    the tokenizer documentation).

    Args:
        batch: Mapping with a ``'text'`` entry holding the raw strings to encode.

    Returns:
        The tokenizer's output for those texts, unchanged.
    """
    texts = batch['text']
    encoded = tokenizer(texts, padding='max_length', truncation=True)
    return encoded

In [18]:
# Tokenize both splits in batches. `tokenize` pads every sequence to a fixed
# max length, so the result does not depend on the batch size; rely on `map`'s
# default batch size instead of pushing an entire split through the tokenizer
# as one giant batch, which needlessly inflates peak memory.
train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [19]:
# Expose only the columns listed here, formatted as torch tensors, on both splits.
model_columns = ['input_ids', 'attention_mask', 'label']
for split in (train_dataset, test_dataset):
    split.set_format('torch', columns=model_columns)

In [21]:
# Display the dataset summary (feature names and row count).
train_dataset

Dataset({
    features: ['attention_mask', 'input_ids', 'label', 'text'],
    num_rows: 25000
})

In [22]:
# Show the class of the tokenized dataset object.
type(train_dataset)

datasets.arrow_dataset.Dataset

Persist tokenized train and test sets from RAM to S3

In [20]:
# S3 location for the training split, under the session's default bucket.
training_input_path = f's3://{bucket}/imdb/train'
# Write the training dataset to that location through the S3 filesystem handle.
train_dataset.save_to_disk(training_input_path,fs=s3)

# Same for the test split, under its own prefix.
test_input_path = f's3://{bucket}/imdb/test'
test_dataset.save_to_disk(test_input_path,fs=s3)