##### Prerequisites

Make sure you select `PyTorch 1.6 Python 3.6 GPU Optimized` as the **Kernel** above.

In [None]:
%%capture

!pip install transformers==4.6.1
!pip install datasets[s3]==1.6.2

Upgrade `ipywidgets` for the `datasets` library, then restart the kernel

In [None]:
%%capture

import IPython
!conda install -c conda-forge ipywidgets -y
IPython.Application.instance().kernel.do_shutdown(True)

#### Imports 

In [None]:
from sagemaker import get_execution_role, Session
from datasets.filesystems import S3FileSystem
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
import pandas as pd

In [None]:
import transformers 
import sagemaker
import datasets
import logging
import torch
import sys

##### Setup Logger

In [None]:
# Logger for this notebook, named after the current module (`__name__`) rather
# than the literal string '__name__'.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# `getLogger` returns the same object on every call, so re-running this cell
# would otherwise stack another StreamHandler and print each message twice.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler())

In [None]:
# Record the versions of the main libraries so a run can be reproduced later.
for version_banner in (
    f'[Using SageMaker: {sagemaker.__version__}]',
    f'[Using Transformers: {transformers.__version__}]',
    f'[Using Datasets: {datasets.__version__}]',
    f'[Using Torch: {torch.__version__}]',
):
    logger.info(version_banner)

#### Essentials

In [None]:
# SageMaker session; used below for the bucket and region lookups.
session = Session()

# Default bucket of that session; the tokenized datasets are written under it.
bucket = session.default_bucket()

# Execution role of the current SageMaker environment.
role = get_execution_role()

# Filesystem handle passed to `save_to_disk` when persisting to S3.
s3 = S3FileSystem()

In [None]:
# Echo the resolved bucket, role and region so the run context is visible.
for environment_line in (
    f'Default Bucket = {bucket}',
    f'Sagemaker Role ARN ={role}',
    f'Sagemaker Session Region = {session.boto_region_name}',
):
    logger.info(environment_line)

#### Explore data set

In [None]:
# Show up to 400 characters per cell so long text values are readable.
# The full option key is used: abbreviated keys are resolved by pattern
# matching and can become ambiguous if pandas adds similarly named options.
pd.set_option('display.max_colwidth', 400)

In [None]:
# Read the training split from the local data directory and preview it.
train_df = pd.read_csv('./data/train.csv')
train_df.head()  # head() shows the first five rows by default

In [None]:
# (rows, columns) of the training frame.
train_df.shape

In [None]:
# Read the test split from the local data directory and show its (rows, columns).
test_df = pd.read_csv('./data/test.csv')
test_df.shape

In [None]:
# Convert each pandas frame into a `datasets.Dataset`; the `.map`,
# `.set_format` and `.save_to_disk` calls further down operate on these objects.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

Tokenize

In [None]:
# Load the tokenizer published with the `distilbert-base-uncased` checkpoint;
# `use_fast=True` requests the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased', use_fast=True)

In [None]:
def tokenize(batch):
    """Encode the ``text`` entry of ``batch`` with the notebook-level tokenizer.

    Args:
        batch: Mapping with a ``'text'`` entry holding the raw text to encode
            (presumably the column batch handed over by
            ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer's output for that text, produced with
        ``padding='max_length'`` and ``truncation=True``.
    """
    texts = batch['text']
    return tokenizer(texts, padding='max_length', truncation=True)

In [None]:
# Both splits are tokenized with the same settings: batches of 1000 rows,
# spread over 10 worker processes.
map_options = dict(batched=True, batch_size=1000, num_proc=10)

train_dataset = train_dataset.map(tokenize, **map_options)
test_dataset = test_dataset.map(tokenize, **map_options)

In [None]:
# Expose only these columns, as torch tensors, on both splits.
torch_columns = ['input_ids', 'attention_mask', 'label']

for split in (train_dataset, test_dataset):
    # Each call receives its own copy of the column list.
    split.set_format('torch', columns=list(torch_columns))

In [None]:
# Display the dataset's summary representation as the cell output.
train_dataset

In [None]:
# Show which class the processed training set is an instance of.
type(train_dataset)

Persist tokenized train and test sets from RAM to S3

In [None]:
# Destination prefixes inside the session's default bucket.
training_input_path = f's3://{bucket}/imdb/train'
test_input_path = f's3://{bucket}/imdb/test'

# Write each tokenized split to its prefix through the S3 filesystem handle.
train_dataset.save_to_disk(training_input_path, fs=s3)
test_dataset.save_to_disk(test_input_path, fs=s3)