##### Prerequisites

In [None]:
%%capture 

!pip install torch==1.12.1+cu113
!pip install transformers==4.26.1
!pip install datasets==2.9.0
!pip install pandas==1.5.3

#### Imports 

In [7]:
from transformers import AutoTokenizer
from tqdm import tqdm
import pandas as pd
import transformers 
import datasets
import logging
import pandas
import torch

##### Setup logging

In [4]:
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler())

##### Log versions of dependencies 

In [5]:
logger.info(f'[Using transformers version: {transformers.__version__}]')
logger.info(f'[Using datasets version: {datasets.__version__}]')
logger.info(f'[Using pandas version: {pandas.__version__}]')
logger.info(f'[Using torch version: {torch.__version__}]')

[Using transformers version: 4.26.1]
[Using datasets version: 2.9.0]
[Using pandas version: 1.5.3]
[Using torch version: 1.12.1+cu113]


#### Load GPT2 tokenizer 

In [8]:
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-j-6B')
logger.info(tokenizer)

Downloading (…)okenizer_config.json: 100%|██████████| 619/619 [00:00<00:00, 88.4kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 798k/798k [00:00<00:00, 96.2MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 77.0MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.37M/1.37M [00:00<00:00, 158MB/s]
Downloading (…)in/added_tokens.json: 100%|██████████| 4.04k/4.04k [00:00<00:00, 1.61MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 357/357 [00:00<00:00, 165kB/s]
GPT2TokenizerFast(name_or_path='EleutherAI/gpt-j-6B', vocab_size=50257, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, no

In [9]:
vocab = tokenizer.get_vocab()
len(vocab)

50400

##### Add special tokens 

In [10]:
special_tokens = {
    'bos_token': '<|startoftext|>',
    'additional_special_tokens': ['<|speaker-1|>', '<|speaker-2|>', '<|pad|>', '<|mask|>']
}

In [11]:
_ = tokenizer.add_special_tokens(special_tokens)
vocab = tokenizer.get_vocab()
len(vocab)

50405

In [12]:
logger.info(tokenizer)

GPT2TokenizerFast(name_or_path='EleutherAI/gpt-j-6B', vocab_size=50257, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|startoftext|>', 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'additional_special_tokens': ['<|speaker-1|>', '<|speaker-2|>', '<|pad|>', '<|mask|>']})
