# Training a new tokenizer from an old one

In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!apt install git-lfs

In [None]:
# Authenticate this notebook session with the Hugging Face Hub.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
from datasets import load_dataset
# Load the "ccdv/pubmed-summarization" dataset; the result is indexed by split
# name in the cells below (e.g. raw_datasets["train"]).
raw_datasets = load_dataset("ccdv/pubmed-summarization")

In [4]:
# Display a summary of the train split (column names and number of rows).
raw_datasets["train"]

Dataset({
    features: ['article', 'abstract'],
    num_rows: 119924
})

In [5]:
# Print the article text of the last training example: the split has 119924
# rows, so index 119923 is the final one.
print(raw_datasets["train"][119923]['article'])

cerebral palsy is a nonprogressive central nervous system disorder that results in physical impairments and functional limitations that change as the children grow older . among a large number of instruments [ 24 ] , for measuring the physical ability of children with cp , the gross motor function classification system ( gmfcs ) introduced by palisano et al . in 1997   has been widely applied in clinical and research settings . 
 the gmfcs is a five - level classification system that identifies abilities and functional limitations , based on the need of assistive devices of the cerebral palsy child , during self - initiated movements , such as walking and sitting . 
 the system application is quick and easy and it gives a brief description of which level the child resembles based on his / her current gross motor function . 
 the reliability and validity of the gmfcs in differentiating cerebral palsy children with different functional levels have been reported . 
 similarly , the stabil

In [6]:
class BatchedCorpus:
    """Re-iterable view over one text column of a dataset, served in batches.

    A plain generator expression is exhausted after a single pass, so re-running
    the tokenizer-training cell without rebuilding the generator would silently
    train on an empty corpus. Each call to ``iter()`` on this object starts a
    fresh pass over the data instead.

    Args:
        dataset: Object supporting ``len()`` and slice indexing that returns a
            mapping from column name to a list of values.
        column: Name of the text column to yield.
        batch_size: Number of rows per yielded batch.
    """

    def __init__(self, dataset, column="article", batch_size=1000):
        self.dataset = dataset
        self.column = column
        self.batch_size = batch_size

    def __iter__(self):
        """Yield consecutive lists of up to ``batch_size`` texts."""
        for start in range(0, len(self.dataset), self.batch_size):
            # Slicing loads only the current batch instead of the whole column.
            yield self.dataset[start : start + self.batch_size][self.column]


# Batches of 1000 articles from the train split, same content as before but
# safe to iterate more than once.
training_corpus = BatchedCorpus(raw_datasets["train"])

In [None]:
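# Alternative: a generator function that can be called again to obtain a fresh
# iterator over the training corpus (this dataset's text column is "article").
# def get_training_corpus():
#     dataset = raw_datasets["train"]
#     for start_idx in range(0, len(dataset), 1000):
#         samples = dataset[start_idx : start_idx + 1000]
#         yield samples["article"]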

In [7]:
from transformers import AutoTokenizer

# Load the pretrained "gpt2" tokenizer; it is the starting point for the new
# tokenizer trained in the next cell.
old_tokenizer = AutoTokenizer.from_pretrained("gpt2")



In [8]:
# Train a new tokenizer from `old_tokenizer` on the batches in `training_corpus`,
# requesting a vocabulary of 50256 entries.
tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, vocab_size=50256)

In [9]:
# Tokenize a sentence taken from the dataset with the newly trained tokenizer;
# the bare `tokens` expression displays the resulting token list.
example = 'cerebral palsy is a nonprogressive central nervous system disorder that results in physical impairments and functional limitations that change as the children grow older .'
tokens = tokenizer.tokenize(example)
tokens

['cerebral',
 'Ġpalsy',
 'Ġis',
 'Ġa',
 'Ġnonprog',
 'ressive',
 'Ġcentral',
 'Ġnervous',
 'Ġsystem',
 'Ġdisorder',
 'Ġthat',
 'Ġresults',
 'Ġin',
 'Ġphysical',
 'Ġimpairments',
 'Ġand',
 'Ġfunctional',
 'Ġlimitations',
 'Ġthat',
 'Ġchange',
 'Ġas',
 'Ġthe',
 'Ġchildren',
 'Ġgrow',
 'Ġolder',
 'Ġ.']

In [10]:
# Token counts for the same sentence: new tokenizer first, then the original
# GPT-2 tokenizer.
print(len(tokens))
print(len(old_tokenizer.tokenize(example)))

26
31


In [11]:
# Write the tokenizer files to the local "Pubmed-tokenizer" directory; the
# returned tuple lists the paths of the files that were saved.
tokenizer.save_pretrained("Pubmed-tokenizer")

('Pubmed-tokenizer\\tokenizer_config.json',
 'Pubmed-tokenizer\\special_tokens_map.json',
 'Pubmed-tokenizer\\vocab.json',
 'Pubmed-tokenizer\\merges.txt',
 'Pubmed-tokenizer\\added_tokens.json',
 'Pubmed-tokenizer\\tokenizer.json')

In [None]:
# Log in to the Hugging Face Hub before pushing the tokenizer.
# NOTE(review): this repeats the login cell near the top of the notebook;
# one of the two is likely sufficient — confirm and remove the duplicate.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# Upload the trained tokenizer to the Hub under the repo name "pubmed-tokenizer".
tokenizer.push_to_hub("pubmed-tokenizer")

In [None]:
# Reload the tokenizer from the Hub by its full repo id (uses the AutoTokenizer
# import from the earlier cell).
tokenizer = AutoTokenizer.from_pretrained("atharvabhattad/pubmed-tokenizer")

The trained tokenizer can now be loaded from the Hugging Face Hub as shown above.