In [46]:
from transformers import AutoTokenizer

from transformers import BertTokenizerFast, LongformerTokenizerFast, BigBirdTokenizerFast
from datasets import DatasetDict, Features, Value, ClassLabel, load_dataset, load_from_disk

In [7]:
# Load the pretrained fast BERT tokenizer for the "bert-base-uncased" checkpoint.
# NOTE(review): `max_length=4096` does not appear to change the tokenizer's length
# limit -- the recorded output of this cell still reports model_max_len=512. The
# tokenizer-level setting is presumably `model_max_length`; confirm the intended
# limit before changing it, since this checkpoint is a 512-token model.
tokenizer_bert = BertTokenizerFast.from_pretrained("bert-base-uncased", max_length=4096)

# Bare expression: show the tokenizer's repr as the cell output.
tokenizer_bert

PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [10]:
# Load the pretrained fast BigBird tokenizer.
# `model_max_length` is the tokenizer-level keyword for the maximum sequence length;
# the previous `max_length` keyword is an encode-time argument and did not set it.
# 4096 matches the limit this checkpoint already reports, so the effective
# configuration is unchanged -- it is now just stated through the supported keyword.
tokenizer_bigbird = BigBirdTokenizerFast.from_pretrained('google/bigbird-roberta-base', model_max_length=4096)

# Bare expression: show the tokenizer's repr as the cell output.
tokenizer_bigbird

PreTrainedTokenizerFast(name_or_path='google/bigbird-roberta-base', vocab_size=50358, model_max_len=4096, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("[MASK]", rstrip=False, lstrip=True, single_word=False, normalized=True)})

In [18]:
# Load the pretrained fast Longformer tokenizer.
# `model_max_length` is the tokenizer-level keyword for the maximum sequence length;
# the previous `max_length` keyword is an encode-time argument and did not set it.
# 4096 matches the limit this checkpoint already reports, so the effective
# configuration is unchanged -- it is now just stated through the supported keyword.
tokenizer_longformer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096', model_max_length=4096)

# Bare expression: show the tokenizer's repr as the cell output.
tokenizer_longformer

PreTrainedTokenizerFast(name_or_path='allenai/longformer-base-4096', vocab_size=50265, model_max_len=4096, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

In [27]:
def get_dataset(data_name='refined_patents', seed=42, partitions={'train':0.8, 'test_validation':0.2}):
    """Load the CSV chunks for ``data_name`` and split them into train/validation/test.

    Args:
        data_name: Name of the data directory; files are read from
            ``../../data/<data_name>/chunks/*.csv`` and the cache directory is
            ``data/<data_name>/cache``.
        seed: Seed passed to both shuffled splits.
        partitions: Mapping whose ``'test_validation'`` entry is the fraction held
            out of the training split. Only that key is read here; the ``'train'``
            entry is not used. The default dict is never mutated.

    Returns:
        A ``DatasetDict`` with ``'train'``, ``'validation'`` and ``'test'`` splits.
        The held-out portion is divided in half between validation and test.
    """
    print("Loading dataset from csv...")

    label_names = ["A","B","C","D","E","F","G","H"]
    schema = Features({
        'patent_id': Value('string'),
        'text': Value('string'),
        'labels': ClassLabel(names=label_names),
        'ipc_class': Value('string'),
        'subclass': Value('string'),
    })

    csv_glob = '../../data/' + data_name + '/chunks/*.csv'  # TODO: add preprocess folder
    cache_path = 'data/' + data_name + '/cache'

    # The csv loader puts everything under a single 'train' split.
    loaded = load_dataset('csv', data_files=csv_glob, features=schema, cache_dir=cache_path)

    # First cut: training data vs. the held-out portion.
    outer_split = loaded['train'].train_test_split(
        test_size=partitions['test_validation'],
        shuffle=True,
        seed=seed,
    )
    # Second cut: divide the held-out portion evenly into validation and test.
    inner_split = outer_split['test'].train_test_split(test_size=0.5, shuffle=True, seed=seed)

    return DatasetDict({
        'train': outer_split['train'],
        'validation': inner_split['train'],
        'test': inner_split['test'],
    })

In [36]:
def get_training_corpus(dataset, batch_size=1000, text_column="text"):
    """Yield the text column of ``dataset`` in consecutive batches.

    Args:
        dataset: Object supporting ``len()`` and slice indexing, where a slice
            returns a mapping from column name to a list of values.
        batch_size: Number of rows per yielded batch (default 1000, the value
            that was previously hard-coded). Must be at least 1.
        text_column: Name of the column to yield (default ``"text"``).

    Yields:
        One list of texts per batch; the last batch may be shorter. Nothing is
        yielded for an empty dataset.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))
    for start_idx in range(0, len(dataset), batch_size):
        # Slicing past the end is fine: the final batch is simply shorter.
        yield dataset[start_idx : start_idx + batch_size][text_column]

In [None]:
# Build the train/validation/test DatasetDict using get_dataset's default
# data name, seed and partition fractions.
dataset = get_dataset()

In [45]:
# Lazily stream batches of text from the training split.
training_corpus = get_training_corpus(dataset['train'])

# Train a new tokenizer from the BERT tokenizer on that stream, targeting a
# 50,000-entry vocabulary.
trained_tokenizer_bert = tokenizer_bert.train_new_from_iterator(
    training_corpus,
    vocab_size=50000,
)

# Persist the trained tokenizer; the returned tuple of written file paths is the cell output.
trained_tokenizer_bert.save_pretrained("../../tokenizers/bert_trained_on_patent_data")

('../../tokenizers/bert_trained_on_patent_data\\tokenizer_config.json',
 '../../tokenizers/bert_trained_on_patent_data\\special_tokens_map.json',
 '../../tokenizers/bert_trained_on_patent_data\\vocab.txt',
 '../../tokenizers/bert_trained_on_patent_data\\added_tokens.json',
 '../../tokenizers/bert_trained_on_patent_data\\tokenizer.json')

In [47]:
# Reload the tokenizer from the directory the training cell saved it to.
loaded_tokenizer = AutoTokenizer.from_pretrained("../../tokenizers/bert_trained_on_patent_data")