In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
import torch
torch.cuda.get_device_name()

'NVIDIA GeForce RTX 2080 Ti'

In [3]:
import wandb

# start a new wandb run to track this script
wandb.init(
    # set the wandb project where this run will be logged
    project="wordpiece-bert-project-epoch24",
    
    # track hyperparameters and run metadata
    config={
    "learning_rate": 2e-5,
    "weight_decay" : 0.01,
    "architecture": "wordpiece-bert-base",
    "dataset": "bdnews24",
    "epochs": 24,
    "batch size": 64,
    }
)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mashahri1[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
from datasets import load_dataset

bdnews_dataset = load_dataset("text", data_files="datasets/Bangla BDnews.txt", split="train")

Found cached dataset text (/home/ashahri1/.cache/huggingface/datasets/text/default-b038fd0f029731c6/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


In [5]:
bdnews_dataset.set_format("pandas")

In [6]:
bdnews_df = bdnews_dataset[:]

In [7]:
bdnews_df_lens = bdnews_df['text'].str.len()

In [8]:
max(bdnews_df_lens)

147

In [9]:
# pretraining_df_lens = pretraining_df['text'].str.len()
count = bdnews_df['text'].str.split().apply(len).value_counts()

In [10]:
count.index = count.index.astype(str) + ' words:'
count.sort_index(inplace=True)

In [11]:
count

text
10 words:    62453
11 words:    58017
12 words:    53345
13 words:    45989
14 words:    33958
15 words:    17864
4 words:     18477
5 words:     34027
6 words:     47866
7 words:     58572
8 words:     63809
9 words:     64354
Name: count, dtype: int64

In [12]:
bdnews_dataset.reset_format()

In [13]:
bdnews_dataset

Dataset({
    features: ['text'],
    num_rows: 558731
})

In [14]:
bdnews_dataset[:5]

{'text': ['এ ছাড়া শিক্ষাপ্রতিষ্ঠানেও চলবে প্রচারণা ।',
  'সহযোগিতা করছে তথ্য ও যোগাযোগপ্রযুক্তি আইসিটি বিভাগ ।',
  'গ্রীন ব্যাংকিং বা পরিবেশবান্ধব ব্যাংকিং ও ছাদ বাগান কার্যক্রম শুরু করেছে রাষ্ট্রমালিকানাধীন অগ্রণী ব্যাংক ।',
  'গতকাল মঙ্গলবার এ কার্যক্রমের উদ্বোধন করেন বাংলাদেশ ব্যাংকের গভর্নর আতিউর রহমান ।',
  'এ কার্যক্রমের উদ্বোধনী অনুষ্ঠানে সভাপতিত্ব করেন অগ্রণী ব্যাংকের চেয়ারম্যান জায়েদ বখত ।']}

In [39]:
def get_training_corpus():
    for i in range(0, len(bdnews_dataset), 1000):
        yield bdnews_dataset[i : i+1000]["text"]

In [41]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

In [42]:
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

In [43]:
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFKC()]
)

In [44]:
print(tokenizer.normalizer.normalize_str('গ্রীন ব্যাংকিং বা পরিবেশবান্ধব ব্যাংকিং ও ছাদ বাগান কার্যক্রম শুরু করেছে রাষ্ট্রমালিকানাধীন অগ্রণী ব্যাংক ।'))

গ্রীন ব্যাংকিং বা পরিবেশবান্ধব ব্যাংকিং ও ছাদ বাগান কার্যক্রম শুরু করেছে রাষ্ট্রমালিকানাধীন অগ্রণী ব্যাংক ।


In [45]:
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

In [46]:
tokenizer.pre_tokenizer.pre_tokenize_str('গ্রীন ব্যাংকিং বা পরিবেশবান্ধব ব্যাংকিং ও ছাদ বাগান কার্যক্রম শুরু করেছে রাষ্ট্রমালিকানাধীন অগ্রণী ব্যাংক ।')

[('গ্রীন', (0, 5)),
 ('ব্যাংকিং', (6, 14)),
 ('বা', (15, 17)),
 ('পরিবেশবান্ধব', (18, 30)),
 ('ব্যাংকিং', (31, 39)),
 ('ও', (40, 41)),
 ('ছাদ', (42, 45)),
 ('বাগান', (46, 51)),
 ('কার্যক্রম', (52, 61)),
 ('শুরু', (62, 66)),
 ('করেছে', (67, 72)),
 ('রাষ্ট্রমালিকানাধীন', (73, 91)),
 ('অগ্রণী', (92, 98)),
 ('ব্যাংক', (99, 105)),
 ('।', (106, 107))]

In [47]:
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(
    vocab_size=30522, special_tokens=special_tokens
)

In [48]:
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)






In [49]:
encoding = tokenizer.encode('গ্রীন ব্যাংকিং বা পরিবেশবান্ধব ব্যাংকিং ও ছাদ বাগান কার্যক্রম শুরু করেছে রাষ্ট্রমালিকানাধীন অগ্রণী ব্যাংক ।')
print(encoding.tokens)

['গ্র', '##ীন', 'ব্যাংকিং', 'বা', 'পরিবেশবান্ধব', 'ব্যাংকিং', 'ও', 'ছাদ', 'বাগান', 'কার্যক্রম', 'শুরু', 'করেছে', 'রাষ্ট্রমালিকানাধীন', 'অগ্রণী', 'ব্যাংক', '।']


In [50]:
cls_token_id = tokenizer.token_to_id("[CLS]")
sep_token_id = tokenizer.token_to_id("[SEP]")
print(cls_token_id, sep_token_id)

2 3


In [17]:
tokenizer.post_processor = processors.TemplateProcessing(
    single=f"[CLS]:0 $A:0 [SEP]:0",
    pair=f"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)

In [56]:
encoding = tokenizer.encode('গ্রীন ব্যাংকিং বা পরিবেশবান্ধব ব্যাংকিং ও ছাদ বাগান কার্যক্রম শুরু করেছে রাষ্ট্রমালিকানাধীন অগ্রণী ব্যাংক ।')
print(encoding.token)

AttributeError: 'list' object has no attribute 'token'

In [19]:
encoding = tokenizer.encode("এ ছাড়া শিক্ষাপ্রতিষ্ঠানেও চলবে প্রচারণা ।","সহযোগিতা করছে তথ্য ও যোগাযোগপ্রযুক্তি আইসিটি বিভাগ ।")
print(encoding.tokens)
print(encoding.type_ids)

['[CLS]', 'এ', 'ছাড়া', 'শিক্ষাপ্রতিষ্ঠানে', '##ও', 'চলবে', 'প্রচারণা', '।', '[SEP]', 'সহযোগিতা', 'করছে', 'তথ্য', 'ও', 'যোগাযোগপ্রযুক্তি', 'আইসিটি', 'বিভাগ', '।', '[SEP]']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [20]:
tokenizer.decoder = decoders.WordPiece(prefix="##")

In [21]:
tokenizer.decode(encoding.ids)

'এ ছাড়া শিক্ষাপ্রতিষ্ঠানেও চলবে প্রচারণা । সহযোগিতা করছে তথ্য ও যোগাযোগপ্রযুক্তি আইসিটি বিভাগ ।'

In [22]:
tokenizer.enable_truncation(max_length=512)

In [15]:
# tokenizer.save("wordpiece_tokenizer_bdnews.json")

In [24]:
# tokenizer = Tokenizer.from_file("unigram_tokenizer_bdnews.json")

In [19]:
# from transformers import BertTokenizerFast

# tokenizer = BertTokenizerFast(tokenizer_object=tokenizer)

In [16]:
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast(
    #tokenizer_object=tokenizer,
    tokenizer_file="wordpiece_tokenizer_bdnews.json", # You can load from the tokenizer file, alternatively
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
    return_special_tokens_mask = True,
    model_max_length = 512,
)

In [17]:
# tokenizer.save_pretrained("wordpiece_tokenizer_bdnews")

In [18]:
# # from transformers import BertTokenizerFast
from transformers import PreTrainedTokenizerFast

# tokenizer = PreTrainedTokenizerFast.from_pretrained("wordpiece_tokenizer_bdnews")
tokenizer = PreTrainedTokenizerFast.from_pretrained("models/wordpiece/bert-base-pretrained-bdnews24")

In [19]:
from transformers import BertConfig, BertForMaskedLM


# Set a configuration for our RoBERTa model
unigram_bert_config = BertConfig(pad_token_id=tokenizer.pad_token_id)

# Building the model from the config
# Model is randomly initialized
model = BertForMaskedLM(unigram_bert_config)

print(unigram_bert_config)

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.16.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [20]:
text = "এ ছাড়া শিক্ষাপ্রতিষ্ঠানেও চলবে [MASK] ।"

In [21]:
tokenizer.tokenize(text)

['এ', 'ছাড়া', 'শিক্ষাপ্রতিষ্ঠানে', '##ও', 'চলবে', '[MASK]', '।']

In [22]:
import torch 

inputs = tokenizer(text, return_tensors="pt")
# inputs.to("cuda")

token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]

# # Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> এ ছাড়া শিক্ষাপ্রতিষ্ঠানেও চলবে ওয়েস্টার্ন ।'
'>>> এ ছাড়া শিক্ষাপ্রতিষ্ঠানেও চলবে ##াখুঁ ।'
'>>> এ ছাড়া শিক্ষাপ্রতিষ্ঠানেও চলবে সামর্থ ।'
'>>> এ ছাড়া শিক্ষাপ্রতিষ্ঠানেও চলবে ##ঙ্গাল ।'
'>>> এ ছাড়া শিক্ষাপ্রতিষ্ঠানেও চলবে গিব ।'


In [23]:
def tokenize_function(examples):
    result = tokenizer(examples["text"], padding="max_length", max_length=128, truncation=True)
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = bdnews_dataset.map(
    tokenize_function, batched=True, remove_columns=["text"]
)
tokenized_datasets

Loading cached processed dataset at /home/ashahri1/.cache/huggingface/datasets/text/default-b038fd0f029731c6/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-4bace3dbe0226210.arrow


Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
    num_rows: 558731
})

In [24]:
tokenized_datasets.remove_columns("token_type_ids")

Dataset({
    features: ['input_ids', 'attention_mask', 'word_ids'],
    num_rows: 558731
})

In [25]:
tokenized_datasets[0]

{'input_ids': [2,
  42,
  654,
  11486,
  108,
  3271,
  4057,
  31,
  3,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  

In [26]:
def group_texts(examples):
    # Create a new labels column
    examples["labels"] = examples["input_ids"].copy()
    return examples

In [27]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Loading cached processed dataset at /home/ashahri1/.cache/huggingface/datasets/text/default-b038fd0f029731c6/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-71b66ccba7949897.arrow


Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
    num_rows: 558731
})

In [28]:
tokenizer.decode(lm_datasets[1]["input_ids"])

'[CLS] সহযোগিতা করছে তথ্য ও যোগাযোগপ্রযুক্তি আইসিটি বিভাগ । [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [29]:
tokenizer.decode(lm_datasets[1]["labels"])

'[CLS] সহযোগিতা করছে তথ্য ও যোগাযোগপ্রযুক্তি আইসিটি বিভাগ । [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [30]:
tokenizer.mask_token_id

4

In [31]:
import collections
import numpy as np
np.random
from transformers import default_data_collator

wwm_probability = 0.15


def bangla_data_collator(features):
    for feature in features:
#         word_ids = feature.pop("word_ids")

#         # Create a map between words and corresponding token indices
#         mapping = collections.defaultdict(list)
#         current_word_index = -1
#         current_word = None
#         for idx, word_id in enumerate(word_ids):
#             if word_id is not None:
#                 if word_id != current_word:
#                     current_word = word_id
#                     current_word_index += 1
#                 mapping[current_word_index].append(idx)

        # Randomly mask words
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        mask = np.random.binomial(1, wwm_probability, (len(input_ids),))
        special_tokens =  [tokenizer.unk_token_id, tokenizer.pad_token_id, tokenizer.cls_token_id, \
                           tokenizer.sep_token_id, tokenizer.mask_token_id]
        
        new_labels = [-100] * len(labels)
        for idx in np.where(mask)[0]:
#             word_id = word_id.item()
#             print(word_id)
#             for idx in mapping[word_id]:
#             if word_ids[idx] is not None:
            if input_ids[idx] not in special_tokens:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
            feature["labels"] = new_labels
        
    return default_data_collator(features)

In [32]:
from transformers import DataCollatorForLanguageModeling

# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
lm_datasets = lm_datasets.remove_columns(["word_ids"])
data_collator = bangla_data_collator

In [33]:
samples = [lm_datasets[i] for i in range(1)]
# for sample in samples:
#     _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] এ [MASK] শিক্ষাপ্রতিষ্ঠানে [MASK] [MASK] প্রচারণা । [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'


In [34]:
samples = [lm_datasets[i] for i in range(1)]

chunk = data_collator(samples)
print(chunk["input_ids"])
print(chunk["labels"])

tensor([[    2,     4,   654, 11486,   108,  3271,  4057,    31,     3,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,  

In [35]:
# train_size = 10_000
# test_size = int(0.1 * train_size)

downsampled_dataset = lm_datasets.train_test_split(
    train_size=0.8, seed=42
)
downsampled_dataset

Loading cached split indices for dataset at /home/ashahri1/.cache/huggingface/datasets/text/default-b038fd0f029731c6/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-cf033cc8a5762aa3.arrow and /home/ashahri1/.cache/huggingface/datasets/text/default-b038fd0f029731c6/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-1e55f565b89104d7.arrow


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 446984
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 111747
    })
})

In [36]:
for idx, sample in enumerate(downsampled_dataset["train"]["input_ids"][:3]):
    print(f"'>>> Article {idx} length: {len(sample)}'")

'>>> Article 0 length: 128'
'>>> Article 1 length: 128'
'>>> Article 2 length: 128'


In [37]:
#  disable weights and biases logging
# import os
# os.environ["WANDB_DISABLED"] = "true"

In [44]:
from transformers import TrainingArguments

batch_size = 64
# Show the training loss with every epoch
logging_steps = len(downsampled_dataset["train"]) // batch_size


training_args = TrainingArguments(
    num_train_epochs = 6,
    #report_to = None,
    output_dir="models/wordpiece/bert-base-pretrained-bdnews24",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    #push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps,
    load_best_model_at_end=True,
    save_strategy = "epoch",
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [45]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

Using amp half precision backend


In [47]:
import math

eval_results = trainer.evaluate()
print(f">>> Perplexity: {eval_results['eval_loss']:.2f}")
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

***** Running Evaluation *****
  Num examples = 111747
  Batch size = 64


>>> Perplexity: 4.81
>>> Perplexity: 122.46


In [48]:
trainer.train()

***** Running training *****
  Num examples = 446984
  Num Epochs = 6
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 41910


Epoch,Training Loss,Validation Loss
1,4.9492,4.886817
2,4.8005,4.744913
3,4.6806,4.659183
4,4.5829,4.590711
5,4.5089,4.542928
5,4.5089,4.519495


***** Running Evaluation *****
  Num examples = 111747
  Batch size = 64
Saving model checkpoint to models/wordpiece/bert-base-pretrained-bdnews24/checkpoint-6985
Configuration saved in models/wordpiece/bert-base-pretrained-bdnews24/checkpoint-6985/config.json
Model weights saved in models/wordpiece/bert-base-pretrained-bdnews24/checkpoint-6985/pytorch_model.bin
tokenizer config file saved in models/wordpiece/bert-base-pretrained-bdnews24/checkpoint-6985/tokenizer_config.json
Special tokens file saved in models/wordpiece/bert-base-pretrained-bdnews24/checkpoint-6985/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 111747
  Batch size = 64
Saving model checkpoint to models/wordpiece/bert-base-pretrained-bdnews24/checkpoint-13970
Configuration saved in models/wordpiece/bert-base-pretrained-bdnews24/checkpoint-13970/config.json
Model weights saved in models/wordpiece/bert-base-pretrained-bdnews24/checkpoint-13970/pytorch_model.bin
tokenizer config file saved in mode

KeyboardInterrupt: 

In [54]:
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

***** Running Evaluation *****
  Num examples = 111747
  Batch size = 64


>>> Perplexity: 91.79


In [49]:
trainer.save_model()

Saving model checkpoint to models/wordpiece/bert-base-pretrained-bdnews24
Configuration saved in models/wordpiece/bert-base-pretrained-bdnews24/config.json
Model weights saved in models/wordpiece/bert-base-pretrained-bdnews24/pytorch_model.bin
tokenizer config file saved in models/wordpiece/bert-base-pretrained-bdnews24/tokenizer_config.json
Special tokens file saved in models/wordpiece/bert-base-pretrained-bdnews24/special_tokens_map.json


In [56]:
# [optional] finish the wandb run, necessary in notebooks
wandb.finish()

0,1
eval/loss,▅█▆▄▃▁▁▁
eval/runtime,█▅▅▅▅▄▅▁
eval/samples_per_second,▁▄▄▄▄▄▄█
eval/steps_per_second,▁▄▄▄▄▄▄█
train/epoch,▁▁▂▂▄▄▅▅▇▇████
train/global_step,▁▂▂▃▃▄▅▆▆▇▇████
train/learning_rate,█▇▅▄▂▁
train/loss,▇▁▁▂▄█
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,4.60742
eval/runtime,129.6655
eval/samples_per_second,861.81
eval/steps_per_second,13.473
train/epoch,6.0
train/global_step,41910.0
train/learning_rate,0.0
train/loss,4.4858
train/total_flos,1.764725100668928e+17
train/train_loss,4.41345


In [50]:
model = BertForMaskedLM.from_pretrained("models/wordpiece/bert-base-pretrained-bdnews24")

loading configuration file models/wordpiece/bert-base-pretrained-bdnews24/config.json
Model config BertConfig {
  "_name_or_path": "models/wordpiece/bert-base-pretrained-bdnews24",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.16.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file models/wordpiece/bert-base-pretrained-bdnews24/pytorch_model.bin
All model checkpoint weights were used when initializing BertForMaskedLM.

All the weights of BertForMaskedLM were initialized from the mode

In [51]:
tokenizer = PreTrainedTokenizerFast.from_pretrained("models/wordpiece/bert-base-pretrained-bdnews24")

Didn't find file models/wordpiece/bert-base-pretrained-bdnews24/added_tokens.json. We won't load it.
loading file None
loading file models/wordpiece/bert-base-pretrained-bdnews24/special_tokens_map.json
loading file models/wordpiece/bert-base-pretrained-bdnews24/tokenizer_config.json
loading file models/wordpiece/bert-base-pretrained-bdnews24/tokenizer.json


In [85]:
def insert_random_mask(batch):
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked_inputs = data_collator(features)
    # Create a new "masked" column for each column in the dataset
    return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}

In [87]:
# downsampled_dataset = downsampled_dataset.remove_columns(["word_ids"])
eval_dataset = downsampled_dataset["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=downsampled_dataset["test"].column_names,
)
eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
        'masked_token_type_ids' : 'token_type_ids'
    }
)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [88]:
eval_dataset[0]

{'input_ids': [2,
  4320,
  468,
  4,
  9912,
  31,
  3,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,

In [90]:
samples = [eval_dataset[i] for i in range(10)]

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] আসল নাম [MASK] রায়হান । [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

'>>> [CLS] তবে [MASK] [MASK] [MASK] সমঝোতা না হওয়ায় ওই [MASK] কোনো সিদ্ধান্ত ছাড়াই শেষ [MASK] [MASK] [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

In [95]:
for idx, _ in enumerate(eval_dataset['input_ids']):
    indexes = [i for i, x in enumerate(eval_dataset[idx]['input_ids']) if x == 4]
    references = [i for i, x in enumerate(eval_dataset[idx]['labels']) if x != -100]
    if indexes != references:
        print(idx)

In [76]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

batch_size = 4
train_dataloader = DataLoader(
    downsampled_dataset["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    eval_dataset, batch_size=batch_size, collate_fn=default_data_collator
)

In [77]:
downsampled_dataset["train"]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 10000
})

In [78]:
downsampled_dataset["test"]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

In [141]:
# from transformers import BertConfig, BertForMaskedLM

# # Building the config
# config = BertConfig()

# # Building the model from the config
# # Model is randomly initialized
# model = BertForMaskedLM(config)

In [44]:
tokenizer = PreTrainedTokenizerFast.from_pretrained("models/wordpiece/bert-base-pretrained-bdnews24")

In [45]:
model = BertForMaskedLM.from_pretrained("models/wordpiece/bert-base-pretrained-bdnews24")

In [83]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [84]:
# from accelerate import Accelerator

# accelerator = Accelerator()
# model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
#     model, optimizer, train_dataloader, eval_dataloader
# )

In [85]:
from transformers import get_scheduler

num_train_epochs = 1
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [87]:
output_dir = "models/wordpiece/bert-base-pretrained-bdnews24-static"

In [88]:
from tqdm.auto import tqdm
import torch
import math

progress_bar = tqdm(range(num_training_steps))

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        #accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        #losses.append(accelerator.gather(loss.repeat(batch_size)))
        losses.append(loss.repeat(batch_size))

    losses = torch.cat(losses)
    losses = losses[: len(eval_dataset)]
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        perplexity = float("inf")

    print(f">>> Epoch {epoch}: Loss: {torch.mean(losses)}")
    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")

    # Save and upload
    #accelerator.wait_for_everyone()
    #unwrapped_model = accelerator.unwrap_model(model)
    #unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    #if accelerator.is_main_process:
        #tokenizer.save_pretrained(output_dir)

  0%|          | 0/2500 [00:00<?, ?it/s]

Configuration saved in models/wordpiece/bert-base-pretrained-bdnews24-static\config.json


>>> Epoch 0: Loss: nan
>>> Epoch 0: Perplexity: nan


Model weights saved in models/wordpiece/bert-base-pretrained-bdnews24-static\pytorch_model.bin
tokenizer config file saved in models/wordpiece/bert-base-pretrained-bdnews24-static\tokenizer_config.json
Special tokens file saved in models/wordpiece/bert-base-pretrained-bdnews24-static\special_tokens_map.json


In [94]:
import torch
torch.cuda.empty_cache()