In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
import torch
torch.cuda.get_device_name()

'NVIDIA GeForce RTX 2080 Ti'

In [3]:
# import wandb

# # start a new wandb run to track this script
# wandb.init(
#     # set the wandb project where this run will be logged
#     project="unigram-bert-project-demo",
    
#     # track hyperparameters and run metadata
#     config={
#     "learning_rate": 2e-5,
#     "weight_decay" : 0.01,
#     "architecture": "unigram-bert-base",
#     "dataset": "bdnews24",
#     "epochs": 6,
#     "batch size": 4,
#     }
# )


In [4]:
from datasets import load_dataset

hindi_live_dataset = load_dataset("text", data_files="../datasets/Hindi LiveHindustan.txt", split="train")

Found cached dataset text (/home/ashahri1/.cache/huggingface/datasets/text/default-21edc6bfd2f3f8f9/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


In [5]:
hindi_live_dataset.set_format("pandas")

In [6]:
hindi_live_df = hindi_live_dataset[:]

In [7]:
hindi_live_df_lens = hindi_live_df['text'].str.len()

In [8]:
max(hindi_live_df_lens)

117

In [9]:
# pretraining_df_lens = pretraining_df['text'].str.len()
count = hindi_live_df['text'].str.split().apply(len).value_counts()

In [10]:
count.index = count.index.astype(str) + ' words:'
count.sort_index(inplace=True)

In [11]:
count

text
10 words:    24880
11 words:    27131
12 words:    28455
13 words:    29475
14 words:    29216
15 words:    28239
4 words:      2074
5 words:      4886
6 words:      8374
7 words:     12629
8 words:     17283
9 words:     21205
Name: count, dtype: int64

In [12]:
hindi_live_dataset.reset_format()

In [13]:
hindi_live_dataset

Dataset({
    features: ['text'],
    num_rows: 233847
})

In [14]:
hindi_live_dataset[:5]

{'text': ['रेनो ने कैप्चर एसयूवी को लॉन्च कर दिया है ।',
  'रेनो कारों की रेंज में इसे डस्टर के ऊपर पोजिशन किया जाएगा ।',
  'इसका मुकाबला हुंडई क्रेटा से होगा ।',
  'इससे बेहतर सिंगल सीटर और सेल्फ बैलेंसिंग बाइक आपने पहले कभी नहीं देखी होगी ।',
  'इस बाइक को अत्याधुनिक होवरबोर्ड को ध्यान में रखकर डिजाइन किया गया है ।']}

In [15]:
def get_training_corpus():
    for i in range(0, len(hindi_live_dataset), 1000):
        yield hindi_live_dataset[i : i+1000]["text"]

In [16]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

In [17]:
tokenizer = Tokenizer(models.Unigram())

In [18]:
from tokenizers import Regex

tokenizer.normalizer = normalizers.Sequence(
    [
        normalizers.Replace("``", '"'),
        normalizers.Replace("''", '"'),
        normalizers.NFKC(),
#         normalizers.StripAccents(),
        normalizers.Replace(Regex(" {2,}"), " "),
    ]
)

In [21]:
print(tokenizer.normalizer.normalize_str('रेनो ने कैप्चर एसयूवी को लॉन्च कर दिया है ।'))

रेनो ने कैप्चर एसयूवी को लॉन्च कर दिया है ।


In [22]:
pre_tokenizer = pre_tokenizers.Sequence(
    [pre_tokenizers.WhitespaceSplit(), pre_tokenizers.Punctuation()]
)

In [23]:
# tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()

In [24]:
tokenizer.pre_tokenizer.pre_tokenize_str('रेनो ने कैप्चर एसयूवी को लॉन्च कर दिया है ।')

[('▁रेनो', (0, 4)),
 ('▁ने', (4, 7)),
 ('▁कैप्चर', (7, 14)),
 ('▁एसयूवी', (14, 21)),
 ('▁को', (21, 24)),
 ('▁लॉन्च', (24, 30)),
 ('▁कर', (30, 33)),
 ('▁दिया', (33, 38)),
 ('▁है', (38, 41)),
 ('▁।', (41, 43))]

In [44]:
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.UnigramTrainer(
    vocab_size=30522, special_tokens=special_tokens, unk_token="[UNK]"
)

In [45]:
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)





In [46]:
encoding = tokenizer.encode('रेनो ने कैप्चर एसयूवी को लॉन्च कर दिया है ।')
print(encoding.tokens)

['[CLS]', '▁', 'रेनो', '▁ने', '▁कैप्चर', '▁एस', 'यूवी', '▁को', '▁लॉन्च', '▁कर', '▁दिया', '▁है', '▁', '।', '[SEP]']


In [47]:
cls_token_id = tokenizer.token_to_id("[CLS]")
sep_token_id = tokenizer.token_to_id("[SEP]")
print(cls_token_id, sep_token_id)

2 3


In [55]:
tokenizer.post_processor = processors.TemplateProcessing(
    single=f"[CLS]:0 $A:0 [SEP]:0",
    pair=f"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)

In [56]:
encoding = tokenizer.encode('रेनो ने कैप्चर एसयूवी को लॉन्च कर दिया है ।',' रेनो कारों की रेंज में इसे डस्टर के ऊपर पोजिशन किया जाएगा ।')
print(encoding.tokens)

['[CLS]', '▁', 'रेनो', '▁ने', '▁कैप्चर', '▁एस', 'यूवी', '▁को', '▁लॉन्च', '▁कर', '▁दिया', '▁है', '▁', '।', '[SEP]', '▁', 'रेनो', '▁कार', 'ों', '▁की', '▁रेंज', '▁में', '▁इसे', '▁डस्ट', 'र', '▁के', '▁ऊपर', '▁पोजिशन', '▁किया', '▁जाएग', 'ा', '▁', '।', '[SEP]']


In [58]:
encoding = tokenizer.encode('रेनो ने कैप्चर एसयूवी को लॉन्च कर दिया है ।',' रेनो कारों की रेंज में इसे डस्टर के ऊपर पोजिशन किया जाएगा ।')
print(encoding.tokens)
print(encoding.type_ids)

['[CLS]', '▁', 'रेनो', '▁ने', '▁कैप्चर', '▁एस', 'यूवी', '▁को', '▁लॉन्च', '▁कर', '▁दिया', '▁है', '▁', '।', '[SEP]', '▁', 'रेनो', '▁कार', 'ों', '▁की', '▁रेंज', '▁में', '▁इसे', '▁डस्ट', 'र', '▁के', '▁ऊपर', '▁पोजिशन', '▁किया', '▁जाएग', 'ा', '▁', '।', '[SEP]']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [59]:
tokenizer.decoder = decoders.Metaspace()

In [60]:
tokenizer.decode(encoding.ids)

'रेनो ने कैप्चर एसयूवी को लॉन्च कर दिया है । रेनो कारों की रेंज में इसे डस्टर के ऊपर पोजिशन किया जाएगा ।'

In [61]:
tokenizer.enable_truncation(max_length=512)

In [63]:
tokenizer.save("unigram_tokenizer_hindi_live.json")

In [62]:
# tokenizer = Tokenizer.from_file("unigram_tokenizer_prothom_alo.json")

In [16]:
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast(
    #tokenizer_object=tokenizer,
    tokenizer_file="unigram_tokenizer_hindi_live.json", # You can load from the tokenizer file, alternatively
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
    return_special_tokens_mask = True,
    model_max_length = 512,
)

In [17]:
tokenizer.save_pretrained("unigram_tokenizers_hindi_live")

('unigram_tokenizers_hindi_live/tokenizer_config.json',
 'unigram_tokenizers_hindi_live/special_tokens_map.json',
 'unigram_tokenizers_hindi_live/tokenizer.json')

In [18]:
# from transformers import PreTrainedTokenizerFast

# tokenizer = PreTrainedTokenizerFast.from_pretrained("unigram_tokenizers_prothom_alo")

In [19]:
from transformers import BertConfig, BertForMaskedLM

# Building the config
config = BertConfig()

print(config)

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.16.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [20]:
unigram_bert_config = BertConfig(pad_token_id=tokenizer.pad_token_id)
print(unigram_bert_config)

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.16.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [21]:
unigram_bert_config.save_pretrained(save_directory="configs/unigram_bert_config/")

In [22]:
unigram_bert_config = BertConfig.from_pretrained("configs/unigram_bert_config/config.json")
# Building the model from the config
# Model is randomly initialized
model = BertForMaskedLM(unigram_bert_config)

In [23]:
unigram_bert_config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.16.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [24]:
# text = 'रेनो ने कैप्चर एसयूवी को लॉन्च कर दिया है ।'
text = 'रेनो ने कैप्चर एसयूवी को लॉन्च [MASK] दिया है ।'

In [25]:
tokenizer.tokenize(text)

['▁',
 'रेनो',
 '▁ने',
 '▁कैप्चर',
 '▁एस',
 'यूवी',
 '▁को',
 '▁लॉन्च',
 '▁',
 '[MASK]',
 '▁दिया',
 '▁है',
 '▁',
 '।']

In [26]:
import torch 

inputs = tokenizer(text, return_tensors="pt")
# inputs.to("cuda")

token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> रेनो ने कैप्चर एसयूवी को लॉन्च सेलेबेट्री दिया है ।'
'>>> रेनो ने कैप्चर एसयूवी को लॉन्च बौर दिया है ।'
'>>> रेनो ने कैप्चर एसयूवी को लॉन्च  दिया है ।'
'>>> रेनो ने कैप्चर एसयूवी को लॉन्च देवता दिया है ।'
'>>> रेनो ने कैप्चर एसयूवी को लॉन्च सरहद दिया है ।'


In [27]:
def tokenize_function(examples):
    result = tokenizer(examples["text"], padding="max_length", max_length=120, truncation=True)
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = hindi_live_dataset.map(
    tokenize_function, batched=True, remove_columns=["text"]
)
tokenized_datasets

Loading cached processed dataset at /home/ashahri1/.cache/huggingface/datasets/text/default-21edc6bfd2f3f8f9/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-c0fb07c92c9563a5.arrow


Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
    num_rows: 233847
})

In [28]:
tokenized_datasets = tokenized_datasets.remove_columns("token_type_ids")

In [29]:
tokenized_datasets[0]

{'input_ids': [2,
  5,
  6496,
  27,
  7479,
  1629,
  4041,
  16,
  883,
  34,
  106,
  8,
  5,
  6,
  3,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,

In [30]:
def group_texts(examples):
    # Create a new labels column
    examples["labels"] = examples["input_ids"].copy()
    return examples

In [31]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Loading cached processed dataset at /home/ashahri1/.cache/huggingface/datasets/text/default-21edc6bfd2f3f8f9/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-1ff5acb887296495.arrow


Dataset({
    features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
    num_rows: 233847
})

In [32]:
tokenizer.decode(lm_datasets[1]["input_ids"])

'[CLS] रेनो कारों की रेंज में इसे डस्टर के ऊपर पोजिशन किया जाएगा ।[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]'

In [33]:
tokenizer.decode(lm_datasets[1]["labels"])

'[CLS] रेनो कारों की रेंज में इसे डस्टर के ऊपर पोजिशन किया जाएगा ।[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]'

In [34]:
tokenizer.pad_token_id

1

In [35]:
import collections
import numpy as np
np.random
from transformers import default_data_collator

wwm_probability = 0.15


def bangla_data_collator(features):
    for feature in features:
#         word_ids = feature.pop("word_ids")

#         # Create a map between words and corresponding token indices
#         mapping = collections.defaultdict(list)
#         current_word_index = -1
#         current_word = None
#         for idx, word_id in enumerate(word_ids):
#             if word_id is not None:
#                 if word_id != current_word:
#                     current_word = word_id
#                     current_word_index += 1
#                 mapping[current_word_index].append(idx)

        # Randomly mask words
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        mask = np.random.binomial(1, wwm_probability, (len(input_ids),))
        special_tokens =  [tokenizer.unk_token_id, tokenizer.pad_token_id, tokenizer.cls_token_id, \
                           tokenizer.sep_token_id, tokenizer.mask_token_id]
        
        new_labels = [-100] * len(labels)
        for idx in np.where(mask)[0]:
#             word_id = word_id.item()
#             print(word_id)
#             for idx in mapping[word_id]:
#             if word_ids[idx] is not None:
            if input_ids[idx] not in special_tokens:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
            feature["labels"] = new_labels
        
    return default_data_collator(features)

In [36]:
from transformers import DataCollatorForLanguageModeling

# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
lm_datasets = lm_datasets.remove_columns(["word_ids"])
data_collator = bangla_data_collator

In [37]:
samples = [lm_datasets[i] for i in range(3)]
# for sample in samples:
#     _ = sample.pop("word_ids")

for chunk in bangla_data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS][MASK]रेनो ने कैप्चर एसयूवी को लॉन्च कर दिया है ।[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]'

'>>> [CLS] रेनो[MASK]ों की रेंज में इसे डस्टर के ऊपर पोजिशन किया[MASK]ा ।[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD

In [38]:
hindi_live_dataset[:3]

{'text': ['रेनो ने कैप्चर एसयूवी को लॉन्च कर दिया है ।',
  'रेनो कारों की रेंज में इसे डस्टर के ऊपर पोजिशन किया जाएगा ।',
  'इसका मुकाबला हुंडई क्रेटा से होगा ।']}

In [39]:
samples = [lm_datasets[i] for i in range(1)]

chunk = data_collator(samples)
print(chunk["input_ids"])
print(chunk["labels"])

tensor([[   2,    5, 6496,   27, 7479, 1629, 4041,   16,    4,   34,  106,    8,
            5,    6,    3,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1]])
tensor([[-100, -100, -100, -100, -100, -100, -100, -100,  883, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100,

In [40]:
# train_size = 10_000
# test_size = int(0.1 * train_size)

downsampled_dataset = lm_datasets.train_test_split(
    train_size=0.8, seed=42
)
downsampled_dataset

Loading cached split indices for dataset at /home/ashahri1/.cache/huggingface/datasets/text/default-21edc6bfd2f3f8f9/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-6bfb4ff43582e1e5.arrow and /home/ashahri1/.cache/huggingface/datasets/text/default-21edc6bfd2f3f8f9/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-4638031af91bf4e2.arrow


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 187077
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 46770
    })
})

In [43]:
for idx, sample in enumerate(downsampled_dataset["train"]["input_ids"][:3]):
    print(f"'>>> Article {idx} length: {len(sample)}'")

'>>> Article 0 length: 120'
'>>> Article 1 length: 120'
'>>> Article 2 length: 120'


In [61]:
#  disable weights and biases logging
import os
os.environ["WANDB_DISABLED"] = "true"

In [62]:
# from __future__ import division

In [63]:
from transformers import TrainingArguments

batch_size = 64
eval_batch_size = 64
# Show the training loss with every epoch
logging_steps = len(downsampled_dataset["train"]) // batch_size


training_args = TrainingArguments(
    num_train_epochs=3,
    report_to = None,
    output_dir="models/unigram/bert-base-pretrained-hindi_live",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=eval_batch_size,
    #push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps,
    load_best_model_at_end=True,
    save_strategy = "epoch",
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [64]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

Using amp half precision backend


In [65]:
import math

eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

***** Running Evaluation *****
  Num examples = 46770
  Batch size = 64


>>> Perplexity: 23.12


In [56]:
trainer.train()

***** Running training *****
  Num examples = 187077
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 8772


Epoch,Training Loss,Validation Loss
1,2.4885,3.312951
2,2.4959,3.235848
3,2.63,3.154999


***** Running Evaluation *****
  Num examples = 46770
  Batch size = 64
Saving model checkpoint to models/unigram/bert-base-pretrained-hindi_live/checkpoint-2924
Configuration saved in models/unigram/bert-base-pretrained-hindi_live/checkpoint-2924/config.json
Model weights saved in models/unigram/bert-base-pretrained-hindi_live/checkpoint-2924/pytorch_model.bin
tokenizer config file saved in models/unigram/bert-base-pretrained-hindi_live/checkpoint-2924/tokenizer_config.json
Special tokens file saved in models/unigram/bert-base-pretrained-hindi_live/checkpoint-2924/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 46770
  Batch size = 64
Saving model checkpoint to models/unigram/bert-base-pretrained-hindi_live/checkpoint-5848
Configuration saved in models/unigram/bert-base-pretrained-hindi_live/checkpoint-5848/config.json
Model weights saved in models/unigram/bert-base-pretrained-hindi_live/checkpoint-5848/pytorch_model.bin
tokenizer config file saved in models/un

TrainOutput(global_step=8772, training_loss=2.5382719647650624, metrics={'train_runtime': 2430.4739, 'train_samples_per_second': 230.914, 'train_steps_per_second': 3.609, 'total_flos': 3.4621570381068e+16, 'train_loss': 2.5382719647650624, 'epoch': 3.0})

In [57]:
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

***** Running Evaluation *****
  Num examples = 46770
  Batch size = 64


>>> Perplexity: 23.27


In [58]:
trainer.save_model()

Saving model checkpoint to models/unigram/bert-base-pretrained-hindi_live
Configuration saved in models/unigram/bert-base-pretrained-hindi_live/config.json
Model weights saved in models/unigram/bert-base-pretrained-hindi_live/pytorch_model.bin
tokenizer config file saved in models/unigram/bert-base-pretrained-hindi_live/tokenizer_config.json
Special tokens file saved in models/unigram/bert-base-pretrained-hindi_live/special_tokens_map.json


In [128]:
# [optional] finish the wandb run, necessary in notebooks
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,▇▅▂█▇█▅▁█▇█
eval/runtime,▇▄█▂▁▂▄▄▁▁▃
eval/samples_per_second,▂▄▁▆█▇▅▄█▇▆
eval/steps_per_second,▂▄▁▆█▇▅▄█▇▆
train/epoch,█▁▁▅▅████▁▁▅▅████
train/global_step,█▁▃▃▆▆████▁▃▃▆▆████
train/learning_rate,█▄▁█▄▁
train/loss,▅▇█▁▅▇
train/total_flos,▁▁
train/train_loss,█▁

0,1
eval/loss,6.92882
eval/runtime,4.4849
eval/samples_per_second,222.968
eval/steps_per_second,55.742
train/epoch,3.0
train/global_step,7500.0
train/learning_rate,0.0
train/loss,6.4857
train/total_flos,1974036096000000.0
train/train_loss,6.33972


In [59]:
model = BertForMaskedLM.from_pretrained("models/unigram/bert-base-pretrained-hindi_live")

loading configuration file models/unigram/bert-base-pretrained-hindi_live/config.json
Model config BertConfig {
  "_name_or_path": "models/unigram/bert-base-pretrained-hindi_live",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.16.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file models/unigram/bert-base-pretrained-hindi_live/pytorch_model.bin
All model checkpoint weights were used when initializing BertForMaskedLM.

All the weights of BertForMaskedLM were initialized from the mode

In [60]:
tokenizer = PreTrainedTokenizerFast.from_pretrained("models/unigram/bert-base-pretrained-hindi_live",)

Didn't find file models/unigram/bert-base-pretrained-hindi_live/added_tokens.json. We won't load it.
loading file None
loading file models/unigram/bert-base-pretrained-hindi_live/special_tokens_map.json
loading file models/unigram/bert-base-pretrained-hindi_live/tokenizer_config.json
loading file models/unigram/bert-base-pretrained-hindi_live/tokenizer.json


In [41]:
def insert_random_mask(batch):
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked_inputs = data_collator(features)
    # Create a new "masked" column for each column in the dataset
    return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}

In [43]:
# downsampled_dataset = downsampled_dataset.remove_columns(["word_ids"])
eval_dataset = downsampled_dataset["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=downsampled_dataset["test"].column_names,
)
eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    }
)

Loading cached processed dataset at /home/ashahri1/.cache/huggingface/datasets/text/default-21edc6bfd2f3f8f9/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-179a99cbb4b7681f.arrow


In [69]:
samples = [eval_dataset[i] for i in range(1)]

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS][MASK] रॉक[MASK] के बाद[MASK][MASK][MASK] बैठे [MASK][SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]'


In [54]:
tokenizer.encode("[MASK]")

[2, 4, 3]

In [55]:
idx=3

In [56]:
tokenizer.decode(eval_dataset[idx]['input_ids'])

In [57]:
downsampled_dataset["test"][idx]

In [69]:
eval_dataset[idx]['input_ids']

[2,
 2585,
 9,
 18561,
 143,
 4,
 4,
 4,
 5,
 6,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

In [70]:
indexes = [i for i, x in enumerate(eval_dataset[idx]['input_ids']) if x == 4]
indexes

[5, 6, 7]

In [60]:
for idx, _ in enumerate(eval_dataset['input_ids']):
    indexes = [i for i, x in enumerate(eval_dataset[idx]['input_ids']) if x == 4]
    references = [i for i, x in enumerate(eval_dataset[idx]['labels']) if x != -100]
    if indexes != references:
        print(idx)

In [71]:
references = [i for i, x in enumerate(eval_dataset[idx]['labels']) if x != -100]
references

[5, 6, 7]

In [73]:
eval_dataset[idx]['labels'] 

[-100,
 -100,
 -100,
 -100,
 -100,
 10254,
 10,
 15591,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100]

In [None]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

batch_size = 4
train_dataloader = DataLoader(
    downsampled_dataset["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    eval_dataset, batch_size=batch_size, collate_fn=default_data_collator
)

In [None]:
downsampled_dataset["train"]

In [None]:
downsampled_dataset["test"]

In [None]:
from transformers import BertConfig, BertForMaskedLM

# Building the config
config = BertConfig()

# Building the model from the config
# Model is randomly initialized
model = BertForMaskedLM(config)

In [None]:
tokenizer = PreTrainedTokenizerFast.from_pretrained("models/bert-base-pretrained-bdnews24-static")

In [None]:
model = BertForMaskedLM.from_pretrained("models/bert-base-pretrained-bdnews24-static")

In [None]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
# from accelerate import Accelerator

# accelerator = Accelerator()
# model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
#     model, optimizer, train_dataloader, eval_dataloader
# )

In [None]:
from transformers import get_scheduler

num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
output_dir = "models/bert-base-pretrained-bdnews24-static"

In [None]:
from tqdm.auto import tqdm
import torch
import math

progress_bar = tqdm(range(num_training_steps))

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        #accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        #losses.append(accelerator.gather(loss.repeat(batch_size)))
        losses.append(loss.repeat(batch_size))

    losses = torch.cat(losses)
    losses = losses[: len(eval_dataset)]
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        perplexity = float("inf")

    print(f">>> Epoch {epoch}: Loss: {torch.mean(losses)}")
    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")

    # Save and upload
    #accelerator.wait_for_everyone()
    #unwrapped_model = accelerator.unwrap_model(model)
    #unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    #if accelerator.is_main_process:
        #tokenizer.save_pretrained(output_dir)

In [None]:
# from accelerate import notebook_launcher

# notebook_launcher(training_function(model, optimizer, train_dataloader, eval_dataloader), num_processes=1)

In [71]:
import torch
torch.cuda.empty_cache()