In [1]:
from transformers import BertForSequenceClassification, BertConfig, CharacterBertModel, \
CharacterBertForPreTraining, CharacterBertConfig, CharacterBertTokenizer
import torch
import os

In [2]:
# Build a CharacterBERT pre-training model from the configuration stored in
# ../data/character-bert. The model is constructed from the config only; the
# trained weights are copied in from a checkpoint a few cells further down.
model_config = CharacterBertConfig.from_pretrained('../data/character-bert')
model = CharacterBertForPreTraining(model_config)

In [3]:
# Load the pre-training checkpoint onto the CPU.
# NOTE: despite its name, `output_directory` is handed straight to torch.load,
# so it has to be the path of the checkpoint file itself.
# NOTE(review): torch.load unpickles the file - only load checkpoints that
# come from a trusted source.
output_directory = "long-text/model"

checkpoint = torch.load(
    output_directory, map_location="cpu"
)

In [4]:
# Copy the weights stored under the checkpoint's 'model' key into the model;
# strict=True turns any missing or unexpected key into an error.
model.load_state_dict(checkpoint['model'], strict=True)

<All keys matched successfully>

In [5]:
# Export the loaded model to the relative directory 'character-bert' so it can
# be reopened later with from_pretrained.
model.save_pretrained('character-bert')

In [6]:
from transformers import set_seed

# Fix the random seed for this run (42 is the alternative value that was
# previously commented out here).
SEED = 30
set_seed(SEED)

In [7]:
#### LOADING BERT FOR CLASSIFICATION ####

# Start from the bert-base-uncased configuration with a 6-way classification
# head. The model is built from the config alone (no from_pretrained call), so
# no pretrained BERT weights are loaded here.
config = BertConfig.from_pretrained('bert-base-uncased', num_labels=6)  # six question classes (not binary)
model = BertForSequenceClassification(config=config)

In [8]:
model.bert.embeddings.word_embeddings  # wordpiece embeddings

Embedding(30522, 768, padding_idx=0)

In [9]:
#### REPLACING BERT WITH CHARACTER_BERT ####

# Raw string: the Windows path contains backslash pairs ("\D", "\C", "\Q",
# "\c") that are invalid escape sequences in a normal string literal. The
# resulting path value is unchanged.
# NOTE(review): machine-specific absolute path - presumably the directory
# written by model.save_pretrained('character-bert'); confirm before running
# on another machine.
character_bert_model = CharacterBertModel.from_pretrained(
    r"E:\Documents\Character Bert\Question Classification\character-bert")

# Swap the wordpiece-based encoder for the CharacterBERT encoder; the rest of
# `model` (the classification head) is left as constructed.
model.bert = character_bert_model

Some weights of the model checkpoint at E:\Documents\Character Bert\Question Classification\character-bert were not used when initializing CharacterBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing CharacterBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CharacterBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
model.bert.embeddings.word_embeddings  # wordpieces are replaced with a CharacterCNN

CharacterCnn(
  (char_conv_0): Conv1d(16, 32, kernel_size=(1,), stride=(1,))
  (char_conv_1): Conv1d(16, 32, kernel_size=(2,), stride=(1,))
  (char_conv_2): Conv1d(16, 64, kernel_size=(3,), stride=(1,))
  (char_conv_3): Conv1d(16, 128, kernel_size=(4,), stride=(1,))
  (char_conv_4): Conv1d(16, 256, kernel_size=(5,), stride=(1,))
  (char_conv_5): Conv1d(16, 512, kernel_size=(6,), stride=(1,))
  (char_conv_6): Conv1d(16, 1024, kernel_size=(7,), stride=(1,))
  (_highways): Highway(
    (_layers): ModuleList(
      (0-1): 2 x Linear(in_features=2048, out_features=4096, bias=True)
    )
  )
  (_projection): Linear(in_features=2048, out_features=768, bias=True)
)

In [11]:
from datasets import load_dataset

# Build the path with os.path.join (os is imported in the first cell) instead
# of a backslash literal: "\d" and "\B" are invalid escape sequences, and a
# hard-coded "\" separator only works on Windows. On Windows the resulting
# path string is identical to the previous literal.
dataset_csv = os.path.join("..", "datasets", "Bengali Question Classification.csv")

# The whole CSV is exposed as a single 'train' split.
qa_dataset = load_dataset("csv", data_files=dataset_csv, split="train")

Found cached dataset csv (C:/Users/arifa/.cache/huggingface/datasets/csv/default-1513888b3417f440/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


In [12]:
qa_dataset

Dataset({
    features: ['Text', 'Label'],
    num_rows: 3333
})

In [13]:
# Switch the dataset's output format to pandas so that the column accesses in
# the exploration cells below can use pandas methods (value_counts, .str, ...).
qa_dataset.set_format("pandas")

In [14]:
# Number of rows per label; the number of distinct labels is the number of
# classes the classifier has to separate.
label_counts = qa_dataset["Label"].value_counts()
num_labels = len(label_counts)

In [15]:
num_labels

6

In [16]:
# NOTE: despite the name, this is the length in characters of the longest
# question (Series.str.len), not a token count.
max_token_length = max(qa_dataset['Text'].str.len())
max_token_length

140

In [17]:
# Distribution of question lengths in whitespace-separated words:
# index = words per question, value = number of questions with that length.
count = qa_dataset['Text'].str.split().apply(len).value_counts()

In [18]:
# Sort while the index is still numeric, then attach the label suffix.
# Sorting after the conversion to strings would order the rows
# lexicographically (e.g. "10 words:" before "2 words:").
count = count.sort_index()
count.index = count.index.astype(str) + ' words:'

In [19]:
# count

In [20]:
# Undo the pandas output format set for the exploration cells above, before
# the dataset is tokenized.
qa_dataset.reset_format()

In [21]:
# Tokenizer used together with the CharacterBERT encoder.
# NOTE(review): both options are passed as None explicitly - confirm how
# CharacterBertTokenizer interprets None (presumably "no accent stripping /
# no lower-casing", which would keep the Bengali text intact).
tokenizer = CharacterBertTokenizer(strip_accents=None, do_lower_case=None)

In [22]:
# text="পাটের জেনেটিক ম্যাপ কোন দেশের বিজ্ঞানী আবিষ্কার করেছেন\
# ভারতীয় কৃষিজ পণ্যের অন্যতম আমদানিকারক দেশ কোনটি বিশ্বের সর্ববৃহত্ জনসংখ্যার দেশ কোনটি কোন দেশে খাদ্য ঘাটতি নেই \
# আমাদের দেশের হাইব্রিড ধান বীজের বড় জোগানদার কোন দেশ"

In [23]:
# len(text.split())

In [24]:
# len(tokenizer(text)["input_ids"])

In [25]:
model

BertForSequenceClassification(
  (bert): CharacterBertModel(
    (embeddings): CharacterBertEmbeddings(
      (word_embeddings): CharacterCnn(
        (char_conv_0): Conv1d(16, 32, kernel_size=(1,), stride=(1,))
        (char_conv_1): Conv1d(16, 32, kernel_size=(2,), stride=(1,))
        (char_conv_2): Conv1d(16, 64, kernel_size=(3,), stride=(1,))
        (char_conv_3): Conv1d(16, 128, kernel_size=(4,), stride=(1,))
        (char_conv_4): Conv1d(16, 256, kernel_size=(5,), stride=(1,))
        (char_conv_5): Conv1d(16, 512, kernel_size=(6,), stride=(1,))
        (char_conv_6): Conv1d(16, 1024, kernel_size=(7,), stride=(1,))
        (_highways): Highway(
          (_layers): ModuleList(
            (0-1): 2 x Linear(in_features=2048, out_features=4096, bias=True)
          )
        )
        (_projection): Linear(in_features=2048, out_features=768, bias=True)
      )
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): Laye

In [26]:
def tokenize_function(example):
    """Run the notebook-level tokenizer over the 'Text' field of `example`.

    `example` is whatever `Dataset.map` passes in (a batch of rows when map is
    called with batched=True). The tokenizer output is returned unchanged.
    """
    texts = example["Text"]
    return tokenizer(texts)

In [27]:
from transformers import DataCollatorWithPadding

# Collator that pads the examples of a batch to a common length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize in batches and drop the raw text column, leaving 'Label' plus the
# tokenizer outputs.
tokenized_dataset = qa_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["Text"],
)

Loading cached processed dataset at C:\Users\arifa\.cache\huggingface\datasets\csv\default-1513888b3417f440\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-5495064c59d0879b.arrow


In [28]:
tokenized_dataset

Dataset({
    features: ['Label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3333
})

In [29]:
def assign_label(example):
    """Replace the class name in example['Label'] with its integer id.

    The ids follow the order ABBREVIATION=0 ... NUMERIC=5. The row is modified
    in place and the same object is returned; an unknown class name raises
    KeyError.
    """
    class_names = ('ABBREVIATION', 'DESCRIPTION', 'ENTITY', 'HUMAN', 'LOCATION', 'NUMERIC')
    label_to_id = {name: idx for idx, name in enumerate(class_names)}
    example['Label'] = label_to_id[example['Label']]
    return example

In [30]:
# Encode the string labels as integers and expose them under the column name
# 'labels'.
tokenized_dataset = (
    tokenized_dataset
    .map(assign_label)
    .rename_column("Label", "labels")
)
# From here on, indexing the dataset returns torch tensors.
tokenized_dataset.set_format("torch")
tokenized_dataset.column_names

Loading cached processed dataset at C:\Users\arifa\.cache\huggingface\datasets\csv\default-1513888b3417f440\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-4dc99239422fac02.arrow


['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [31]:
# Collate the first ten examples into one padded batch and decode it back to
# text as a visual check of tokenization and padding.
# (The former bare `samples` expression was removed: it was not the last
# statement of the cell, so it was evaluated and discarded without display.)
samples = [tokenized_dataset[i] for i in range(10)]

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] পাটের জেনেটিক ম্যাপ কোন দেশের বিজ্ঞানী আবিষ্কার করেছেন [SEP] [PAD]'

'>>> [CLS] ভারতীয় কৃষিজ পণ্যের অন্যতম আমদানিকারক দেশ কোনটি [SEP] [PAD] [PAD]'

'>>> [CLS] বিশ্বের সর্ববৃহত্ জনসংখ্যার দেশ কোনটি [SEP] [PAD] [PAD] [PAD] [PAD]'

'>>> [CLS] কোন দেশে খাদ্য ঘাটতি নেই [SEP] [PAD] [PAD] [PAD] [PAD]'

'>>> [CLS] আমাদের দেশের হাইব্রিড ধান বীজের বড় জোগানদার কোন দেশ [SEP]'

'>>> [CLS] বিশ্বের অন্যতম প্রধান চাল রপ্তানিকারক দেশ কোনটি [SEP] [PAD] [PAD]'

'>>> [CLS] IRRI এর সদর দপ্তর কোথায় অবস্থিত [SEP] [PAD] [PAD] [PAD]'

'>>> [CLS] কোন দেশের জনসংখ্যা তুলনামূলকভাবে কম [SEP] [PAD] [PAD] [PAD] [PAD]'

'>>> [CLS] সুপার হাইব্রিড ধানের উদ্ভাবক কোন দেশ [SEP] [PAD] [PAD] [PAD]'

'>>> [CLS] BRRI কোথায় অবস্থিত [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'


In [33]:
# 80/20 train/test split with a fixed seed. Nothing is down-sampled here
# despite the variable name: the full tokenized dataset is split.
split_options = {"train_size": 0.8, "seed": 30}
downsampled_dataset = tokenized_dataset.train_test_split(**split_options)
downsampled_dataset

Loading cached split indices for dataset at C:\Users\arifa\.cache\huggingface\datasets\csv\default-1513888b3417f440\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-f3483ffde9fe5bad.arrow and C:\Users\arifa\.cache\huggingface\datasets\csv\default-1513888b3417f440\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-99b75ff5daae7438.arrow


DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2666
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 667
    })
})

In [34]:
from torch.utils.data import DataLoader

batch_size = 32

# Settings shared by both loaders: same batch size, padding done by the collator.
loader_options = {"batch_size": batch_size, "collate_fn": data_collator}

# Training batches are shuffled; evaluation keeps the dataset order.
train_dataloader = DataLoader(downsampled_dataset["train"], shuffle=True, **loader_options)
eval_dataloader = DataLoader(downsampled_dataset["test"], **loader_options)

In [35]:
# Peek at the first training batch and print the shape of each tensor in it.
# `batch` stays bound afterwards and is reused by the next cell.
for batch in train_dataloader:
    shapes = dict((name, tensor.shape) for name, tensor in batch.items())
    print(shapes)
    break

{'labels': torch.Size([32]), 'input_ids': torch.Size([32, 13, 50]), 'token_type_ids': torch.Size([32, 13]), 'attention_mask': torch.Size([32, 13])}


In [36]:
import torch
# Sanity check: run the batch left over from the previous cell through the
# not-yet-fine-tuned model without tracking gradients, and print the loss and
# the shape of the logits.
# NOTE(review): model.eval() is not called before this forward pass - confirm
# whether dropout is active here.
with torch.no_grad():
    outputs = model(**batch)
    print(outputs.loss, outputs.logits.shape)

tensor(1.8606) torch.Size([32, 6])


In [37]:
# Predicted class id = index of the largest logit along the last dimension.
predictions = torch.argmax(outputs.logits, dim=-1)

In [38]:
predictions

tensor([1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        2, 4, 4, 4, 4, 4, 4, 4])

In [39]:
from transformers import AdamW

# AdamW over all model parameters with learning rate 3e-5; the commented line
# keeps the alternative learning rate of 5e-5.
# NOTE(review): transformers.AdamW has been deprecated upstream in favour of
# torch.optim.AdamW - confirm against the installed version. The two differ in
# default eps / weight_decay, so switch deliberately rather than mechanically.
# optimizer = AdamW(model.parameters(), lr=5e-5)
optimizer = AdamW(model.parameters(), lr=3e-5)

In [40]:
from transformers import get_scheduler

num_epochs = 7  # 6 is the alternative setting
# One optimizer step per training batch, for every epoch.
num_training_steps = len(train_dataloader) * num_epochs

# "linear" schedule over the whole run with no warm-up steps
# (a warm-up of 0.1 * num_training_steps is the alternative setting).
scheduler_options = {
    "optimizer": optimizer,
    "num_warmup_steps": 0,
    "num_training_steps": num_training_steps,
}
lr_scheduler = get_scheduler("linear", **scheduler_options)
print(num_training_steps)

588


In [41]:
import torch

# Prefer the GPU when one is available, otherwise fall back to the CPU, and
# move the model there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

In [42]:
from tqdm.auto import tqdm

# One progress-bar tick per optimizer step.
progress_bar = tqdm(range(num_training_steps))

# Fine-tune the model for num_epochs passes over the training set. The
# optimizer was created over model.parameters(), so encoder and classification
# head are both updated.
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch to the device the model lives on.
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch contains 'labels', so the model output carries the loss.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Apply the update, advance the learning-rate schedule, then clear the
        # gradients so they do not accumulate into the next step.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/588 [00:00<?, ?it/s]

In [43]:
# !pip install evaluate

In [44]:
# !pip install sklearn

In [45]:
# !pip install -U scikit-learn scipy matplotlib

In [46]:
import evaluate

# F1 metric object from the `evaluate` library; the evaluation cell feeds it
# batch by batch (add_batch) and computes the score once at the end.
metric = evaluate.load("f1")
# Usage example (kept for reference):
# results = f1_metric.compute(predictions=[0, 1], references=[0, 1], average="macro")
# print(results)

In [47]:
# Evaluate on the held-out split: collect the predicted class ids for the
# per-class report and feed every batch to the F1 metric.
model.eval()
y_preds = []
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    # Store plain Python ints rather than 0-d tensors, so y_preds is an
    # ordinary list of class ids for the report further down.
    y_preds.extend(predictions.cpu().tolist())
    metric.add_batch(predictions=predictions, references=batch["labels"])

# Macro-averaged F1 over all evaluation batches (displayed as the cell output).
metric.compute(average="macro")

{'f1': 0.9682862258025624}

In [48]:
# Ground-truth labels in dataset order; they line up with y_preds because
# eval_dataloader is created without shuffling.
y_true = downsampled_dataset["test"]["labels"]

In [49]:
from sklearn.metrics import classification_report

# Class names in label-id order (0..5), matching the mapping in assign_label.
target_names = ['ABBREVIATION', 'DESCRIPTION', 'ENTITY', 'HUMAN', 'LOCATION', 'NUMERIC']
report = classification_report(y_true, y_preds, target_names=target_names)
print(report)

              precision    recall  f1-score   support

ABBREVIATION       1.00      0.99      1.00       101
 DESCRIPTION       0.93      0.93      0.93        29
      ENTITY       0.94      0.98      0.96        90
       HUMAN       0.98      0.98      0.98       124
    LOCATION       0.97      0.97      0.97       123
     NUMERIC       0.98      0.97      0.98       200

    accuracy                           0.97       667
   macro avg       0.97      0.97      0.97       667
weighted avg       0.97      0.97      0.97       667



In [52]:
from datasets import DatasetDict

In [53]:
from transformers import set_seed

# Re-seed before the cross-validation run (42 is the alternative value that
# was previously commented out here).
SEED = 30
set_seed(SEED)

In [54]:
# 5-fold stratified cross-validation: for every fold a fresh classifier is
# built, fine-tuned on the training part and scored (macro F1) on the held-out
# part. The per-fold scores are collected in `scores`.
import numpy as np
from tqdm.auto import tqdm
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader

from datasets import load_dataset

scores = []
batch_size = 32
num_epochs = 6

# First make the kfold object
folds = StratifiedKFold(n_splits=5)

# Shuffle into a new name instead of rebinding `tokenized_dataset`, so that
# re-running this cell starts from the same data order rather than shuffling
# an already shuffled dataset.
cv_dataset = tokenized_dataset.shuffle(seed=30)

# Only the labels matter for stratification, so a dummy feature array of the
# right length is enough for `split`.
splits = folds.split(np.zeros(cv_dataset.num_rows), cv_dataset["labels"])

for train_idxs, val_idxs in splits:
    fold_dataset = DatasetDict({
        "train": cv_dataset.select(train_idxs),
        "validation": cv_dataset.select(val_idxs),
    })

    train_dataloader = DataLoader(
        fold_dataset["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator
    )
    eval_dataloader = DataLoader(
        fold_dataset["validation"], batch_size=batch_size, collate_fn=data_collator
    )

    #### LOADING BERT FOR CLASSIFICATION ####
    # One output per question class (num_labels is derived from the data above).
    config = BertConfig.from_pretrained('bert-base-uncased', num_labels=num_labels)
    model = BertForSequenceClassification(config=config)

    #### REPLACING BERT WITH CHARACTER_BERT ####
    # Raw string: the backslashes in the Windows path are not valid escape
    # sequences in a normal string literal. The path value is unchanged.
    character_bert_model = CharacterBertModel.from_pretrained(
        r"E:\Documents\Character Bert\Question Classification\character-bert")
    model.bert = character_bert_model

    model.to(device)

    # Fresh optimizer and schedule for every fold, so no state carries over.
    optimizer = AdamW(model.parameters(), lr=3e-5)

    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )
    print(num_training_steps)

    # train model on each fold
    progress_bar = tqdm(range(num_training_steps))

    model.train()
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

    # validation on each fold
    model.eval()
    metric = evaluate.load("f1")

    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

    f1_score = metric.compute(average="macro")
    scores.append(f1_score['f1'])
    print(f1_score)

Loading cached shuffled indices for dataset at C:\Users\arifa\.cache\huggingface\datasets\csv\default-1513888b3417f440\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-f8ea70031e1c6c88.arrow
Some weights of the model checkpoint at E:\Documents\Character Bert\Question Classification\character-bert were not used when initializing CharacterBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing CharacterBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CharacterBertMo

504


  0%|          | 0/504 [00:00<?, ?it/s]

{'f1': 0.9669699972799092}


Some weights of the model checkpoint at E:\Documents\Character Bert\Question Classification\character-bert were not used when initializing CharacterBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing CharacterBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CharacterBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


504


  0%|          | 0/504 [00:00<?, ?it/s]

{'f1': 0.9672157625936063}


Some weights of the model checkpoint at E:\Documents\Character Bert\Question Classification\character-bert were not used when initializing CharacterBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing CharacterBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CharacterBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


504


  0%|          | 0/504 [00:00<?, ?it/s]

{'f1': 0.9631558232213994}


Some weights of the model checkpoint at E:\Documents\Character Bert\Question Classification\character-bert were not used when initializing CharacterBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing CharacterBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CharacterBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


504


  0%|          | 0/504 [00:00<?, ?it/s]

{'f1': 0.9659492956466419}


Some weights of the model checkpoint at E:\Documents\Character Bert\Question Classification\character-bert were not used when initializing CharacterBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing CharacterBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CharacterBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


504


  0%|          | 0/504 [00:00<?, ?it/s]

{'f1': 0.9583603242994895}


In [55]:
scores

[0.9669699972799092,
 0.9672157625936063,
 0.9631558232213994,
 0.9659492956466419,
 0.9583603242994895]

In [56]:
sum(scores)  / len(scores)

0.9643302406082093