In [None]:
from transformers import BertForSequenceClassification, BertConfig, CharacterBertModel, \
CharacterBertForPreTraining, CharacterBertConfig, CharacterBertTokenizer
import torch
import os

In [None]:
# Read the CharacterBERT configuration stored under ../data/character-bert and
# build a pre-training model from that configuration only; the trained weights
# are copied in by a later cell.
model_config = CharacterBertConfig.from_pretrained('../data/character-bert')
model = CharacterBertForPreTraining(model_config)

In [None]:
# Load the actual checkpoint file
# NOTE(review): despite its name, `output_directory` is handed straight to
# torch.load, so it has to point at the checkpoint file itself.
output_directory = "long-text/model"

# SECURITY: torch.load unpickles the file, which can execute arbitrary code.
# Only open checkpoints that come from a trusted source.
# map_location="cpu" places the loaded tensors on the CPU.
checkpoint = torch.load(
    output_directory, map_location="cpu"
)

In [None]:
# Copy the weights stored under the checkpoint's 'model' key into the model.
# strict=True requests an exact match between checkpoint and model parameter names.
model.load_state_dict(checkpoint['model'], strict=True)

In [None]:
# Export the model into a local 'character-bert' directory.
# NOTE(review): the fine-tuning cells load a 'character-bert' directory through
# an absolute path — presumably this one; confirm.
model.save_pretrained('character-bert')

In [None]:
from transformers import set_seed

# Fix the random seed before any model is initialised; 42 was an earlier choice.
# set_seed(42)
set_seed(30)

In [None]:
#### LOADING BERT FOR CLASSIFICATION ####

# Only the bert-base-uncased *configuration* is used; the model below is built
# from the config, so no pretrained BERT weights are loaded by this cell.
config = BertConfig.from_pretrained('bert-base-uncased', num_labels=6)  # six question classes (not binary)
model = BertForSequenceClassification(config=config)

In [None]:
# Show the input embedding module of the classifier before the encoder is swapped.
model.bert.embeddings.word_embeddings  # wordpiece embeddings

In [None]:
#### REPLACING BERT WITH CHARACTER_BERT ####

# Raw string literal: the Windows path contains backslash sequences (\D, \C,
# \Q, \c) that an ordinary string literal treats as invalid escape sequences
# (a warning today, an error in future Python versions). The resulting path
# value is unchanged.
# NOTE(review): this looks like the 'character-bert' folder written by
# save_pretrained earlier in the notebook — consider a relative path so the
# notebook runs on other machines.
character_bert_model = CharacterBertModel.from_pretrained(
    r"E:\Documents\Character Bert\Question Classification\character-bert")

# Swap the encoder of the classifier for the CharacterBERT encoder; the rest of
# `model` (including its classification head) is left as created above.
model.bert = character_bert_model

In [None]:
# Show the same attribute again, now that model.bert is the CharacterBERT encoder.
model.bert.embeddings.word_embeddings  # wordpieces are replaced with a CharacterCNN

In [None]:
from datasets import load_dataset

# Read the whole CSV as a single "train" split.
# The path uses forward slashes: the previous backslash form contained the
# invalid escape sequences \d and \B and only resolved on Windows, whereas
# forward slashes are accepted on every platform (and match the other relative
# paths in this notebook).
qa_dataset = load_dataset(
    "csv",
    data_files="../datasets/Bengali Question Classification.csv",
    split="train",
)

In [None]:
# Display the dataset summary as the cell output.
qa_dataset

In [None]:
# Switch the dataset to pandas output so the exploration cells below can use
# Series methods on its columns; undone later with reset_format().
qa_dataset.set_format("pandas")

In [None]:
# Tally how often each distinct value occurs in the "Label" column; the number
# of distinct values is the number of target classes.
label_counts = qa_dataset["Label"].value_counts()
num_labels = len(label_counts)

In [None]:
# Display the number of distinct labels found above.
num_labels

In [None]:
# Length of the longest entry in the "Text" column.
# NOTE(review): .str.len() measures string length (characters), not tokens, so
# the variable name is misleading.
max_token_length = max(qa_dataset['Text'].str.len())
max_token_length

In [None]:
# Split every text on whitespace, count the pieces, and tally how many texts
# share each word count (index = word count, value = number of texts).
count = qa_dataset['Text'].str.split().apply(len).value_counts()

In [None]:
# Order the tally by word count while the index is still numeric, and only then
# turn the index into display labels. Sorting after the conversion would compare
# strings, placing e.g. "10 words:" before "2 words:".
count = count.sort_index()
count.index = count.index.astype(str) + ' words:'

In [None]:
# count

In [None]:
# Undo the earlier set_format("pandas") call before the dataset is tokenized.
qa_dataset.reset_format()

In [None]:
# CharacterBERT tokenizer created with strip_accents and do_lower_case both None.
# NOTE(review): presumably meant to leave the Bengali text unnormalised — confirm
# how the tokenizer interprets None for these two options.
tokenizer = CharacterBertTokenizer(strip_accents=None, do_lower_case=None)

In [None]:
# text="পাটের জেনেটিক ম্যাপ কোন দেশের বিজ্ঞানী আবিষ্কার করেছেন\
# ভারতীয় কৃষিজ পণ্যের অন্যতম আমদানিকারক দেশ কোনটি বিশ্বের সর্ববৃহত্ জনসংখ্যার দেশ কোনটি কোন দেশে খাদ্য ঘাটতি নেই \
# আমাদের দেশের হাইব্রিড ধান বীজের বড় জোগানদার কোন দেশ"

In [None]:
# len(text.split())

In [None]:
# len(tokenizer(text)["input_ids"])

In [None]:
# Display the module tree of the classifier (now wrapping the CharacterBERT encoder).
model

In [None]:
def tokenize_function(example):
    """Tokenize the "Text" field of `example` with the notebook-level tokenizer.

    `example` is whatever `Dataset.map` passes in; its "Text" entry is handed
    to `tokenizer` unchanged and the tokenizer's output is returned as-is.
    """
    texts = example["Text"]
    return tokenizer(texts)

In [None]:
from transformers import DataCollatorWithPadding

# Collator built around the same tokenizer; it is used as collate_fn by the
# DataLoaders further down.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize in batches and drop the raw "Text" column from the result.
tokenized_dataset = qa_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["Text"],
)

In [None]:
# Display the tokenized dataset summary.
tokenized_dataset

In [None]:
def assign_label(example):
    """Replace the textual class in ``example['Label']`` with its integer id.

    Ids follow the order ABBREVIATION=0, DESCRIPTION=1, ENTITY=2, HUMAN=3,
    LOCATION=4, NUMERIC=5. The mapping is applied in place and the same
    ``example`` object is returned. An unknown label raises ``KeyError``.
    """
    class_names = ('ABBREVIATION', 'DESCRIPTION', 'ENTITY', 'HUMAN', 'LOCATION', 'NUMERIC')
    label_to_id = {name: idx for idx, name in enumerate(class_names)}
    example['Label'] = label_to_id[example['Label']]
    return example

In [None]:
# Convert the textual labels to integer ids, then rename the column to "labels"
# (the keyword the training cells pass to the model through **batch).
tokenized_dataset = tokenized_dataset.map(assign_label).rename_column("Label", "labels")

# Return torch tensors from now on, and show the remaining columns.
tokenized_dataset.set_format("torch")
tokenized_dataset.column_names

In [None]:
# Take the first ten tokenized examples, collate them into one batch and decode
# each row of input_ids back to text as a visual check of tokenizer + collator.
samples = [tokenized_dataset[i] for i in range(10)]
# (This bare expression is not the last statement of the cell, so it displays nothing.)
samples

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

In [None]:
# Split into 80% train / 20% test with a fixed seed (42 was an earlier choice).
# Despite the variable name, no downsampling happens here.
downsampled_dataset = tokenized_dataset.train_test_split(train_size=0.8, seed=30)
downsampled_dataset

In [None]:
from torch.utils.data import DataLoader

# Batch size shared by both loaders (4 was an earlier setting).
batch_size = 32

# Training loader shuffles; the evaluation loader keeps the dataset order.
train_dataloader = DataLoader(
    dataset=downsampled_dataset["train"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    dataset=downsampled_dataset["test"],
    batch_size=batch_size,
    collate_fn=data_collator,
)

In [None]:
# Fetch only the first training batch and print the shape of every tensor in it.
# `batch` stays defined afterwards and is reused by the next cell.
for batch in train_dataloader:
    print({k: v.shape for k, v in batch.items()})
    break

In [None]:
import torch
# Sanity check before training: one forward pass on the batch fetched above,
# without gradient tracking, printing the loss and the logits shape.
# NOTE(review): model.eval() is not called first — confirm whether dropout is
# active during this check.
with torch.no_grad():
    outputs = model(**batch)
    print(outputs.loss, outputs.logits.shape)

In [None]:
# Predicted class per example = index of the largest logit along the last axis.
predictions = torch.argmax(outputs.logits, dim=-1)

In [None]:
# Display the predictions of the (still untrained) classifier.
predictions

In [None]:
# NOTE(review): newer transformers releases deprecate/remove transformers.AdamW in
# favour of torch.optim.AdamW (whose default weight_decay differs) — confirm
# against the installed version before upgrading. The k-fold cell reuses this import.
from transformers import AdamW

# Optimize every model parameter with a learning rate of 3e-5 (5e-5 was tried earlier).
# optimizer = AdamW(model.parameters(), lr=5e-5)
optimizer = AdamW(model.parameters(), lr=3e-5)

In [None]:
from transformers import get_scheduler

# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) x (epochs). 6 epochs was an alternative setting.
num_epochs = 7
num_training_steps = len(train_dataloader) * num_epochs

# "linear" schedule without warm-up (a 10% warm-up was an alternative setting).
lr_scheduler = get_scheduler(
    "linear",
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
    optimizer=optimizer,
)
print(num_training_steps)

In [None]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU, and move
# the model there. The device is shown as the cell output.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

In [None]:
from tqdm.auto import tqdm

# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

# Fine-tune the classifier on the training split.
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch to the model's device.
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch contains "labels", so the model output carries a loss.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Apply the update, advance the learning-rate schedule, then clear the
        # gradients so they do not accumulate into the next step.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

In [None]:
# !pip install evaluate

In [None]:
# !pip install sklearn

In [None]:
# !pip install -U scikit-learn scipy matplotlib

In [None]:
import evaluate

# F1 metric object; batches are added in the evaluation loop and the score is
# computed with average="macro".
metric = evaluate.load("f1")
# Example usage:
# results = metric.compute(predictions=[0, 1], references=[0, 1], average="macro")
# print(results)

In [None]:
# Evaluate the fine-tuned classifier on the held-out split: accumulate the
# macro-F1 metric batch by batch and keep every prediction for the
# classification report below.
model.eval()
y_preds = []
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    # Store plain Python ints rather than a list of 0-d tensors, so the list
    # can be handed to scikit-learn without per-element tensor conversion.
    y_preds.extend(predictions.cpu().tolist())
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute(average="macro")

In [None]:
# Reference labels of the held-out split. The evaluation loader does not
# shuffle, so these are in the same order as y_preds.
y_true = downsampled_dataset["test"]["labels"]

In [None]:
from sklearn.metrics import classification_report
# Class names listed in the same order as the integer ids assigned by
# assign_label (ABBREVIATION=0 ... NUMERIC=5).
target_names = ['ABBREVIATION', 'DESCRIPTION', 'ENTITY', 'HUMAN', 'LOCATION', 'NUMERIC']
print(classification_report(y_true, y_preds,target_names=target_names))

In [None]:
from datasets import DatasetDict

In [None]:
from transformers import set_seed

# Re-seed before the cross-validation run; 42 was an earlier choice.
# set_seed(42)
set_seed(30)

In [None]:
# 5-fold stratified cross-validation: for every fold a fresh classifier is
# built around the CharacterBERT encoder, fine-tuned on the training part and
# scored with macro-F1 on the validation part. The per-fold scores are
# collected in `scores`.
# Relies on names defined by earlier cells: tokenized_dataset, data_collator,
# num_labels, device, AdamW, get_scheduler, evaluate and torch.
import numpy as np
from tqdm.auto import tqdm
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader

from datasets import load_dataset

scores = []

# Loop-invariant settings, defined once instead of on every fold.
batch_size = 32

# First make the kfold object
folds = StratifiedKFold(n_splits=5)

# Shuffle once with a fixed seed so the folds are not tied to the CSV order.
tokenized_dataset = tokenized_dataset.shuffle(seed=30)

# Now make our splits based off of the labels.
# We can use `np.zeros()` here since it only works off of indices, we really care about the labels
splits = folds.split(np.zeros(tokenized_dataset.num_rows), tokenized_dataset["labels"])

for train_idxs, val_idxs in splits:
    fold_dataset = DatasetDict({
        "train": tokenized_dataset.select(train_idxs),
        "validation": tokenized_dataset.select(val_idxs),
    })

    train_dataloader = DataLoader(
        fold_dataset["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator
    )
    eval_dataloader = DataLoader(
        fold_dataset["validation"], batch_size=batch_size, collate_fn=data_collator
    )

    #### LOADING BERT FOR CLASSIFICATION ####
    # One output per distinct label (num_labels comes from the label counts).
    config = BertConfig.from_pretrained('bert-base-uncased', num_labels=num_labels)
    model = BertForSequenceClassification(config=config)

    #### REPLACING BERT WITH CHARACTER_BERT ####
    # Raw string: the backslashes of the Windows path would otherwise be read
    # as invalid escape sequences. The path value itself is unchanged.
    character_bert_model = CharacterBertModel.from_pretrained(
        r"E:\Documents\Character Bert\Question Classification\character-bert")
    model.bert = character_bert_model

    model.to(device)

    optimizer = AdamW(model.parameters(), lr=3e-5)

    num_epochs = 6
    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )
    print(num_training_steps)

    # train model on each fold
    progress_bar = tqdm(range(num_training_steps))

    model.train()
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            # Update, advance the schedule, then clear gradients for the next step.
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

    # validation on each fold
    model.eval()
    # A fresh metric object per fold keeps the folds' predictions separate.
    metric = evaluate.load("f1")

    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

    f1_score = metric.compute(average="macro")
    scores.append(f1_score['f1'])
    print(f1_score)

In [None]:
# Macro-F1 of every fold, in fold order.
scores

In [None]:
# Mean macro-F1 over the folds (raises ZeroDivisionError if `scores` is empty).
sum(scores)  / len(scores)