In [None]:
from datasets import load_dataset


# Training split, read from CSV with explicit column names.
# The path is a raw string: the previous literal relied on invalid escape
# sequences (`\d`, `\s`) being passed through unchanged, which newer Python
# versions warn about. The resulting path value is identical.
train_dataset = load_dataset(
    "csv",
    data_files=r"..\datasets\soham-articles\bn\bn-train.csv",
    column_names=["labels", "text"],
    split="train",
)

In [None]:
# Validation split, read from CSV with explicit column names.
# Raw string avoids the invalid `\d` / `\s` escape sequences of the previous
# literal while producing exactly the same path value.
valid_dataset = load_dataset(
    "csv",
    data_files=r"..\datasets\soham-articles\bn\bn-valid.csv",
    column_names=["labels", "text"],
    split="train",
)

In [None]:
# Test split, read from CSV with explicit column names.
# Raw string avoids the invalid `\d` / `\s` escape sequences of the previous
# literal while producing exactly the same path value.
test_dataset = load_dataset(
    "csv",
    data_files=r"..\datasets\soham-articles\bn\bn-test.csv",
    column_names=["labels", "text"],
    split="train",
)

In [None]:
from datasets import Dataset, DatasetDict

# Bundle the three splits into one DatasetDict keyed by split name
# (insertion order: train, test, validation).
news_datasets = DatasetDict(
    {
        'train': train_dataset,
        'test': test_dataset,
        'validation': valid_dataset,
    }
)

In [None]:
news_datasets

In [None]:
# Switch the training split's output format to pandas so the exploration cells
# below can use Series methods (.value_counts(), the .str accessor).
train_dataset.set_format("pandas")

In [None]:
# Tally how many training examples carry each label value.
label_counts = train_dataset["labels"].value_counts()
# One output class per distinct label found in the training split.
num_labels = len(label_counts)

In [None]:
label_counts

In [None]:
# NOTE: .str.len() measures each text in characters, so despite its name this
# is the character count of the longest training text, not a token count.
max_token_length = max(train_dataset['text'].str.len())
max_token_length

In [None]:
# Distribution of text lengths in whitespace-separated words:
# index = number of words, value = how many training texts have that length.
count = train_dataset['text'].str.split().apply(len).value_counts()

In [None]:
# Order by the numeric word count first, then turn the index into display
# labels. Sorting after the string conversion would order the rows
# lexicographically ("10 words:" ahead of "2 words:").
# Re-running this cell on its own appends the suffix again; re-run the cell
# that builds `count` first.
count = count.sort_index()
count.index = count.index.astype(str) + ' words:'

In [None]:
count

In [None]:
# Undo the pandas output format set for the exploration cells above, before
# the split is tokenized and fed to the model.
train_dataset.reset_format()

In [None]:
from transformers import set_seed

# Seed the random number generators (through transformers.set_seed) before the
# model is instantiated in the following cell.
set_seed(30)
# set_seed(42)

In [None]:
from transformers import BertForSequenceClassification, BertConfig, CharacterBertModel, CharacterBertTokenizer

#### LOADING BERT FOR CLASSIFICATION ####

# Start from the 'bert-base-uncased' configuration, with the classification
# head sized to the number of distinct labels counted in the training split.
config = BertConfig.from_pretrained('bert-base-uncased', num_labels=num_labels)  # one output per news category (multi-class, not binary)
# Built from the config object only (no from_pretrained call here); the encoder
# in `model.bert` is replaced by CharacterBERT in a later cell.
model = BertForSequenceClassification(config=config)

In [None]:
model.bert.embeddings.word_embeddings  # wordpiece embeddings

In [None]:
#### REPLACING BERT WITH CHARACTER_BERT ####

# Raw string: the previous literal depended on invalid escape sequences
# (`\D`, `\C`, `\Q`, `\c`) being passed through unchanged. The path value is
# identical.
# NOTE(review): this is an absolute, machine-specific path; consider moving it
# to a configurable location.
character_bert_model = CharacterBertModel.from_pretrained(
    r"E:\Documents\Character Bert\Question Classification\character-bert")
# Swap the encoder of the classification model for the CharacterBERT encoder;
# the classification head created with `model` is kept.
model.bert = character_bert_model

In [None]:
model.bert.embeddings.word_embeddings  # wordpieces are replaced with a CharacterCNN

In [None]:
# import torch

# # Load the actual checkpoint file
# checkpoint = torch.load(
#     output_directory, map_location="cpu"
# )

In [None]:
# model.load_state_dict(checkpoint['model'], strict=True)

In [None]:
# Tokenizer matching the CharacterBERT encoder. Both normalisation switches are
# passed as None — presumably so the Bengali text is neither lower-cased nor
# accent-stripped; TODO confirm how the tokenizer interprets None.
tokenizer = CharacterBertTokenizer(strip_accents=None, do_lower_case=None)

In [None]:
def tokenize_function(example):
    """Tokenize the ``text`` field of a batch of examples.

    Intended for ``Dataset.map(..., batched=True)``, where ``example["text"]``
    is the list of texts in the batch.

    Args:
        example: Mapping with a ``"text"`` entry to tokenize.

    Returns:
        The tokenizer's output for the given texts, cut to at most 128 tokens
        per sequence.
    """
    # truncation=True makes the 128-token cap explicit. Passing max_length on
    # its own relies on the tokenizer's implicit fallback, which warns on use.
    return tokenizer(example["text"], max_length=128, truncation=True)

In [None]:
from transformers import DataCollatorWithPadding

# Tokenize every split in batches; the raw "text" column is dropped so only the
# tokenizer outputs and the "labels" column remain.
tokenized_datasets = news_datasets.map(tokenize_function, batched=True, remove_columns=["text"])
# Collator used by the DataLoaders and the Trainer below to assemble batches
# with the tokenizer's padding.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
tokenized_datasets

In [None]:
# temp = tokenized_datasets.filter(lambda x:x if tokenizer.unk_token_id in x["input_ids"] else None)

In [None]:
# temp

In [None]:
# for sample in temp["train"]:
#     print(tokenizer.decode(sample["input_ids"]))

In [None]:
# "kolkata":4603,
# "state":2245,
# "national":1435,
# "sports":1289,
# "entertainment":1186,
# "international":526

In [None]:
# Category name -> integer class id. Built once instead of on every call
# (the function runs once per dataset row). The id order matches the
# `target_names` list used for the classification report.
LABEL_TO_ID = {
    "kolkata": 0,
    "state": 1,
    "national": 2,
    "sports": 3,
    "entertainment": 4,
    "international": 5,
}


def assign_label(example):
    """Replace the string category in ``example['labels']`` with its class id.

    Args:
        example: A single dataset row (mapping) whose ``'labels'`` entry is one
            of the category names in ``LABEL_TO_ID``.

    Returns:
        The same ``example`` object, modified in place, with ``'labels'`` set
        to the integer class id.

    Raises:
        KeyError: If the label is not a known category name (this includes
            rows that have already been converted to integer ids).
    """
    label = example['labels']
    try:
        example['labels'] = LABEL_TO_ID[label]
    except KeyError:
        # Same exception type as a plain dict lookup, but with a message that
        # names the offending value and the accepted categories.
        raise KeyError(
            f"Unknown label {label!r}; expected one of {sorted(LABEL_TO_ID)}"
        ) from None
    return example

In [None]:
# Convert the string categories to integer class ids, row by row.
# NOTE: this cell is not re-runnable on its own — once the labels are integers,
# assign_label no longer finds them among its string keys and raises KeyError.
tokenized_datasets = tokenized_datasets.map(assign_label)
# Return torch tensors from now on, as required by the DataLoaders and model.
tokenized_datasets.set_format("torch")
tokenized_datasets.column_names

In [None]:
# Sanity check: take the first five tokenized training rows...
samples = [tokenized_datasets["train"][i] for i in range(5)]
samples

# ...collate them into one padded batch and decode each sequence back to text
# to inspect what the model will actually see.
for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

In [None]:
news_datasets["train"][:5]

In [None]:
from torch.utils.data import DataLoader
# batch_size = 16
batch_size = 32

# Shuffled loader over the training split; batches are padded by the collator.
train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator
)
# NOTE(review): this loader wraps the *test* split, whereas the Trainer below
# evaluates on the validation split — confirm which one is intended here.
eval_dataloader = DataLoader(
    tokenized_datasets["test"], batch_size=batch_size, collate_fn=data_collator
)

In [None]:
# Grab a single training batch for inspection. next(iter(...)) fails right here
# if the loader is empty, instead of silently leaving `batch` undefined or
# stale from an earlier run as the for/break form would.
batch = next(iter(train_dataloader))
# Show the tensor shape of every entry in the batch.
{k: v.shape for k, v in batch.items()}

In [None]:
import torch
# Smoke test: one forward pass on the inspected batch without tracking
# gradients, printing the loss and the logits shape.
with torch.no_grad():
    outputs = model(**batch)
    print(outputs.loss, outputs.logits.shape)

In [None]:
import numpy as np
import evaluate

# Accuracy metric from the `evaluate` library, used by compute_metrics below.
metric_fun = evaluate.load("accuracy")

def compute_metrics(eval_preds):
    """Score a set of predictions with the accuracy metric.

    Args:
        eval_preds: A ``(logits, labels)`` pair, as handed over by the Trainer
            this function is registered with.

    Returns:
        A dict with a single ``"accuracy"`` entry.
    """
    logits, labels = eval_preds
    # The predicted class is the index of the largest logit on the last axis.
    predicted_ids = np.argmax(logits, axis=-1)
    scores = metric_fun.compute(predictions=predicted_ids, references=labels)
    return {"accuracy": scores["accuracy"]}

In [None]:
# Disable Weights & Biases logging through its environment switch. This is set
# here, ahead of the cells that build TrainingArguments and the Trainer.
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
from transformers import TrainingArguments

# batch_size = 16
batch_size = 32
# Show the training loss with every epoch: log once per (approximately) one
# pass over the training split. max(1, ...) keeps the value valid when the
# split is smaller than a single batch.
logging_steps = max(1, len(tokenized_datasets["train"]) // batch_size)


training_args = TrainingArguments(
    output_dir="models/bert-unigram-bengali-classifier",
    overwrite_output_dir=True,
    # NOTE(review): depending on the transformers version, report_to=None may
    # still enable every installed integration; the notebook separately sets
    # WANDB_DISABLED and removes the TensorBoard callback.
    report_to = None,
    logging_dir= None,
    # Previously computed but never passed, so the default logging interval
    # was used instead of the per-epoch one intended above.
    logging_steps=logging_steps,
    save_strategy="no",  # no checkpoints are written during training
    evaluation_strategy="epoch",  # evaluate on eval_dataset after every epoch
    #learning_rate=2e-5,
    learning_rate=3e-5,
    weight_decay=0.01,
    #weight_decay=0.02,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    #num_train_epochs=6,
    #push_to_hub=True,
    fp16=True,  # mixed-precision training
)

In [None]:
# from datasets import concatenate_datasets

# entire_train = concatenate_datasets([tokenized_datasets["train"], tokenized_datasets["validation"]]) 

In [None]:
from transformers import Trainer

# Trainer wiring: fine-tune on the training split, evaluate on the validation
# split, pad batches with the shared collator and report accuracy through
# compute_metrics. The test split is kept for the final evaluation cells.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    #train_dataset=entire_train,
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
import transformers
# Drop the TensorBoard reporting callback from the trainer so no TensorBoard
# logs are written during training.
trainer.remove_callback(transformers.integrations.TensorBoardCallback)

In [None]:
# Baseline: validation metrics before any fine-tuning step has run.
trainer.evaluate(tokenized_datasets["validation"])

In [None]:
# Fine-tune with the settings in `training_args`; validation metrics are
# computed after every epoch (evaluation_strategy="epoch").
trainer.train()

In [None]:
# Final metrics on the test split, which was not used during training.
trainer.evaluate(tokenized_datasets["test"])

In [None]:
# NOTE: despite the name, this value is passed to torch.save as the target
# *file* path of the checkpoint, not as a directory.
output_directory = "best-model"

In [None]:
# Persist the model weights, wrapped in a dict under the 'model' key, to the
# path held in `output_directory`.
torch.save({'model': model.state_dict()}, output_directory)

In [None]:
# Run the trained model over the test split. The result unpacks into three
# parts: raw per-class scores (reduced to class ids in the next cell), the
# reference labels, and a third value that is discarded here.
y_preds, y_true, _ = trainer.predict(tokenized_datasets["test"])

In [None]:
# Reduce the per-class scores to the index of the highest-scoring class.
# NOTE: `y_preds` is overwritten, so this cell must not be re-run without
# re-running the prediction cell first.
y_preds = np.argmax(y_preds, axis=-1)

In [None]:
from sklearn.metrics import classification_report
# Class names in class-id order (index i is the name of class id i), matching
# the mapping used by assign_label.
target_names = [
    "kolkata",
    "state",
    "national",
    "sports",
    "entertainment",
    "international",
]

# Pin the class ids explicitly: without `labels`, scikit-learn infers the
# classes from the data, and the report no longer lines up with target_names
# if some class never appears in the test references or predictions.
print(
    classification_report(
        y_true,
        y_preds,
        labels=list(range(len(target_names))),
        target_names=target_names,
    )
)

In [None]:
# !pip install seaborn

In [None]:
import matplotlib.pyplot as plt
from seaborn import heatmap
from sklearn.metrics import confusion_matrix

#plot heatmap of confusion matrix
# Pin the class ids so the matrix is always len(target_names) square and its
# rows/columns line up with the tick labels, even if a class is missing from
# the test references or predictions.
mat = confusion_matrix(y_true, y_preds, labels=list(range(len(target_names))))
heatmap(mat, cmap="Pastel1_r", fmt="d", xticklabels=target_names, yticklabels=target_names, annot=True)

# confusion_matrix puts the true classes on the rows and the predicted classes
# on the columns; label the axes so the figure can be read on its own.
plt.xlabel('Predicted label')
plt.ylabel('True label')

#add overall title to plot
# NOTE(review): "QC" looks carried over from another notebook — confirm the
# intended title for this news-classification task.
plt.title('Confusion matrix for QC', fontsize = 12)

In [None]:
# Positions of test examples whose true class is 5 ("international") but which
# were predicted as 2 ("national"). Requiring those two different ids already
# implies that prediction and reference disagree.
misclassified = [
    i
    for i, (pred, true) in enumerate(zip(y_preds, y_true))
    if true == 5 and pred == 2
]

In [None]:
# Look up the raw (untokenized) test rows at the collected positions.
# NOTE: `misclassified` changes from a list of indices to a dataset here, so
# this cell cannot be re-run without re-running the previous one.
misclassified = test_dataset.select(misclassified)

In [None]:
misclassified[:]

In [None]:
# model.save_pretrained("models/bert-unigram-bengali-classifier")

In [None]:
# trainer.save_model()

In [None]:
#### LOADING BERT FOR CLASSIFICATION ####

# NOTE(review): no visible cell writes a model to this directory — the Trainer
# runs with save_strategy="no" and the save_pretrained()/save_model() calls
# are commented out, while the trained weights are written to
# `output_directory` with torch.save. Confirm the directory exists and holds
# the intended checkpoint before relying on this cell.
model = BertForSequenceClassification.from_pretrained("models/bert-unigram-bengali-classifier", num_labels=num_labels)

In [None]:
#### REPLACING BERT WITH CHARACTER_BERT ####

# Load the CharacterBERT encoder from the same directory and put it in place
# of the reloaded model's encoder.
# NOTE(review): like the cell above, this depends on a checkpoint directory
# that no visible cell creates — confirm before use.
character_bert_model = CharacterBertModel.from_pretrained("models/bert-unigram-bengali-classifier")
model.bert = character_bert_model

In [None]:
model.bert.embeddings.word_embeddings

In [None]:
# model = AutoModelForSequenceClassification.from_pretrained("models/bert-unigram-bengali-classifier")
# Move the model to the GPU when one is available; fall back to the CPU so the
# cell does not raise on machines without CUDA.
model.to("cuda" if torch.cuda.is_available() else "cpu")