In [None]:
from datasets import load_dataset

# Load the Hindi training file. Forward slashes keep the relative path valid on
# every OS and avoid the unrecognised "\d" escape sequence in the string literal.
# A lone data file is registered by the "json" loader under the split name "train".
train_dataset = load_dataset(
    "json",
    data_files="../datasets/midas-discourse/hi/train.json",
    split="train",
)

In [None]:
# Load the Hindi validation file. Forward slashes keep the relative path valid on
# every OS and avoid the unrecognised "\d" escape sequence in the string literal.
# split="train" is intentional: a lone data file is always registered by the
# "json" loader under the split name "train", whatever it contains.
val_dataset = load_dataset(
    "json",
    data_files="../datasets/midas-discourse/hi/val.json",
    split="train",
)

In [None]:
# Load the Hindi test file. Forward slashes keep the relative path valid on
# every OS and avoid the unrecognised "\d" escape sequence in the string literal.
# split="train" is intentional: a lone data file is always registered by the
# "json" loader under the split name "train", whatever it contains.
test_dataset = load_dataset(
    "json",
    data_files="../datasets/midas-discourse/hi/test.json",
    split="train",
)

In [None]:
from datasets import Dataset, DatasetDict

# Bundle the three separately loaded splits into a single DatasetDict so that
# later map / format calls are applied to all of them at once.
datasets = DatasetDict(
    {
        "train": train_dataset,
        "validation": val_dataset,
        "test": test_dataset,
    }
)

In [None]:
# Display the split names, columns and row counts of the assembled DatasetDict.
datasets

In [None]:
# Peek at the first five raw training rows.
datasets["train"][:5]

In [None]:
# Temporarily switch the output format to pandas so the label column can be
# summarised with pandas methods in the next cell.
datasets.set_format("pandas")

In [None]:
# Count how many training sentences carry each discourse-mode label
# (relies on the pandas output format set in the previous cell).
label_counts = datasets["train"]["Discourse Mode"].value_counts()
# Number of distinct labels seen in the training split; later used to size the
# classification head.
num_labels = len(label_counts.index)

In [None]:
# Show the per-label sentence counts of the training split.
label_counts

In [None]:
# Undo the pandas output format again before the data is tokenized.
datasets.reset_format()

In [None]:
from transformers import BertForSequenceClassification, BertConfig, CharacterBertModel, CharacterBertTokenizer

# Tokenizer for the CharacterBERT model; accent stripping and lower-casing are
# both passed as None.
# NOTE(review): presumably this keeps the Devanagari text unaltered — confirm
# against the CharacterBertTokenizer implementation in use.
tokenizer = CharacterBertTokenizer(strip_accents=None, do_lower_case=None)

In [None]:
def tokenize_function(example):
    """Tokenize the 'Sentence' field of an example (or a batch of examples).

    The text is passed to the module-level ``tokenizer`` with truncation
    enabled, and the tokenizer's output is returned unchanged.
    """
    sentences = example['Sentence']
    encoded = tokenizer(sentences, truncation=True)
    return encoded

In [None]:
# Re-display the DatasetDict to check the column names before tokenization.
datasets

In [None]:
from transformers import DataCollatorWithPadding

# Collator that pads the examples of each batch when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize every split in batches and drop the raw columns that are no longer
# needed; 'Discourse Mode' stays because the label-encoding step still reads it.
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=['Story_no', 'Sentence', 'id'],
)

In [None]:
# Inspect the columns and row counts after tokenization.
tokenized_datasets

In [None]:
def assign_label(example):
    """Attach the integer class id that belongs to the example's discourse mode.

    The id is written to ``example['labels']`` (the input mapping is mutated)
    and the same example is returned. A discourse mode outside the six known
    ones raises ``KeyError``.
    """
    ordered_modes = (
        "Descriptive",
        "Narrative",
        "Dialogue",
        "Argumentative",
        "Informative",
        "Other",
    )
    # The position in the tuple is the class id (Descriptive=0 ... Other=5).
    mode_to_id = {mode: idx for idx, mode in enumerate(ordered_modes)}
    example['labels'] = mode_to_id[example['Discourse Mode']]
    return example

In [None]:
# Add the integer 'labels' column, then drop the textual label column it was
# derived from.
tokenized_datasets = tokenized_datasets.map(assign_label)
tokenized_datasets = tokenized_datasets.remove_columns('Discourse Mode')
# From here on, rows are returned as PyTorch tensors.
tokenized_datasets.set_format("torch")
tokenized_datasets.column_names

In [None]:
# Sanity check: collate the first five training examples into one padded batch
# and decode every row back to text to eyeball what the model will receive.
# (A bare `samples` expression in the middle of a cell is not displayed, so it
# is not repeated here.)
samples = [tokenized_datasets["train"][i] for i in range(5)]

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

In [None]:
# Show the first five raw training rows again, for comparison with the decoded
# samples printed above.
datasets["train"][:5]

In [None]:
from transformers import set_seed

# Fix the random seed before the model is created so runs are repeatable.
set_seed(30)
# Alternative seed that was tried: set_seed(42)

In [None]:
#### LOADING BERT FOR CLASSIFICATION ####

# Take the architecture settings of bert-base-uncased and size the output layer
# with num_labels, i.e. the number of distinct discourse modes counted in the
# training split (multi-class, not binary).
config = BertConfig.from_pretrained('bert-base-uncased', num_labels=num_labels)
# Instantiated from the config alone: no pretrained weights are loaded here.
model = BertForSequenceClassification(config=config)

In [None]:
# Inspect the stock embedding layer before the encoder is replaced.
model.bert.embeddings.word_embeddings  # wordpiece embeddings

In [None]:
#### REPLACING BERT WITH CHARACTER_BERT ####

# Replace the stock BERT encoder with the pretrained Hindi CharacterBERT
# encoder; the rest of the classification model is left as created above.
# A raw string keeps the Windows backslashes literal instead of relying on
# unrecognised escape sequences (\D, \C, \H, \c), which newer Python versions
# warn about.
# NOTE(review): machine-specific absolute path — consider a configurable model
# directory so the notebook runs elsewhere.
character_bert_model = CharacterBertModel.from_pretrained(
    r"E:\Documents\Character Bert\Hate Speech\character-bert-hindi")
model.bert = character_bert_model

In [None]:
import numpy as np
import evaluate

# Accuracy metric object; compute_metrics calls its compute() method.
metric_fun = evaluate.load("accuracy")

def compute_metrics(eval_preds):
    """Compute accuracy for one evaluation pass.

    ``eval_preds`` unpacks into ``(logits, labels)``. The predicted class of
    each row is the index of its largest logit along the last axis. Returns a
    dict with the single key ``"accuracy"``.
    """
    scores, gold = eval_preds
    predicted = np.argmax(scores, axis=-1)
    result = metric_fun.compute(predictions=predicted, references=gold)
    return {"accuracy": result["accuracy"]}

In [None]:
# Disable Weights & Biases logging for this process through the WANDB_DISABLED
# environment variable.
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
from transformers import TrainingArguments

# Per-device batch size used for both training and evaluation.
batch_size = 32
# Log the training loss roughly once per epoch. The max() guard keeps the
# interval at 1 or more when the training split is smaller than one batch
# (an interval of 0 is not a valid logging step count).
logging_steps = max(1, len(tokenized_datasets["train"]) // batch_size)


training_args = TrainingArguments(
    # "none" is the documented value for switching off every reporting
    # integration; in transformers 4.x, None is treated as "all installed
    # integrations" rather than as "no reporting".
    report_to="none",
    output_dir="models/bert-unigram-hindi-classifier",
    overwrite_output_dir=True,
    # No checkpoints are written during training.
    save_strategy="no",
    # Run evaluation (and compute_metrics) at the end of every epoch.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    metric_for_best_model="accuracy",
    # Mixed-precision training.
    fp16=True,
    logging_steps=logging_steps,
)

In [None]:
from transformers import EarlyStoppingCallback

# Early-stopping callback with a patience of 1 evaluation and an improvement
# threshold of 1.0 on the monitored metric.
# NOTE(review): this callback is not passed to the Trainer constructed in this
# notebook, so it has no effect as written — confirm whether that is intended.
early_stop = EarlyStoppingCallback(
    early_stopping_patience=1,
    early_stopping_threshold=1.0,
)

In [None]:
from transformers import Trainer

# Wire the model, training arguments, data, collation and metric function into
# a Trainer; evaluation during training uses the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)

In [None]:
# Baseline: evaluate on the validation split before any fine-tuning step.
trainer.evaluate()

In [None]:
# Fine-tune the model with the settings held in training_args.
trainer.train()

In [None]:
# Evaluate the fine-tuned model on the test split.
trainer.evaluate(tokenized_datasets["test"])

In [None]:
# trainer.save_model()

In [None]:
# Run inference on the test split. The result unpacks into the raw model
# outputs (still per-class scores at this point), the gold label ids, and a
# third element that is discarded here.
y_preds, y_true, _ = trainer.predict(tokenized_datasets["test"])

In [None]:
# Turn the per-class scores into predicted class ids. The ndim guard makes this
# cell safe to re-run: once y_preds is already a 1-D vector of ids it is left
# untouched instead of being collapsed to a single number by a second argmax.
if np.ndim(y_preds) > 1:
    y_preds = np.argmax(y_preds, axis=-1)

In [None]:
from sklearn.metrics import classification_report
# Class names in label-id order: index i is the name of class id i, the same
# order as the mapping used when the labels were encoded.
target_names = [
    "Descriptive",
    "Narrative",
    "Dialogue",
    "Argumentative",
    "Informative",
    "Other",
]
# Passing the label ids explicitly keeps the report aligned with target_names
# even when some class never occurs in y_true / y_preds; without it, the
# length mismatch between the inferred classes and target_names is an error.
print(classification_report(
    y_true,
    y_preds,
    labels=list(range(len(target_names))),
    target_names=target_names,
))

In [None]:
import matplotlib.pyplot as plt
from seaborn import heatmap
from sklearn.metrics import confusion_matrix

#plot heatmap of confusion matrix
# Confusion matrix: rows are the true classes, columns the predicted classes.
# Explicit label ids keep the matrix one row/column per entry of target_names,
# so the tick labels stay aligned even if a class is absent from the test
# labels and predictions.
mat = confusion_matrix(y_true, y_preds, labels=list(range(len(target_names))))
heatmap(mat, cmap="Pastel1_r", fmt="d", xticklabels=target_names, yticklabels=target_names, annot=True)

# Label the axes so the figure can be read on its own.
plt.xlabel('Predicted label')
plt.ylabel('True label')
# Overall title of the plot.
plt.title('Confusion matrix for DA', fontsize = 12) # title with fontsize 12

In [None]:
# Positions in the test split whose true class id is 4 ("Informative") but whose
# predicted class id is 0 ("Descriptive").
misclassified = [
    idx
    for idx, (pred, gold) in enumerate(zip(y_preds, y_true))
    if gold == 4 and pred == 0
]

In [None]:
# Show the collected test-set positions of these confusions.
misclassified