In [None]:
import os
# Restrict this process to GPU 0. This is set here, ahead of the torch import
# in the following cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
import torch

# Report which accelerator is in use. torch.cuda.get_device_name() raises when
# CUDA is unavailable, while the rest of the notebook already falls back to CPU,
# so fall back to a plain label here as well instead of aborting the run.
device_name = (
    torch.cuda.get_device_name()
    if torch.cuda.is_available()
    else "cpu (CUDA not available)"
)
device_name

In [None]:
# import transformers
# transformers.__version__

In [None]:
from datasets import load_dataset

# Read the labelled CSV and take its "train" split as a single dataset.
hate_speech_dataset = load_dataset(
    "csv",
    split="train",
    data_files="../datasets/Hindi Hate Speech.csv",
)

In [None]:
# Display the loaded dataset's summary.
hate_speech_dataset

In [None]:
# Switch the output format to pandas so the next cells can use pandas methods
# (value_counts, .str) on the columns. Undone later with reset_format().
hate_speech_dataset.set_format("pandas")

In [None]:
# Count the examples per distinct label; the number of distinct labels is
# passed to the classifier as `num_labels` further down.
label_counts = hate_speech_dataset["Label"].value_counts()
num_labels = len(label_counts)

In [None]:
# Display the per-label example counts computed above.
label_counts

In [None]:
# NOTE: `.str.len()` counts characters per text, so despite the variable name
# this is the length of the longest text in characters, not in tokens.
max_token_length = max(hate_speech_dataset['Text'].str.len())
max_token_length

In [None]:
# Undo the pandas output format set earlier before the dataset is tokenized.
hate_speech_dataset.reset_format()

In [None]:
from transformers import set_seed

# Fix the random seed through transformers' helper before any model is created.
set_seed(30)
# Alternative seed that was tried:
# set_seed(42)

In [None]:
from transformers import PreTrainedTokenizerFast, AutoModelForSequenceClassification

# Load the fast tokenizer saved alongside the locally pretrained checkpoint.
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    "../Hindi Pretraining/models/unigram/bert-base-pretrained-hindi"
)

In [None]:
# Load the locally pretrained checkpoint with a sequence-classification head
# that has one output per distinct label.
model = AutoModelForSequenceClassification.from_pretrained(
    "../Hindi Pretraining/models/unigram/bert-base-pretrained-hindi",
    num_labels=num_labels,
)

In [None]:
def tokenize_function(example):
    """Tokenize the "Text" field of ``example`` with the notebook's tokenizer.

    ``example`` may hold a single text or a batch of texts (it is used with
    ``map(..., batched=True)``). Truncation is enabled; padding is not applied
    here.

    NOTE(review): no explicit ``max_length`` is given, so the truncation limit
    comes from the tokenizer's own configuration — confirm it is set.
    """
    texts = example["Text"]
    return tokenizer(texts, truncation=True)

In [None]:
from transformers import DataCollatorWithPadding

# Padding is left to the collator, which is applied per batch by the DataLoaders.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize in batches and drop the raw text; the "Label" column is kept.
tokenized_dataset = hate_speech_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["Text"],
)

In [None]:
# Display the tokenized dataset's summary.
tokenized_dataset

In [None]:
# Decode the "NOT"-labelled examples among the first (up to) 100 tokenized rows
# to eyeball how the tokenizer round-trips the text.
# This iterates `tokenized_dataset` from the cell above: the `temp` variable used
# previously is only created much later in the notebook (NameError on a fresh
# Restart & Run All), and it is built from the untokenized dataset.
preview_size = min(100, tokenized_dataset.num_rows)
for example in tokenized_dataset.select(range(preview_size)):
    if example["Label"] == 'NOT':
        print(tokenizer.decode(example['input_ids']))

In [None]:
# Print the raw text of every "NOT"-labelled example that contains "आफ्नो".
for row in hate_speech_dataset:
    text = row["Text"]
    if "आफ्नो" not in text:
        continue
    if row["Label"] == 'NOT':
        print(text)

In [None]:
def assign_label(example):
    """Replace the string class in ``example['Label']`` with its integer id.

    "HOF" becomes 0 and "NOT" becomes 1. The example is modified in place and
    the same object is returned. Any other label raises ``KeyError``.
    """
    label_ids = {"HOF": 0, "NOT": 1}
    label_name = example['Label']
    example['Label'] = label_ids[label_name]
    return example

In [None]:
# Convert the string labels to integer ids, rename the column to "labels"
# (the key the batches carry into model(**batch)), and return torch tensors.
tokenized_dataset = tokenized_dataset.map(assign_label).rename_column("Label", "labels")
tokenized_dataset.set_format("torch")
tokenized_dataset.column_names

In [None]:
# Collate the first ten examples into one batch and decode every row of it,
# to inspect what the collator produces.
samples = [tokenized_dataset[idx] for idx in range(10)]
collated = data_collator(samples)

for token_ids in collated["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(token_ids)}'")

In [None]:
# Show the first ten raw (untokenized) examples for comparison with the decoded batch.
hate_speech_dataset[:10]

In [None]:
# Split the tokenized data 80/20 into "train" and "test" with a fixed seed.
# NOTE(review): despite the name, nothing is downsampled here — this is the
# full tokenized dataset. The name is kept because later cells refer to it.
downsampled_dataset = tokenized_dataset.train_test_split(
    train_size=0.8, seed=42
)
downsampled_dataset

In [None]:
from torch.utils.data import DataLoader
batch_size = 32

# Both loaders share the batch size and the padding collator; only the
# training loader is shuffled.
loader_kwargs = dict(batch_size=batch_size, collate_fn=data_collator)
train_dataloader = DataLoader(downsampled_dataset["train"], shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(downsampled_dataset["test"], **loader_kwargs)

In [None]:
# Pull a single batch from the training loader and show each tensor's shape.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

In [None]:
# Sanity check: one forward pass on the sampled batch (run before the model is
# moved to `device`), printing the loss and the logits' shape.
outputs = model(**batch)
loss, logits = outputs.loss, outputs.logits
print(loss, logits.shape)

In [None]:
from transformers import AdamW

# Optimizer for the single-split run.
# NOTE(review): this uses the AdamW class exported by `transformers`; newer
# releases steer users to `torch.optim.AdamW`, whose defaults differ — confirm
# against the installed version before switching.
optimizer = AdamW(model.parameters(), lr=4e-5)
# Alternative learning rate that was tried:
# optimizer = AdamW(model.parameters(), lr=3e-5)

In [None]:
from transformers import get_scheduler

# Linear learning-rate schedule without warm-up, stepped once per batch over
# all epochs.
num_epochs = 6
steps_per_epoch = len(train_dataloader)
num_training_steps = num_epochs * steps_per_epoch

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

In [None]:
import torch

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

In [None]:
from tqdm.auto import tqdm

# One progress-bar tick per optimisation step across all epochs.
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the collated batch onto the training device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Apply the update, advance the schedule, then clear the gradients
        # so they do not accumulate into the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        progress_bar.update(1)

In [None]:
# !pip install evaluate

In [None]:
# !pip install -U scikit-learn scipy matplotlib

In [None]:
import evaluate

# F1 metric object; predictions are accumulated with add_batch() and scored
# with compute(average="macro") in the evaluation cells below.
metric = evaluate.load("f1")
# Example usage:
# results = metric.compute(predictions=[0, 1], references=[0, 1], average="macro")
# print(results)

In [None]:
# Evaluate on the held-out split: collect the predicted class per example and
# accumulate them in the F1 metric.
model.eval()
y_preds = []
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    # Store plain Python ints rather than 0-d tensors, so `y_preds` is an
    # ordinary list of class ids when it is handed to scikit-learn below.
    y_preds.extend(predictions.cpu().tolist())
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute(average="macro")

In [None]:
# Ground-truth labels of the held-out split, in dataset order. The evaluation
# loader was created without shuffle=True, so these line up with `y_preds`.
y_true = downsampled_dataset["test"]["labels"]

In [None]:
from sklearn.metrics import classification_report
# Order must follow assign_label's mapping: id 0 is "HOF", id 1 is "NOT".
target_names = ["HOF", "NOT"]
report = classification_report(y_true, y_preds, target_names=target_names)
print(report)

In [None]:
import matplotlib.pyplot as plt
from seaborn import heatmap
from sklearn.metrics import confusion_matrix

# Confusion matrix of true vs. predicted labels, drawn as an annotated heatmap.
mat = confusion_matrix(y_true, y_preds)
heatmap(
    mat,
    annot=True,
    fmt="d",
    cmap="Pastel1_r",
    xticklabels=target_names,
    yticklabels=target_names,
)

# Overall title for the plot (font size 12).
plt.title("Confusion matrix for Hate Speech", fontsize=12)

In [None]:
# Keep only the raw examples labelled "NOT" for manual inspection.
# The predicate returns a plain bool instead of "the row or None"; the same
# rows are selected, without relying on the truthiness of the row dict.
temp = hate_speech_dataset.filter(lambda example: example["Label"] == 'NOT')

In [None]:
# Show the first twenty "NOT"-labelled examples.
temp[:20]

In [None]:
import gc
# Run the garbage collector and release PyTorch's cached CUDA memory before
# cross-validation starts.
# NOTE(review): `model` and `optimizer` are still bound at this point, so the
# memory they hold is presumably not released here — confirm if memory is tight.
gc.collect()
torch.cuda.empty_cache()

In [None]:
import numpy as np
from tqdm.auto import tqdm
from sklearn.model_selection import StratifiedKFold

from datasets import DatasetDict

# Stratified 5-fold cross-validation: for every fold a fresh copy of the
# pretrained checkpoint is fine-tuned on the training part and scored with
# macro-F1 on the validation part. The per-fold scores end up in `scores`.

# Hyper-parameters shared by every fold. They are fold-independent, so they are
# set once here rather than inside the loop. Note they differ from the
# single-split run earlier in the notebook (batch size 32, lr 4e-5).
batch_size = 16
num_epochs = 6
learning_rate = 3e-5

scores = list()

# First make the kfold object
folds = StratifiedKFold(n_splits=5)

# Shuffle into a separate variable instead of overwriting `tokenized_dataset`:
# re-running this cell then always starts from the same data order and
# therefore produces the same folds.
cv_dataset = tokenized_dataset.shuffle(seed=30)

# Now make our splits based off of the labels.
# `np.zeros()` is enough as the feature argument because the splitter only
# needs the number of rows plus the labels to stratify on.
splits = folds.split(np.zeros(cv_dataset.num_rows), cv_dataset["labels"])

for train_idxs, val_idxs in splits:
    fold_dataset = DatasetDict({
        "train": cv_dataset.select(train_idxs),
        "validation": cv_dataset.select(val_idxs),
    })

    train_dataloader = DataLoader(
        fold_dataset["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator
    )
    eval_dataloader = DataLoader(
        fold_dataset["validation"], batch_size=batch_size, collate_fn=data_collator
    )

    # Start every fold from the same pretrained weights so folds are independent.
    model = AutoModelForSequenceClassification.from_pretrained("../Hindi Pretraining/models/unigram/bert-base-pretrained-hindi", num_labels=num_labels)
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=learning_rate)

    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )
    print(num_training_steps)

    progress_bar = tqdm(range(num_training_steps))

    # --- train on this fold ---
    model.train()
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

    # --- score on this fold's validation part ---
    model.eval()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

    # compute() returns a dict; its 'f1' entry is recorded for this fold.
    fold_result = metric.compute(average="macro")
    scores.append(fold_result['f1'])
    print(fold_result)

    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Macro-F1 of each cross-validation fold, in fold order.
scores

In [None]:
# Mean macro-F1 across the cross-validation folds.
sum(scores)  / len(scores)