In [None]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

In [None]:
# Read the labelled Hinglish data and discard every row whose `text` is missing.
df = pd.read_csv("./Data/Hing_labelled_data.csv.csv")
df = df.dropna(subset=["text"])
df

In [None]:
from datasets import load_dataset

# Load hate_speech18, keep only rows with label 0 or 1, and drop the columns
# that are not needed before converting the train split to a DataFrame.
ds_english = load_dataset("hate_speech18")
ds_english = ds_english.filter(lambda row: row["label"] < 2)
ds_english = ds_english.remove_columns(["user_id", "subforum_id", "num_contexts"])

df_english = pd.DataFrame(ds_english["train"])
df_english

In [None]:
# Stack the Hinglish rows and the English rows into a single frame. Each frame
# keeps its own row index (no ignore_index), so index values may repeat.
# NOTE(review): `df` is reassigned in place, so re-running this cell appends
# df_english a second time.
df = pd.concat([df, df_english])
df

In [None]:
# Persist the combined frame to CSV; index=False leaves the row index out of the file.
df.to_csv("./Data/Eng+Hing_labelled_data.csv", index=False)

In [None]:
from datasets import Dataset

# Wrap the combined pandas frame in a `datasets.Dataset`; the shuffle, map and
# filter calls in the following cells operate on this object.
ds = Dataset.from_pandas(df)
ds

In [None]:
# 90/10 train/eval split taken from a single seeded shuffle.
split_at = int(len(df) * 0.9)
shuffled_ds = ds.shuffle(seed=42)
training_ds = shuffled_ds.select(range(split_at))
eval_ds = shuffled_ds.select(range(split_at, len(df)))

# Number of training rows carrying label 0 and label 1.
lbl0_count = training_ds.filter(lambda row: row["label"] == 0).num_rows
lbl1_count = training_ds.filter(lambda row: row["label"] == 1).num_rows
lbl0_count, lbl1_count

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Tokenizer and two-label sequence classifier, both from the same pretrained checkpoint.
MODEL_CHECKPOINT = "google/muril-large-cased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=2)
model = model.to(device)

model

In [None]:
# Freeze the encoder: every parameter under model.bert stops receiving gradients.
for frozen_param in model.bert.parameters():
    frozen_param.requires_grad_(False)

In [None]:
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Padding is intentionally NOT applied here. This function is used with
    ``Dataset.map(..., batched=True)``; padding at this stage would pad every
    row to the longest row of its map batch, so a later ``len(input_ids)``
    check would measure the padded length rather than the real one (a single
    over-long text would make every row in its batch look over-long).
    Per-batch padding is left to the ``DataCollatorWithPadding`` that the
    DataLoaders use.

    Args:
        examples: A batch mapping that contains a ``"text"`` entry.

    Returns:
        The tokenizer output for the batch (unpadded; truncation enabled).
    """
    return tokenizer(examples["text"], truncation=True)

# Tokenize both splits in batched mode, then build the collator that pads each batch.
tokenized_training_ds, tokenized_eval_ds = (
    split.map(tokenize_function, batched=True) for split in (training_ds, eval_ds)
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Drop every example whose `input_ids` holds more than 512 entries.

def _within_length_limit(example):
    """Return True when the example has at most 512 input ids."""
    return len(example["input_ids"]) <= 512

tokenized_training_ds = tokenized_training_ds.filter(_within_length_limit)
tokenized_eval_ds = tokenized_eval_ds.filter(_within_length_limit)

print(len(tokenized_training_ds), len(tokenized_eval_ds))
tokenized_training_ds

In [None]:
# Strip the columns the model does not consume and expose the data as torch tensors.

def _to_model_inputs(tokenized_split):
    """Drop ``text``/``__index_level_0__``, rename ``label`` to ``labels``, set torch format."""
    prepared = tokenized_split.remove_columns(["text", "__index_level_0__"])
    prepared = prepared.rename_column("label", "labels")
    prepared.set_format("torch")
    return prepared

tokenized_training_ds = _to_model_inputs(tokenized_training_ds)
tokenized_eval_ds = _to_model_inputs(tokenized_eval_ds)

print(tokenized_training_ds.column_names, tokenized_eval_ds.column_names)

In [None]:
from torch.utils.data import DataLoader

# Batches of 32; only the training loader shuffles. Padding happens in the collator.
train_dataloader = DataLoader(
    tokenized_training_ds,
    batch_size=32,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_eval_ds,
    batch_size=32,
    collate_fn=data_collator,
)

# Smoke test: pull one training batch and run it through the model.
batch = next(iter(train_dataloader))
batch_on_device = {name: tensor.to(device) for name, tensor in batch.items()}

outputs = model(**batch_on_device)
print(outputs.loss, outputs.logits.shape)
{name: tensor.shape for name, tensor in batch.items()}

In [None]:
from transformers import get_scheduler

# Training hyperparameters.
num_epochs = 10
lr = 4e-4
num_training_steps = num_epochs * len(train_dataloader)

# `transformers.AdamW` was deprecated and has been removed from recent
# transformers releases, so importing it fails there. Use the PyTorch
# implementation instead. eps and weight_decay are pinned to the values the
# transformers optimizer used by default (1e-6 and 0.0) so that the update rule
# stays the same as before; torch's own defaults are 1e-8 and 1e-2.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0)
print(num_training_steps)

In [None]:
import torch.nn as nn
import torch.nn.functional as F

# Class weights inversely proportional to the class counts of the training split
# (1000 / count per class), used by the weighted cross-entropy in compute_loss.
train_labels = torch.tensor(training_ds['label'])
n_0 = int((train_labels == 0).sum())
n_1 = int((train_labels == 1).sum())
weights = torch.tensor([1000.0 / class_count for class_count in (n_0, n_1)])
print(weights)
def compute_loss(model, inputs, return_outputs=False, reduction='mean'):
    """Run ``model`` on ``inputs`` and score its logits with class-weighted cross-entropy.

    Args:
        model: Callable invoked as ``model(**inputs)``; its result must expose ``logits``.
        inputs: Mapping of model inputs that also carries the ``"labels"`` tensor.
        return_outputs: When True, return ``(loss, outputs)`` instead of just the loss.
        reduction: Passed straight to ``nn.CrossEntropyLoss`` (e.g. ``'mean'`` or ``'none'``).

    Returns:
        The loss, or ``(loss, outputs)`` when ``return_outputs`` is True.
    """
    outputs = model(**inputs)

    # The module-level `weights` tensor is moved to the active device as float32.
    criterion = nn.CrossEntropyLoss(
        weight=weights.to(device=device, dtype=torch.float),
        reduction=reduction,
    )
    flat_logits = outputs.get('logits').view(-1, 2)
    flat_labels = inputs.get("labels").view(-1)
    loss = criterion(flat_logits, flat_labels)

    if return_outputs:
        return loss, outputs
    return loss

In [None]:
# Sanity check on the sample `batch`: return_outputs=True and reduction='none',
# so the cell shows the (unreduced loss, model outputs) pair.
compute_loss(model, {k:v.to(device) for k,v in batch.items()}, True, 'none')

In [None]:
# Scratch check of nn.CrossEntropyLoss with class weights [100.0, 1.0]: a single
# sample with logits [10000.0, 1.0] scored against the float target [0.0, 1.0].
# NOTE(review): this module-level `loss_fct` is separate from the criterion that
# compute_loss builds internally.
loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([100.0, 1.0], dtype=torch.float))
loss_fct(torch.tensor([[10000.0, 1.0]]), torch.tensor([[0.0, 1.0]]))

In [None]:
# Checkpoint location. Nothing is saved in this cell; `path` is only defined
# here and is used by the torch.load / torch.save calls in later cells.

model_name = "hing+eng_classifier.pt"
path = f"./models/{model_name}"

In [None]:
# Resume from an earlier checkpoint when one exists; on a fresh run (no file
# yet) keep the freshly initialised model instead of crashing.
import os

if os.path.exists(path):
    # SECURITY: torch.load unpickles arbitrary Python objects. Only load
    # checkpoints you produced yourself. weights_only=False is required because
    # the checkpoint is a fully pickled model object, not a bare state dict.
    saved_model = torch.load(path, map_location=device, weights_only=False)
    # Copy the weights into the existing `model` object rather than rebinding
    # the name: the optimizer was built from this object's parameters, so a
    # rebound model would never be updated by optimizer.step().
    model.load_state_dict(saved_model.state_dict())
    del saved_model
else:
    print(f"No checkpoint found at {path}; continuing with the freshly initialised model.")

In [None]:
import torch
from torch import nn

# Pick the GPU when available and move the model there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

import evaluate
import wandb
from tqdm.auto import tqdm

run = wandb.init(
    # Set the project where this run will be logged
    project="my-awesome-project",
    # Track hyperparameters and run metadata. These were previously left out
    # (empty config), so runs could not be told apart in the dashboard.
    config={
        "num_epochs": num_epochs,
        "lr": lr,
        "batch_size": train_dataloader.batch_size,
        "num_training_steps": num_training_steps,
    },
)

def eval_func(get_worst_examples=False):
    """Evaluate ``model`` on ``eval_dataloader``, print the metrics and log them to wandb.

    The reported values are the mean of the per-batch losses, the accuracy
    metric, and how many examples were predicted as class 0 and as class 1.

    The model is switched to eval mode for the pass and is put back into the
    mode it was in on entry, so calling this from inside the training loop no
    longer leaves the model in eval mode for the remaining training steps.

    Args:
        get_worst_examples: When True, the loss is computed per example
            (``reduction='none'``) and example lists are returned.

    Returns:
        None when ``get_worst_examples`` is False. Otherwise a pair
        ``(worst_five, best_five)`` of ``(loss, decoded_text, label, prediction)``
        tuples, each ordered by descending loss: ``worst_five`` holds the five
        highest-loss examples overall; ``best_five`` holds the five lowest-loss
        examples among those whose prediction differs from the label.
    """
    metric = evaluate.load("accuracy")
    was_training = model.training
    model.eval()
    losses = []
    pred_logit0_count = 0
    pred_logit1_count = 0
    loss_by_examples = []
    try:
        for batch in eval_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}

            with torch.no_grad():
                loss, outputs = compute_loss(
                    model,
                    batch,
                    return_outputs=True,
                    reduction='none' if get_worst_examples else 'mean',
                )

            predictions = torch.argmax(outputs.logits, dim=-1).tolist()

            if get_worst_examples:
                example_losses = loss.tolist()
                for inp, lbl, l, pred in zip(batch['input_ids'], batch['labels'], example_losses, predictions):
                    # Store the token ids on the CPU so the collected examples
                    # do not pin device memory for the whole evaluation.
                    loss_by_examples.append((l, inp.cpu(), lbl.item(), pred))
                losses.append(loss.mean().item())
            else:
                losses.append(loss.item())

            zero_predictions = predictions.count(0)
            pred_logit0_count += zero_predictions
            pred_logit1_count += len(predictions) - zero_predictions
            # Plain Python lists keep the metric independent of the tensor device.
            metric.add_batch(predictions=predictions, references=batch["labels"].tolist())
    finally:
        # Restore the caller's mode (train mode when called from the training loop).
        model.train(was_training)

    metric_compute = metric.compute()
    avg = sum(losses)/len(losses)

    metrics = {
        'average_eval_loss': avg,
        'eval_accuracy': metric_compute,
        'pred_logit0_count': pred_logit0_count,
        'pred_logit1_count': pred_logit1_count
    }
    print(metrics)
    wandb.log(metrics)

    if get_worst_examples:
        loss_by_examples = sorted(loss_by_examples, key=lambda x: -x[0])

        worst_five = loss_by_examples[:5]
        # NOTE(review): this keeps only examples where label != prediction, i.e.
        # the lowest-loss *misclassified* examples — confirm that is the intent.
        best_five = [x for x in loss_by_examples if x[2] != x[3]][-5:]
        worst_five = [(l, tokenizer.decode(inp, skip_special_tokens=True), lbl, pred) for l, inp, lbl, pred in worst_five]
        best_five = [(l, tokenizer.decode(inp, skip_special_tokens=True), lbl, pred) for l, inp, lbl, pred in best_five]
        return worst_five, best_five

import os

progress_bar = tqdm(range(num_training_steps))
i = 0  # global step counter across all epochs

# torch.save does not create missing directories; make sure the checkpoint
# folder exists before the first save at step 0.
checkpoint_dir = os.path.dirname(path)
if checkpoint_dir:
    os.makedirs(checkpoint_dir, exist_ok=True)

model.train()
model.zero_grad()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}

        loss, outputs = compute_loss(model, batch, return_outputs=True)
        if i % 10 == 0:
            wandb.log({
                # Log a plain float rather than a tensor that is still attached
                # to the autograd graph.
                'trainloss': loss.item(),
                'step': i,
                'epoch': epoch,
                'lr': lr, #.get_last_lr()
            })
        if i % 100 == 0:
            # Periodic evaluation + checkpoint (also runs at step 0).
            eval_func()
            # eval_func puts the model in eval mode; switch back explicitly so
            # the following optimisation steps run in train mode.
            model.train()
            torch.save(model, path)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        i += 1

In [None]:
# Final evaluation with get_worst_examples=True: the cell output is the
# (worst_five, best_five) pair returned by eval_func.
eval_func(True)

In [None]:
# Save the whole model object (not just a state dict) to the checkpoint path.
torch.save(model, path)

**LOAD SAVED MODEL**

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch 

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model_name = "hing+eng_classifier.pt"
path = f"./models/{model_name}"
tokenizer = AutoTokenizer.from_pretrained("google/muril-large-cased")

# The checkpoint is a fully pickled model object, so it must be loaded with
# weights_only=False: since PyTorch 2.6 torch.load defaults to weights_only=True
# and refuses such files.
# SECURITY: this unpickles arbitrary Python objects — only load checkpoints you
# created yourself.
model = torch.load(path, map_location=device, weights_only=False)
model

In [None]:
# Re-select the device (GPU when CUDA is available, else CPU); this repeats the
# assignment already made in the loading cell above.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Sample input text that is passed to predict_input at the end of this cell.
inp = "I hate you!"

def predict_input(inp):
    """Classify one text with the loaded model and return a human-readable verdict.

    The text is tokenized (truncated to 512 tokens, the same limit the
    training data was filtered to), run through the model without gradient
    tracking, and the arg-max of the logits is mapped to a message. Several
    intermediate values are printed for inspection.

    Args:
        inp: The text to classify.

    Returns:
        ``"Not Hate Speech!"`` when the predicted class is 0, otherwise
        ``"Hate Speech!"``.
    """
    model.eval()
    tokenized_inp = tokenizer(
        inp, return_tensors='pt', truncation=True, max_length=512
    ).to(device)
    print(tokenized_inp)
    print(tokenizer.decode(tokenized_inp['input_ids'][0]))
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        output = model(**tokenized_inp, output_hidden_states=True)
    print(output.hidden_states[-1].mean())
    logits = output.logits
    print(logits)
    predicted_label = logits.argmax().item()
    if predicted_label == 0:
        return "Not Hate Speech!"
    return "Hate Speech!"

# Classify the sample text; the returned verdict string is the cell output.
predict_input(inp)