# Setup and Initialization

In [2]:
# %pip install transformers datasets torch
# %pip install hf_xet
# %pip install transformers[torch]
# %pip install accelerate
# %pip install --upgrade transformers accelerate
# %pip install --upgrade torch datasets
# %pip install scikit-learn

In [3]:
import os

import numpy as np
import torch
from datasets import ClassLabel, load_dataset
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

def compute_metrics(eval_pred):
    """Compute classification metrics for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class for each
            example is the argmax of ``logits`` over its last axis.
            (assumes logits are laid out as (n_examples, n_classes) — TODO confirm)

    Returns:
        dict: ``accuracy``, ``precision``, ``recall`` and ``f1``. The last
        three use ``average="binary"``, i.e. they are reported for the
        positive class only.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    # When the model never predicts the positive class, precision/F1 are
    # undefined. zero_division=0 reports them as 0.0 (the same value as the
    # default) without emitting an UndefinedMetricWarning on every evaluation.
    binary_kwargs = {"average": "binary", "zero_division": 0}
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds, **binary_kwargs),
        "recall": recall_score(labels, preds, **binary_kwargs),
        "f1": f1_score(labels, preds, **binary_kwargs),
    }

  from .autonotebook import tqdm as notebook_tqdm


# Load Dataset

In [24]:
# Datasets that can be selected; the position in this list is the number the
# user types at the prompt below.
suffixes = ["synth", "twitter", "twitch"]

# CSV file backing each dataset, resolved against the current working directory.
# NOTE(review): the two paths differ in directory case ("data" vs "Data");
# confirm both exist on a case-sensitive filesystem.
dataset_paths = {
    "synth": os.path.abspath("../data/synthesized_twitch_chats.csv"),
    "twitter": os.path.abspath("../Data/hate_speech_and_offensive_language_dataset/processed_labeled_data.csv"),
    # "twitch": dataset_path_twitch,
}

# Show the available choices so the prompt is self-explanatory.
choices = ", ".join(f"{index}={name}" for index, name in enumerate(suffixes))
data = int(input(f"Select dataset ({choices}): "))

# Reject out-of-range values explicitly; a negative number would otherwise
# silently wrap around to the end of the list.
if not 0 <= data < len(suffixes):
    raise IndexError(f"Dataset index must be between 0 and {len(suffixes) - 1}, got {data}")
suffix = suffixes[data]

# Fail here, with a clear message, rather than later when the CSV is loaded.
if suffix not in dataset_paths:
    raise KeyError(f"No CSV path is configured for dataset '{suffix}'")

In [None]:
# Read the selected CSV, turn the "labels" column into a two-class ClassLabel
# feature, and split the resulting "train" split 80/20 into train/test.
# The split is stratified on the labels and uses a fixed seed.
csv_path = dataset_paths[suffix]
label_feature = ClassLabel(names=["not_offensive", "offensive"])
dataset = (
    load_dataset("csv", data_files=csv_path)
    .cast_column("labels", label_feature)["train"]
    .train_test_split(test_size=0.2, seed=42, stratify_by_column="labels")
)

# Tokenizer of the base checkpoint that is fine-tuned further down.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize(batch):
    """Tokenize the ``message`` entries of ``batch`` with the notebook's tokenizer.

    Every sequence is padded to, and truncated at, 128 tokens.
    """
    encode_options = dict(padding="max_length", truncation=True, max_length=128)
    return tokenizer(batch["message"], **encode_options)

# Tokenize both splits in batches, then expose only the listed columns as
# torch tensors.
tensor_columns = ["input_ids", "attention_mask", "labels"]
dataset = dataset.map(function=tokenize, batched=True)
dataset.set_format("torch", columns=tensor_columns)

Generating train split: 6243 examples [00:00, 174095.71 examples/s]
Casting the dataset: 100%|██████████| 6243/6243 [00:00<00:00, 286510.34 examples/s]
Map: 100%|██████████| 4994/4994 [00:01<00:00, 3559.86 examples/s]
Map: 100%|██████████| 1249/1249 [00:00<00:00, 3979.78 examples/s]


In [26]:
# Show the train/test splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['message', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4994
    })
    test: Dataset({
        features: ['message', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1249
    })
})

# Train BERT Model

In [None]:
# Where the fine-tuned weights and tokenizer for the selected dataset are kept.
model_dir = f"./saved_model/model_{suffix}"
tokenizer_dir = f"./saved_model/tokenizer_{suffix}"

# Reuse a previous fine-tuning run only when both artifacts are on disk.
has_saved_model = os.path.exists(model_dir) and os.path.exists(tokenizer_dir)

if has_saved_model:
    model = BertForSequenceClassification.from_pretrained(model_dir)
    tokenizer = BertTokenizer.from_pretrained(tokenizer_dir)
else:
    # Start from the base checkpoint with a fresh two-class classification head.
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # `eval_strategy` is the current name of the former `evaluation_strategy`.
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=2,
    weight_decay=0.01,
)

# The Trainer is created in both cases so that the evaluation cell below can
# call `trainer.evaluate()` even when the model was loaded from disk.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune and persist the result only when no saved model was found.
if not has_saved_model:
    trainer.train()
    trainer.save_model(model_dir)
    tokenizer.save_pretrained(tokenizer_dir)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


KeyboardInterrupt: 

# Model Evaluation

In [17]:
# Evaluate on the Trainer's eval_dataset (the "test" split); the returned dict
# holds the loss plus the compute_metrics values under an "eval_" prefix.
trainer.evaluate()



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


{'eval_loss': 0.7162364721298218,
 'eval_accuracy': 0.5070422535211268,
 'eval_precision': 0.0,
 'eval_recall': 0.0,
 'eval_f1': 0.0,
 'eval_runtime': 0.813,
 'eval_samples_per_second': 87.328,
 'eval_steps_per_second': 11.07,
 'epoch': 2.0}

In [18]:
# Classify one hand-written message with the fine-tuned model.
text = "motherfucker"

# Apply the same 128-token truncation limit used during training, and move the
# tensors to the model's device: the Trainer may have placed the model on an
# accelerator, in which case CPU inputs would raise a device-mismatch error.
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128).to(model.device)

model.eval()  # make sure dropout is disabled for inference
with torch.no_grad():  # gradients are not needed for a prediction
    outputs = model(**inputs)

# Class index 1 is the positive class ("Offensive").
pred = outputs.logits.argmax(dim=1).item()
print("Offensive" if pred == 1 else "Not Offensive")

Not Offensive
