In [None]:
# Install the notebook's third-party dependencies into the kernel's environment.
# NOTE(review): versions are unpinned, so results may differ between installs — consider pinning.
%pip install transformers torch pandas numpy kagglehub hf_transfer scikit-learn matplotlib rich

In [None]:
import torch
import transformers
# Report the installed library versions so a run can be traced back to its environment.
for label, module in (("Torch", torch), ("Transformers", transformers)):
    print(f"{label}: {module.__version__}")

In [3]:
import kagglehub

# Fetch the Kaggle dataset; kagglehub returns the local location of the files.
DATASET_HANDLE = "andrewmvd/cyberbullying-classification"
path = kagglehub.dataset_download(DATASET_HANDLE)

print(f"Path to dataset files: {path}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/andrewmvd/cyberbullying-classification?dataset_version_number=1...


100%|██████████| 2.82M/2.82M [00:00<00:00, 4.17MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/andrewmvd/cyberbullying-classification/versions/1





In [4]:
import torch, gc
# Run the garbage collector first so GPU tensors held only by unreachable Python objects
# are freed, then release the allocator's now-unused cached memory.
gc.collect()
torch.cuda.empty_cache()

110

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import numpy as np
import torch, gc
from torch.utils.data import TensorDataset
from torch.utils.data import random_split
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm
import os
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from transformers import logging
import matplotlib.pyplot as plt
from rich.console import Console
from rich.progress import Progress, BarColumn, TimeElapsedColumn, TimeRemainingColumn, TextColumn
from transformers import get_linear_schedule_with_warmup
from torch.amp import autocast, GradScaler
import warnings
import torch.nn.functional as F

# Hide UserWarnings raised from torch.optim.lr_scheduler and restrict transformers'
# logging to errors so the training output stays readable.
warnings.filterwarnings("ignore", category=UserWarning, module="torch.optim.lr_scheduler")
logging.set_verbosity_error()

# Shared rich console used for the training progress display.
console = Console()

# Collect unreachable Python objects first so any GPU tensors they hold are freed,
# then release the allocator's cached memory.
gc.collect()
torch.cuda.empty_cache()
print("Cleared GPU cache ✅")

# NOTE(review): TORCH_USE_CUDA_DSA and CUDA_LAUNCH_BLOCKING look like debugging switches;
# CUDA_LAUNCH_BLOCKING=1 makes kernel launches synchronous, which slows training —
# confirm they are still wanted for regular runs.
os.environ['TORCH_USE_CUDA_DSA'] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


# Locate the CSV inside the directory returned by kagglehub.dataset_download (`path`,
# set in the download cell) instead of hard-coding a machine-specific cache location.
csv_path = os.path.join(path, 'cyberbullying_tweets.csv')
df = pd.read_csv(csv_path, sep=',')


# Map each label as an integer
label_map = {
    'not_cyberbullying' : 0,
    'gender' : 1,
    'religion' : 2,
    'other_cyberbullying' : 3,
    'age' : 4,
    'ethnicity' : 5
}

df["label"] = df["cyberbullying_type"].map(label_map)

# .map() yields NaN for any category missing from label_map, which would silently turn
# the label column into floats; fail loudly instead.
unmapped = df.loc[df["label"].isna(), "cyberbullying_type"].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unmapped cyberbullying_type values: {sorted(map(str, unmapped))}")
df["label"] = df["label"].astype(int)

# Size the classifier head from the full label set so every id in label_map is a valid
# class index even if a class happens to be absent from the data.
num_labels = len(label_map)


def preprocess(tweet):
    """Tokenize `tweet` with the module-level `tokenizer`.

    The input is truncated and padded to a fixed length of 128 tokens
    (`truncation=True`, `padding='max_length'`, `max_length=128`).

    Returns:
        Whatever the tokenizer call returns for `tweet`.
    """
    encode_options = {
        "truncation": True,
        "padding": 'max_length',
        "max_length": 128,
    }
    return tokenizer(tweet, **encode_options)


tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-large-uncased")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenize the whole column in one batched call instead of one tokenizer call per row.
# Every sequence is padded/truncated to the same length, so the result converts
# directly into rectangular tensors.
encoded_tweets = preprocess(df['tweet_text'].tolist())

# Convert to tensors
inputs = torch.tensor(encoded_tweets["input_ids"])
masks = torch.tensor(encoded_tweets["attention_mask"])
labels = torch.tensor(df["label"].tolist())

dataset = TensorDataset(inputs, masks, labels)

# 80/20 train/test split.
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

# Seed the split so the same tweets land in train/test on every run; with an unseeded
# split the reported metrics are not comparable between runs.
SPLIT_SEED = 42
train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128)

Cleared GPU cache ✅


In [None]:
# Define Focal Loss method
def focal_loss(inputs, targets, gamma=2.0):
    """Multi-class focal loss: the batch mean of ``(1 - p_t) ** gamma * CE``.

    Args:
        inputs: class logits, in the form accepted by ``F.cross_entropy``.
        targets: target classes, in the form accepted by ``F.cross_entropy``.
        gamma: focusing exponent. Larger values down-weight well-classified
            examples more strongly; ``gamma=0`` reduces to plain cross-entropy.

    Returns:
        Scalar tensor holding the mean focal loss over the batch.
    """
    ce_loss = F.cross_entropy(inputs, targets, reduction='none')
    # exp(-CE) recovers the probability the model assigned to the true class.
    pt = torch.exp(-ce_loss)
    # The modulating factor multiplies the cross-entropy. Using `**` here instead of `*`
    # would be parsed right-associatively as (1 - pt) ** (gamma ** ce_loss).
    loss = (1 - pt) ** gamma * ce_loss

    return loss.mean()



# Train model
def train_model(num_epochs, scheduler):
    """Fine-tune the module-level `model` on `train_loader` using focal loss.

    Relies on the module-level `model`, `optimizer`, `train_loader`, `device`
    and `console`.

    Args:
        num_epochs: number of passes over `train_loader`.
        scheduler: learning-rate scheduler; stepped once per batch.

    Returns:
        The module-level `model` after training.
    """
    # Gradient scaling is only enabled on CUDA; elsewhere the scaler passes values through.
    scaler = GradScaler(enabled=(device.type == "cuda"))
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        console.rule(f"[bold green]Epoch {epoch+1}/{num_epochs}", align='center')
        with Progress(
            TextColumn("[bold dodger_blue1]{task.description}"),
            BarColumn(),
            TextColumn("[bold dodger_blue1]{task.percentage:>3.1f}%[/bold dodger_blue1]"),
            "•",
            TimeElapsedColumn(),
            "•",
            TimeRemainingColumn(),
            console=console,
            transient=True,
        ) as progress:
            task = progress.add_task("Training", total=len(train_loader))

            for batch in train_loader:
                input_ids, attention_mask, labels = (x.to(device) for x in batch)
                optimizer.zero_grad()
                # Autocast on whichever device was actually selected rather than assuming CUDA.
                with autocast(device_type=device.type, dtype=torch.bfloat16):
                    # Labels are deliberately not passed to the model: the training loss
                    # is computed from the logits by focal_loss.
                    outputs = model(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                    )
                    loss = focal_loss(outputs.logits, labels)
                scaler.scale(loss).backward()
                # Unscale before clipping so the norm threshold applies to the real gradients.
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                scaler.step(optimizer)
                scaler.update()
                scheduler.step()

                # Read the scalar once and reuse it for both the running total and the display.
                batch_loss = loss.item()
                total_loss += batch_loss

                progress.advance(task)
                progress.update(task, description=f"Training (loss={batch_loss:.4f})")
        avg_loss = total_loss / len(train_loader)
        console.print(f'[orchid1]Epoch {epoch+1} completed | Average loss: [bold bright_yellow]{avg_loss:.4f}[/bold bright_yellow]\n')
    return model


# Evaluate model
def eval_model(model):
    """Score `model` on the module-level `test_loader`.

    Plots a confusion matrix over class ids ``0..num_labels-1``, prints accuracy,
    macro F1 and a per-class report.

    Args:
        model: sequence-classification model whose output exposes `.logits`.

    Returns:
        Tuple ``(accuracy, macro_f1)``.
    """
    model.eval()
    y_true = []
    y_pred = []

    # Inference only: no gradients are needed.
    with torch.no_grad():
        for input_ids, attention_mask, labels in test_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            # Predicted class is the index of the largest logit in each row.
            batch_preds = logits.argmax(dim=1)

            y_true.extend(labels.cpu().numpy())
            y_pred.extend(batch_preds.cpu().numpy())

    # Confusion Matrix
    class_ids = list(range(num_labels))
    matrix = confusion_matrix(y_true, y_pred, labels=class_ids)
    ConfusionMatrixDisplay(confusion_matrix=matrix).plot(cmap='Blues', xticks_rotation=45)
    plt.show()

    # Metrics
    accuracy = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average="macro")

    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score (macro): {macro_f1:.4f}")
    print("\nDetailed report:\n", classification_report(y_true, y_pred))

    return accuracy, macro_f1

In [None]:
if __name__ == "__main__":
    # Training hyperparameters.
    epochs = 10
    best_lr = 3e-5
    num_training_steps = epochs * len(train_loader)
    # Warm up over the first 10% of all optimizer steps.
    warmup_steps = int(0.1 * num_training_steps)

    model = AutoModelForSequenceClassification.from_pretrained(
        "google-bert/bert-large-uncased",
        num_labels=num_labels,
    ).to(device)

    # Freeze the embeddings and the first 8 encoder layers; the remaining parameters
    # stay trainable.
    frozen_modules = [model.bert.embeddings, *model.bert.encoder.layer[:8]]
    for module in frozen_modules:
        for weight in module.parameters():
            weight.requires_grad = False

    optimizer = AdamW(model.parameters(), lr=best_lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=num_training_steps,
    )

    trained_model = train_model(epochs, scheduler)

    acc, f1 = eval_model(trained_model)

    print(f"\nAccuracy: {acc}")
    print(f"\nF1 Score: {f1}")

In [None]:
import os

# Save the trained model and the tokenizer into the same directory, then list its
# contents as the cell output.
output_dir = 'bert_cyberbullying_model'
for artifact in (trained_model, tokenizer):
    artifact.save_pretrained(output_dir)

os.listdir(output_dir)