In [None]:
# Install the third-party packages used by this notebook (-q keeps pip's output quiet).
!pip install -q transformers datasets accelerate bitsandbytes scikit-learn


In [None]:
# Mount Google Drive at /content/drive; later cells read and write files under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
import pandas as pd

# The six fine-grained annotation columns of the Jigsaw Toxic Comment data.
label_cols = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Read the dataset CSV from the mounted Drive.
df = pd.read_csv("/content/drive/MyDrive/...")

# Missing annotations become 0, then every label column is stored as an integer.
df[label_cols] = (
    df[label_cols]
    .fillna(0)
    .astype(int)
)


In [None]:
# Collapse the six fine-grained columns into one flag: 1 when at least one of them is 1.
df["toxic_binary"] = df[label_cols].max(axis="columns")
print("Binary toxic label counts:")
print(df["toxic_binary"].value_counts())

# Frame for the binary model: every comment with its binary flag.
df_binary = df.loc[:, ["comment_text", "toxic_binary"]]
# Frame for the fine-grained model: only flagged comments, with their six labels.
df_fine = df.loc[df["toxic_binary"] == 1, ["comment_text"] + label_cols]


In [None]:
from datasets import Dataset

# Wrap both pandas frames as Hugging Face datasets.
dataset_binary = Dataset.from_pandas(df_binary)
dataset_fine = Dataset.from_pandas(df_fine)

# Hold out 10% of each dataset for evaluation, using a fixed seed.
binary_data = dataset_binary.train_test_split(test_size=0.1, seed=42)
train_binary, eval_binary = binary_data["train"], binary_data["test"]

fine_data = dataset_fine.train_test_split(test_size=0.1, seed=42)
train_fine, eval_fine = fine_data["train"], fine_data["test"]


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint used as the starting point for the binary classifier.
model_name = "answerdotai/ModernBERT-base"

tokenizer_binary = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)
# Sequence-classification head with two output classes.
model_binary = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    trust_remote_code=True,
)


In [None]:
def tokenize_binary(example):
    """Tokenize one row for the binary classifier and attach its integer label.

    Args:
        example: A single dataset row providing "comment_text" and "toxic_binary".

    Returns:
        The output of ``tokenizer_binary`` for the comment (padded/truncated to
        256 tokens) with an added "labels" entry holding ``int(toxic_binary)``.
    """
    encoded = tokenizer_binary(
        example["comment_text"],
        padding="max_length",
        truncation=True,
        max_length=256,
    )
    encoded["labels"] = int(example["toxic_binary"])
    return encoded

# Tokenize both splits row by row with tokenize_binary.
train_binary, eval_binary = (
    train_binary.map(tokenize_binary),
    eval_binary.map(tokenize_binary),
)

# Expose only the model inputs and the labels, as torch tensors.
train_binary.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)
eval_binary.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)


In [None]:
from transformers import TrainingArguments

# Training configuration for the binary classifier.
# With a per-device batch of 4 and 2 accumulation steps, each optimizer step sees 8 examples per device.
binary_args = TrainingArguments(
    output_dir="./binary_results",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    # `evaluation_strategy` is the deprecated spelling of this option; `eval_strategy`
    # is the supported name in the transformers releases that include ModernBERT.
    eval_strategy="epoch",
    save_strategy="epoch",
    fp16=True,
    logging_steps=50,
    save_total_limit=2,
)

from transformers import Trainer
from sklearn.metrics import accuracy_score, f1_score

def compute_binary_metrics(eval_pred):
    """Score binary predictions for the Trainer's evaluation loop.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of ``logits`` along axis 1.

    Returns:
        dict with "accuracy" and "f1", both computed from the labels and the
        predicted classes.
    """
    scores, references = eval_pred
    predicted = scores.argmax(axis=1)
    metrics = {}
    metrics["accuracy"] = accuracy_score(references, predicted)
    metrics["f1"] = f1_score(references, predicted)
    return metrics

# Trainer for the binary classifier; compute_binary_metrics is applied at every evaluation.
trainer_binary = Trainer(
    model=model_binary,
    args=binary_args,
    train_dataset=train_binary,
    eval_dataset=eval_binary,
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its replacement
    # in the transformers releases that include ModernBERT.
    processing_class=tokenizer_binary,
    compute_metrics=compute_binary_metrics,
)

# Fine-tune the model with the configuration above.
trainer_binary.train()

from sklearn.metrics import classification_report
import numpy as np

# Score the held-out split with the trained binary model.
binary_preds = trainer_binary.predict(eval_binary)

# Predicted class = index of the largest logit in each row.
logits = binary_preds.predictions
preds = np.argmax(logits, axis=1)

# Reference labels returned alongside the predictions.
true = binary_preds.label_ids

# Per-class report, printed with four decimals.
print(
    classification_report(
        true,
        preds,
        target_names=["non-toxic", "toxic"],
        digits=4,
    )
)



In [None]:
# Persist the fine-tuned binary classifier and its tokenizer to Drive.
save_path = "/content/drive/MyDrive/..."
model_binary.save_pretrained(save_path)
tokenizer_binary.save_pretrained(save_path)
from sklearn.metrics import classification_report
import numpy as np

# Rebuild the evaluation report from the predictions already held in `binary_preds`.
logits = binary_preds.predictions
preds = np.argmax(logits, axis=1)
true = binary_preds.label_ids

report = classification_report(true, preds, target_names=["non-toxic", "toxic"], digits=4)

# Save to Drive
# `report_path` was previously used below without ever being defined (NameError);
# define it once and use it for both the write and the download.
report_path = "/content/drive/MyDrive/..."
with open(report_path, "w") as f:
    f.write(report)

print("✅ Report saved to Drive.")
from google.colab import files
files.download(report_path)

# Keep a CSV copy of the fine-grained subset on Drive.
df_fine.to_csv("/content/drive/MyDrive/...", index=False)


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Reload a sequence-classification model and its tokenizer from Drive.
# NOTE(review): `binary_model` / `binary_tokenizer` are not referenced again in the visible
# notebook; the inference cells load a binary model as `model_binary` / `tokenizer_binary`
# instead — confirm whether this cell is still needed.
binary_model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/...")
binary_tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/...")


In [None]:
# prompt: Reload df_fine
# Reads the fine-grained subset back from a CSV on Drive.
# NOTE(review): presumably the file written by `df_fine.to_csv(...)` in an earlier cell —
# the paths are elided here, so confirm they match.

import pandas as pd
df_fine = pd.read_csv("/content/drive/MyDrive/...")


In [None]:
from datasets import Dataset

# Rebuild the fine-grained dataset from the reloaded frame and hold out 10% with a fixed seed.
fine_dataset = Dataset.from_pandas(df_fine)
fine_split = fine_dataset.train_test_split(
    test_size=0.1,
    seed=42,
)
train_fine, eval_fine = fine_split["train"], fine_split["test"]


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint used as the starting point for the fine-grained classifier.
model_name = "answerdotai/ModernBERT-base"

tokenizer_fine = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)
# Six outputs, one per fine-grained label.
model_fine = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=6,
    trust_remote_code=True,
)
def tokenize_and_format_fine(example):
    """Tokenize one row for the fine-grained classifier and attach its label vector.

    Args:
        example: A single dataset row with "comment_text" and the six label fields.

    Returns:
        The ``tokenizer_fine`` output (padded/truncated to 256 tokens) plus a
        "labels" list of six floats ordered toxic, severe_toxic, obscene,
        threat, insult, identity_hate.
    """
    label_order = ("toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate")
    encoded = tokenizer_fine(
        example["comment_text"],
        padding="max_length",
        truncation=True,
        max_length=256,
    )
    encoded["labels"] = [float(example[name]) for name in label_order]
    return encoded

# Tokenize both fine-grained splits row by row with tokenize_and_format_fine.
train_fine, eval_fine = (
    train_fine.map(tokenize_and_format_fine),
    eval_fine.map(tokenize_and_format_fine),
)

# Expose only the model inputs and the labels, as torch tensors.
train_fine.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)
eval_fine.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)


In [None]:
from transformers import TrainingArguments, Trainer
import torch.nn as nn
import torch
from sklearn.metrics import classification_report
import numpy as np

class FineTrainer(Trainer):
    """Trainer variant whose loss is binary cross-entropy computed on the raw logits."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the BCE-with-logits loss for one batch.

        Args:
            model: The model being trained; called with the remaining inputs.
            inputs: Batch dict; its "labels" entry is removed before the forward pass.
            return_outputs: When true, return ``(loss, outputs)`` instead of the loss alone.
            **kwargs: Accepted and ignored.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        targets = inputs.pop("labels")
        outputs = model(**inputs)
        criterion = nn.BCEWithLogitsLoss()
        loss = criterion(outputs.logits, targets)
        if return_outputs:
            return (loss, outputs)
        return loss
# Training configuration for the fine-grained classifier.
# With a per-device batch of 4 and 2 accumulation steps, each optimizer step sees 8 examples per device.
fine_args = TrainingArguments(
    output_dir="./fine_results",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    # `evaluation_strategy` is the deprecated spelling of this option; `eval_strategy`
    # is the supported name in the transformers releases that include ModernBERT.
    eval_strategy="epoch",
    save_strategy="epoch",
    fp16=True,
    logging_steps=50,
    save_total_limit=2,
)
# Trainer for the fine-grained classifier, using FineTrainer's BCE-with-logits loss.
trainer_fine = FineTrainer(
    model=model_fine,
    args=fine_args,
    train_dataset=train_fine,
    eval_dataset=eval_fine,
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its replacement
    # in the transformers releases that include ModernBERT.
    processing_class=tokenizer_fine,
)

# Fine-tune the model with the configuration above.
trainer_fine.train()

# Run the fine-grained model over the held-out split.
preds = trainer_fine.predict(eval_fine)
logits, true_labels = preds.predictions, preds.label_ids

# Sigmoid maps each logit to a probability; a label is predicted when it reaches 0.5.
probs = torch.sigmoid(torch.tensor(logits)).numpy()
pred_labels = (probs >= 0.5).astype(int)

# Names for the six output columns, in order.
label_cols = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Per-label report with four decimals.
report = classification_report(
    true_labels,
    pred_labels,
    target_names=label_cols,
    digits=4,
    zero_division=0,
)
print(report)


In [None]:
# Write the fine-grained report to Drive.
with open("/content/drive/MyDrive/...", "w") as f:
    f.write(report)

# Then offer a file from Drive as a browser download.
from google.colab import files
files.download("/content/drive/MyDrive/...")


In [None]:
# Persist the fine-grained model and its tokenizer to Drive.
model_fine.save_pretrained("/content/drive/MyDrive/...")
tokenizer_fine.save_pretrained("/content/drive/MyDrive/...")


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Binary classifier and its tokenizer, loaded from Drive.
binary_model_path = "/content/drive/MyDrive/..."
tokenizer_binary = AutoTokenizer.from_pretrained(binary_model_path)
model_binary = AutoModelForSequenceClassification.from_pretrained(binary_model_path)
model_binary = model_binary.to(device)

# Fine-grained classifier and its tokenizer, loaded from Drive.
fine_model_path = "/content/drive/MyDrive/..."
tokenizer_fine = AutoTokenizer.from_pretrained(fine_model_path)
model_fine = AutoModelForSequenceClassification.from_pretrained(fine_model_path)
model_fine = model_fine.to(device)


In [None]:
import numpy as np

# Names of the six fine-grained outputs, in the order the labels were built for training.
label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

def predict_toxicity(comment, threshold=0.5, max_length=256):
    """Classify a comment with the two-stage pipeline.

    Stage 1 runs the binary model; only comments it assigns to class 1 reach
    stage 2, where the fine-grained model scores each subtype.

    Args:
        comment: Text to classify.
        threshold: Minimum sigmoid probability for a subtype to be reported.
            Defaults to 0.5, the cut-off that used to be hard-coded here.
        max_length: Token limit used when truncating the comment. Defaults to
            256, the length both tokenizers were given when the training data
            was tokenized in this notebook, so inference inputs are never
            longer than the ones seen in training.

    Returns:
        ``{"binary": "non-toxic", "subtypes": None}`` when stage 1 predicts
        class 0; otherwise ``{"binary": "toxic", "subtypes": {...}}`` where the
        dict maps every subtype at or above ``threshold`` to its probability
        rounded to two decimals (the dict may be empty).
    """
    # Stage 1: Binary prediction
    binary_inputs = tokenizer_binary(
        comment,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
        padding=True,
    ).to(device)
    with torch.no_grad():
        binary_outputs = model_binary(**binary_inputs)
        binary_pred = torch.argmax(binary_outputs.logits, dim=1).item()

    if binary_pred == 0:
        return {"binary": "non-toxic", "subtypes": None}

    # Stage 2: Fine-grained prediction
    fine_inputs = tokenizer_fine(
        comment,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
        padding=True,
    ).to(device)
    with torch.no_grad():
        fine_outputs = model_fine(**fine_inputs)
        # [0] selects the only row: a single comment is scored per call.
        probs = torch.sigmoid(fine_outputs.logits).cpu().numpy()[0]

    # Keep only the subtypes whose probability reaches the threshold.
    subtypes = {
        label: round(float(prob), 2)
        for label, prob in zip(label_cols, probs)
        if prob >= threshold
    }

    return {"binary": "toxic", "subtypes": subtypes}


In [None]:
# Smoke-test the two-stage pipeline on a single example and print the returned dict.
comment = "you are stupid."
result = predict_toxicity(comment)
print(result)
