In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem at a single named location.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)


In [None]:
pip install gurobipy[matrixapi] "numpy<2"

In [None]:
# Step 1: Imports
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader
from sklearn.metrics import hamming_loss, classification_report

# Step 2: Load saved fine-grained model
model_path = "/content/drive/MyDrive/..."

# Fall back to CPU when the runtime has no GPU instead of crashing on .to("cuda").
device = "cuda" if torch.cuda.is_available() else "cpu"

# Tokenizer and classifier are both restored from the same saved directory.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)

In [None]:
# Step 3: Load and filter Jigsaw dataset
label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

df = pd.read_csv("/content/drive/MyDrive/...")

# Missing label values are treated as 0, then every label column is cast to int.
df[label_cols] = df[label_cols].fillna(0).astype(int)

# A row is flagged when at least one of the label columns is set.
df["toxic_binary"] = df[label_cols].max(axis=1)

# Keep only flagged rows, and only the text plus the label columns.
has_any_label = df["toxic_binary"] == 1
df_fine = df.loc[has_any_label, ["comment_text", *label_cols]]

# Step 4: Split + Save splits
from sklearn.model_selection import train_test_split

# 90/10 split with a fixed seed so the same rows land in each split on re-run.
train_df, eval_df = train_test_split(df_fine, test_size=0.1, random_state=42)

# Write both splits to Drive without the pandas index column.
for split_frame, split_path in (
    (train_df, "/content/drive/MyDrive/train..."),
    (eval_df, "/content/drive/MyDrive/eval..."),
):
    split_frame.to_csv(split_path, index=False)

In [None]:
# Step 5: Convert eval split to HuggingFace Dataset
# eval_df keeps the shuffled row index from the train/eval split; drop it so it
# is not carried into the dataset as an extra `__index_level_0__` column.
eval_fine = Dataset.from_pandas(eval_df, preserve_index=False)

# Step 6: Tokenization
def tokenize(example):
    """Tokenize one example's comment text and attach its label vector.

    Reads ``example["comment_text"]`` and one value per name in the
    module-level ``label_cols``. Returns the tokenizer output (padded and
    truncated to 256 tokens) with an added ``"labels"`` list ordered like
    ``label_cols``.
    """
    encoded = tokenizer(
        example["comment_text"],
        truncation=True,
        padding="max_length",
        max_length=256,
    )
    label_vector = []
    for name in label_cols:
        label_vector.append(example[name])
    encoded["labels"] = label_vector
    return encoded

def _labels_as_float32(example):
    """Return the example's labels re-encoded as a float32 NumPy array."""
    return {"labels": np.array(example["labels"], dtype=np.float32)}


# Tokenize every example, switch to plain Python output, then rewrite the
# labels as float32 arrays.
eval_fine = eval_fine.map(tokenize).with_format("python").map(_labels_as_float32)

# Expose only the model inputs and labels, returned as torch tensors.
eval_fine.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)


In [None]:
# Step 7: Custom DataLoader
def collate_fn(batch):
    """Stack a list of per-example dicts into one batch dict of tensors.

    Args:
        batch: list of dicts, each with ``"input_ids"`` and
            ``"attention_mask"`` tensors of equal length and a ``"labels"``
            entry (tensor, NumPy array, or plain list of numbers).

    Returns:
        Dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``,
        each stacked along a new leading batch dimension. ``"labels"`` is
        always float32.
    """
    return {
        "input_ids": torch.stack([example["input_ids"] for example in batch]),
        "attention_mask": torch.stack([example["attention_mask"] for example in batch]),
        # torch.as_tensor accepts tensors, arrays and lists directly, which
        # avoids a tensor -> NumPy -> tensor round trip for each example.
        "labels": torch.stack(
            [torch.as_tensor(example["labels"], dtype=torch.float32) for example in batch]
        ),
    }

# No shuffle argument is passed, so the loader is built with DataLoader's
# default ordering; batches of 32 are assembled by collate_fn.
eval_loader = DataLoader(
    eval_fine,
    collate_fn=collate_fn,
    batch_size=32,
)



In [None]:
# Step 8: Predict
def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))`` without overflow.

    Args:
        x: scalar or array-like of logits.

    Returns:
        Values in [0, 1] with the same shape as ``x`` (a NumPy scalar for
        scalar input).
    """
    x = np.asarray(x)
    # exp(-|x|) always lies in (0, 1], so it cannot overflow the way exp(-x)
    # does for large negative x.
    decay = np.exp(-np.abs(x))
    # For x >= 0 this is the textbook formula; for x < 0 the numerator and
    # denominator are both multiplied by exp(x).
    probs = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # n-d arrays untouched.
    return probs[()]


def binarize(logits, threshold=0.5):
    """Convert logits to 0/1 predictions.

    A position is 1 where ``sigmoid(logit)`` is strictly greater than
    ``threshold`` and 0 otherwise.
    """
    return (sigmoid(logits) > threshold).astype(int)

model.eval()
all_logits, all_labels = [], []

# Send inputs to whichever device the model's weights are on, rather than
# assuming a CUDA device is present.
device = next(model.parameters()).device

# Gradients are not needed for evaluation.
with torch.no_grad():
    for batch in eval_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].cpu().numpy()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits.cpu().numpy()

        all_logits.append(logits)
        all_labels.append(labels)

# Concatenate the per-batch arrays row-wise, then threshold the logits.
logits = np.vstack(all_logits)
true_labels = np.vstack(all_labels)
pred_labels = binarize(logits)

In [None]:
# Step 9: Save predictions to CSV
output_df = pd.DataFrame({"comment_text": eval_df["comment_text"].values})

# For each label, add a true_/pred_ column pair, taken column by column from
# the stacked label arrays.
for col, true_column, pred_column in zip(label_cols, true_labels.T, pred_labels.T):
    output_df[f"true_{col}"] = true_column
    output_df[f"pred_{col}"] = pred_column

output_df.to_csv("/content/drive/MyDrive/...", index=False)
print("Predictions saved to Drive")

# Step 10: Evaluation
hloss = hamming_loss(true_labels, pred_labels)
print(f"Hamming Loss: {hloss:.4f}")

# Per-label precision/recall/F1 report, named after label_cols; undefined
# ratios are reported as 0.
report = classification_report(
    true_labels,
    pred_labels,
    target_names=label_cols,
    digits=4,
    zero_division=0,
)
print(report)

# Write the text report to Drive.
# NOTE(review): the original comment here read "the 29th one is the last one";
# presumably it refers to numbered report files. Confirm and reword.
with open("/content/drive/MyDrive/...", "w") as report_file:
    report_file.write(report)
print("Classification report saved.")

In [None]:
from sklearn.metrics import multilabel_confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# One confusion matrix per label column of true_labels / pred_labels
cm_per_label = multilabel_confusion_matrix(true_labels, pred_labels)

# Draw one small annotated heatmap per label, pairing each label name with its matrix
for label, cm in zip(label_cols, cm_per_label):
    tick_labels = ["Not " + label, label]
    plt.figure(figsize=(4, 3))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=tick_labels,
        yticklabels=tick_labels,
    )
    plt.title(f"Confusion Matrix: {label}")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.tight_layout()
    plt.show()


Show Label Distribution in Train and Test Splits

In [None]:
# Print the number of positive examples per label, largest first, for each split.
for heading, split_frame in (
    ("🔹 Label distribution in TRAIN split:", train_df),
    ("\n🔹 Label distribution in TEST split:", eval_df),
):
    print(heading)
    print(split_frame[label_cols].sum().sort_values(ascending=False))


In [None]:
def _as_percent(value):
    """Format a fraction as a percentage string with two decimals."""
    return f"{value:.2%}"


# Print the share of rows carrying each label, largest first, for each split.
for heading, split_frame in (
    ("🔹 Relative label % in TRAIN split:", train_df),
    ("\n🔹 Relative label % in TEST split:", eval_df),
):
    label_share = split_frame[label_cols].sum() / len(split_frame)
    print(heading)
    print(label_share.sort_values(ascending=False).apply(_as_percent))
