In [4]:
pip install pandas numpy scikit-learn tensorflow keras transformers matplotlib seaborn keras-tuner requests

Collecting tensorflow
  Using cached tensorflow-2.13.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting keras
  Using cached keras-2.15.0-py3-none-any.whl.metadata (2.4 kB)
Collecting transformers
  Using cached transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
Collecting keras-tuner
  Using cached keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Using cached absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=23.1.21 (from tensorflow)
  Using cached flatbuffers-25.1.24-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast<=0.4.0,>=0.2.1 (from tensorflow)
  Using cached gast-0.4.0-py3-none-any.whl.metadata (1.1 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Using cached google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting grpcio<2.0,

Step 2: Dataset Preparation

Step 3: Preprocessing

Step 4: Model Design
Model 1: LSTM with Pre-trained Embeddings

In [5]:
import pandas as pd
import numpy as np
import re
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)
from torch.utils.data import Dataset, DataLoader
from transformers import (
    BertTokenizer, BertForSequenceClassification,
    Trainer, TrainingArguments
)

# Set device: use the GPU when CUDA is usable, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

2025-01-30 16:54:47.366479: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-30 16:54:47.389250: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
def load_data():
    """Load the training frame and the labelled part of the test set.

    Reads ``train.csv``, ``test.csv`` and ``test_labels.csv`` from the current
    working directory, joins the test comments with their labels on ``id`` and
    keeps only the test rows whose six label columns are all different from
    ``-1`` (``-1`` appears to be the "not scored" placeholder -- confirm
    against the dataset description).

    Returns:
        tuple: ``(train_df, scored_test)``. ``scored_test`` is an independent
        copy, so callers can add columns to it without pandas raising
        ``SettingWithCopyWarning``.
    """
    label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
    test_labels = pd.read_csv("test_labels.csv")

    # Merge and filter test data
    test_full = pd.merge(test_df, test_labels, on="id")
    # `.all` (not `.any`): a row is usable only when *every* label is real;
    # a single -1 would otherwise leak into the loss and the metrics.
    mask = (test_full[label_cols] != -1).all(axis=1)
    scored_test = test_full.loc[mask].copy()

    return train_df, scored_test

def clean_text(text):
    """Return a lower-cased, stripped copy of ``text`` with noise removed.

    The input is first converted with ``str``. URLs (``http...``/``www...``),
    ``@mentions``, ``#hashtags`` and every character that is neither an ASCII
    letter nor whitespace are deleted (replaced by the empty string, so
    surrounding whitespace is left as-is).
    """
    noise = re.compile(r"http\S+|www\S+|@\w+|#\w+|[^a-zA-Z\s]")
    without_noise = noise.sub("", str(text))
    lowered = without_noise.lower()
    return lowered.strip()

# Process datasets
train_df, test_df = load_data()
# `assign` returns new frames instead of writing a column in place: this avoids
# SettingWithCopyWarning on the filtered test frame and keeps the cell
# idempotent when it is re-run.
train_df = train_df.assign(cleaned_text=train_df["comment_text"].apply(clean_text))
test_df = test_df.assign(cleaned_text=test_df["comment_text"].apply(clean_text))

# Split training data: 80% train / 20% validation, fixed seed for reproducibility.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df["cleaned_text"],
    train_df[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]],
    test_size=0.2,
    random_state=42
)


In [7]:
class ToxicityDataset(Dataset):
    """Map-style dataset pairing comment texts with multi-label targets.

    Args:
        texts: pandas object with one text per row; its index is reset.
        labels: pandas object with one label row per text; its index is reset
            and the underlying array (``.values``) is stored.
        tokenizer: callable invoked as
            ``tokenizer(text, max_length=..., padding="max_length",
            truncation=True, return_tensors="pt")`` that returns a mapping
            with ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_len: value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts = texts.reset_index(drop=True)
        label_frame = labels.reset_index(drop=True)
        self.labels = label_frame.values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return flattened tensors plus float labels."""
        encoded = self.tokenizer(
            str(self.texts.iloc[idx]),
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # Flatten each tokenizer tensor to one dimension.
        item = {key: encoded[key].flatten() for key in ("input_ids", "attention_mask")}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

# Initialize tokenizer and datasets
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_dataset = ToxicityDataset(train_texts, train_labels, tokenizer)
val_dataset = ToxicityDataset(val_texts, val_labels, tokenizer)
# Select the test labels by name (as done for the training labels) rather than
# by column position, so the order of columns in the CSV files cannot silently
# change which columns are used as targets.
test_dataset = ToxicityDataset(
    test_df["cleaned_text"],
    test_df[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]],
    tokenizer,
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
class BiLSTM(torch.nn.Module):
    """Bidirectional LSTM classifier producing 6 logits per sequence.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: hidden size of each LSTM direction.
    """

    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=64):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)
        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.dropout = torch.nn.Dropout(0.3)
        self.fc = torch.nn.Linear(hidden_dim * 2, 6)

    def forward(self, input_ids, attention_mask=None):
        """Return logits of shape ``(batch, 6)``.

        Args:
            input_ids: ``(batch, seq_len)`` integer token ids.
            attention_mask: ``(batch, seq_len)`` mask with 1 for real tokens
                and 0 for padding. Assumed to be right-padded (real tokens
                first) -- confirm against the tokenizer's padding side.
                ``None`` treats every position as a real token.
        """
        x = self.embedding(input_ids)

        if attention_mask is None:
            lengths = torch.full((input_ids.size(0),), input_ids.size(1), dtype=torch.long)
        else:
            # pack_padded_sequence needs CPU lengths >= 1.
            lengths = attention_mask.sum(dim=1).clamp(min=1).to(torch.long).cpu()

        # Packing stops each sequence at its true length. The previous
        # `x[:, -1, :]` read the state after running over all padding tokens
        # (and the backward direction had only seen a single pad token).
        packed = torch.nn.utils.rnn.pack_padded_sequence(
            x, lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)

        # h_n[-2] / h_n[-1] are the final forward / backward hidden states.
        summary = torch.cat([h_n[-2], h_n[-1]], dim=1)
        return self.fc(self.dropout(summary))

class GRUModel(torch.nn.Module):
    """Unidirectional GRU classifier producing 6 logits per sequence.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: hidden size of the GRU.
    """

    def __init__(self, vocab_size, embedding_dim=200, hidden_dim=128):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)
        self.gru = torch.nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.dropout = torch.nn.Dropout(0.3)
        self.fc = torch.nn.Linear(hidden_dim, 6)

    def forward(self, input_ids, attention_mask=None):
        """Return logits of shape ``(batch, 6)``.

        Args:
            input_ids: ``(batch, seq_len)`` integer token ids.
            attention_mask: ``(batch, seq_len)`` mask with 1 for real tokens
                and 0 for padding. Assumed to be right-padded (real tokens
                first) -- confirm against the tokenizer's padding side.
                ``None`` treats every position as a real token.
        """
        x = self.embedding(input_ids)

        if attention_mask is None:
            lengths = torch.full((input_ids.size(0),), input_ids.size(1), dtype=torch.long)
        else:
            # pack_padded_sequence needs CPU lengths >= 1.
            lengths = attention_mask.sum(dim=1).clamp(min=1).to(torch.long).cpu()

        # Packing makes the final hidden state correspond to the last *real*
        # token; the previous `x[:, -1, :]` was the state after all padding.
        packed = torch.nn.utils.rnn.pack_padded_sequence(
            x, lengths, batch_first=True, enforce_sorted=False
        )
        _, h_n = self.gru(packed)

        return self.fc(self.dropout(h_n[-1]))

# BERT Model
# Load the pre-trained checkpoint with a 6-output multi-label classification
# head, then move the model onto the selected device.
bert_model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=6, problem_type="multi_label_classification"
)
bert_model = bert_model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Display the selected torch device as this cell's output.
device

device(type='cuda')

In [17]:
def train_model(model, train_loader, val_loader, optimizer, criterion, epochs=10, patience=3,
                checkpoint_path="best_model.pth"):
    """Train ``model`` with early stopping on the validation loss.

    Args:
        model: module called as ``model(input_ids, attention_mask)`` and
            returning a tensor that ``criterion`` accepts.
        train_loader: sized iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"`` tensors.
        val_loader: same batch format, used for the validation loss.
        optimizer: optimizer over ``model``'s parameters.
        criterion: callable ``criterion(outputs, labels)`` returning a scalar loss.
        epochs: maximum number of epochs.
        patience: stop after this many consecutive epochs without a new best
            validation loss.
        checkpoint_path: file the best ``state_dict`` is written to. The
            default keeps the original hard-coded name; pass a distinct path
            per model so runs do not overwrite each other's checkpoint.

    Returns:
        tuple: ``(model, train_losses, val_losses)`` where the loss lists hold
        one mean loss per completed epoch. If a checkpoint was written, the
        returned model carries the best weights rather than the last epoch's.
    """
    # Move batches to wherever the model lives instead of relying on a global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    best_val_loss = float('inf')
    # Must be initialised up front: if the very first validation loss is not
    # an improvement (e.g. NaN), the `else` branch below runs first.
    patience_counter = 0
    checkpoint_written = False
    train_losses, val_losses = [], []

    for epoch in range(epochs):
        # Training
        model.train()
        epoch_loss = 0.0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(run_device)
            attention_mask = batch["attention_mask"].to(run_device)
            labels = batch["labels"].to(run_device)
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        train_losses.append(epoch_loss / len(train_loader))

        # Validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch["input_ids"].to(run_device)
                attention_mask = batch["attention_mask"].to(run_device)
                labels = batch["labels"].to(run_device)
                outputs = model(input_ids, attention_mask)
                val_loss += criterion(outputs, labels).item()
        val_loss = val_loss / len(val_loader)
        val_losses.append(val_loss)

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_written = True
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

    # Hand back the best weights, not whatever the final epoch produced.
    if checkpoint_written:
        model.load_state_dict(torch.load(checkpoint_path, map_location=run_device))

    return model, train_losses, val_losses


In [20]:
# Common parameters
train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=64,
    num_workers=4,
    pin_memory=True,
)
criterion = torch.nn.BCEWithLogitsLoss()

# Train BiLSTM
bilstm = BiLSTM(vocab_size=tokenizer.vocab_size).to(device)
bilstm_optimizer = torch.optim.AdamW(bilstm.parameters(), lr=1e-3, weight_decay=1e-4)
bilstm, bilstm_train_loss, bilstm_val_loss = train_model(
    model=bilstm,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=bilstm_optimizer,
    criterion=criterion,
)

In [12]:
# Common parameters
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
val_loader = DataLoader(val_dataset, batch_size=32)
criterion = torch.nn.BCEWithLogitsLoss()

# Train BiLSTM
bilstm = BiLSTM(vocab_size=tokenizer.vocab_size).to(device)
bilstm_optimizer = torch.optim.AdamW(
    bilstm.parameters(),
    lr=1e-3,
    weight_decay=1e-4,
)
bilstm, bilstm_train_loss, bilstm_val_loss = train_model(
    model=bilstm,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=bilstm_optimizer,
    criterion=criterion,
)

# Train GRU
gru = GRUModel(vocab_size=tokenizer.vocab_size).to(device)
gru_optimizer = torch.optim.AdamW(gru.parameters(), lr=1e-3)
gru, gru_train_loss, gru_val_loss = train_model(
    model=gru,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=gru_optimizer,
    criterion=criterion,
)

# Train BERT
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    # `evaluation_strategy` is the deprecated spelling of this argument;
    # `eval_strategy` is the current name (available in the installed
    # transformers 4.46.x). Evaluate on `eval_dataset` after every epoch.
    eval_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
)
trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()


KeyboardInterrupt: 

In [None]:
def evaluate_model(model, test_loader):
    """Compute per-label and averaged precision/recall/F1 on ``test_loader``.

    Args:
        model: module called as ``model(input_ids, attention_mask)``. It may
            return a logits tensor directly or an object exposing ``.logits``
            (as the Hugging Face sequence-classification model does).
        test_loader: iterable of batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors; labels are expected
            to have six columns in the order of the label names below.

    Returns:
        dict: one entry per label name with ``"precision"``, ``"recall"`` and
        ``"f1"`` (each macro-averaged over the two classes of that label),
        plus ``"macro_avg"`` holding the mean of each metric across labels.

    Raises:
        ValueError: if ``test_loader`` yields no batches.
    """
    label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

    # Move batches to wherever the model lives instead of relying on a global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    pred_batches = []
    label_batches = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(run_device)
            attention_mask = batch["attention_mask"].to(run_device)
            outputs = model(input_ids, attention_mask)
            # HF models return an output object; plain modules return the tensor.
            logits = getattr(outputs, "logits", outputs)
            probs = logits.sigmoid().cpu().numpy()
            pred_batches.append((probs > 0.5).astype(int))
            label_batches.append(batch["labels"].cpu().numpy().astype(int))

    if not pred_batches:
        raise ValueError("test_loader produced no batches")

    # Stack into 2-D arrays: a Python list of per-row arrays cannot be indexed
    # with `[:, i]`, which is what the per-label metrics below require.
    all_preds = np.concatenate(pred_batches, axis=0)
    all_labels = np.concatenate(label_batches, axis=0)

    results = {}
    for i, col in enumerate(label_cols):
        y_true, y_pred = all_labels[:, i], all_preds[:, i]
        results[col] = {
            # zero_division=0 keeps the score at 0 (the default outcome)
            # without emitting a warning when a class is never predicted.
            "precision": precision_score(y_true, y_pred, average="macro", zero_division=0),
            "recall": recall_score(y_true, y_pred, average="macro", zero_division=0),
            "f1": f1_score(y_true, y_pred, average="macro", zero_division=0)
        }

    per_label = list(results.values())
    results["macro_avg"] = {
        "precision": np.mean([v["precision"] for v in per_label]),
        "recall": np.mean([v["recall"] for v in per_label]),
        "f1": np.mean([v["f1"] for v in per_label])
    }

    return results

# Evaluate all models on the held-out test set, one after another.
test_loader = DataLoader(dataset=test_dataset, batch_size=16)
print("BiLSTM Results:", evaluate_model(model=bilstm, test_loader=test_loader))
print("GRU Results:", evaluate_model(model=gru, test_loader=test_loader))
print("BERT Results:", evaluate_model(model=bert_model, test_loader=test_loader))

In [None]:
def generate_iiif_annotations(df, num_samples=10, random_state=None):
    """Build annotation dicts for a random sample of rows from ``df``.

    Args:
        df: DataFrame with an ``id`` column and the six label columns
            ``toxic``, ``severe_toxic``, ``obscene``, ``threat``, ``insult``
            and ``identity_hate``.
        num_samples: maximum number of rows to sample; capped at ``len(df)``
            so small frames no longer raise.
        random_state: forwarded to ``DataFrame.sample``; pass an int to make
            the selection reproducible. ``None`` keeps the previous
            non-deterministic behaviour.

    Returns:
        list[dict]: one annotation per sampled row. The body value is the
        string form of the row's label dict; the target points at a URL
        derived from the row id.
    """
    label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

    # `sample` raises when asked for more rows than exist (without replacement).
    sample_size = min(max(int(num_samples), 0), len(df))
    if sample_size == 0:
        return []

    annotations = []
    for _, row in df.sample(n=sample_size, random_state=random_state).iterrows():
        annotation = {
            "@context": "http://iiif.io/api/presentation/3/context.json",
            "id": f"https://example.org/annotation/{row['id']}",
            "type": "Annotation",
            "motivation": "classifying",
            "body": {
                "type": "TextualBody",
                "value": str(row[label_cols].to_dict()),
                "format": "text/plain"
            },
            "target": {
                "source": f"https://example.org/wiki_comments/{row['id']}.jpg",
                "selector": {"type": "FragmentSelector", "value": "xywh=0,0,500,500"}
            }
        }
        annotations.append(annotation)
    return annotations

# Serialise a fresh sample of annotations and write it to disk as JSON.
with open("iiif_annotations.json", "w") as f:
    f.write(json.dumps(generate_iiif_annotations(test_df)))

In [None]:
# [9] Error Analysis
def analyze_errors(model, test_loader, texts=None, max_errors=10):
    """Collect the first misclassified examples from ``test_loader``.

    An example counts as an error when its thresholded prediction vector
    (sigmoid > 0.5) differs from its label vector in any position.

    Args:
        model: module called as ``model(input_ids, attention_mask)``; may
            return a logits tensor or an object exposing ``.logits``.
        test_loader: iterable of batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors. Must iterate in the
            same order as ``texts`` (i.e. not shuffled).
        texts: sequence of texts aligned with the loader's rows. Defaults to
            the notebook-level ``test_df["cleaned_text"]``.
        max_errors: maximum number of errors to return.

    Returns:
        list[dict]: up to ``max_errors`` dicts with ``"text"``, ``"true"`` and
        ``"pred"`` entries, in loader order.
    """
    if max_errors <= 0:
        return []
    if texts is None:
        texts = test_df["cleaned_text"]
    texts = list(texts)

    # Move batches to wherever the model lives instead of relying on a global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    errors = []
    offset = 0  # global row index of the first example in the current batch
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(run_device)
            attention_mask = batch["attention_mask"].to(run_device)
            labels = batch["labels"].cpu().numpy()
            outputs = model(input_ids, attention_mask)
            # HF models return an output object; plain modules return the tensor.
            logits = getattr(outputs, "logits", outputs)
            preds = (logits.sigmoid().cpu().numpy() > 0.5).astype(int)

            for i in range(len(preds)):
                if not np.array_equal(preds[i], labels[i]):
                    errors.append({
                        # `i` is only the position inside this batch; the text
                        # lives at the global position `offset + i`.
                        "text": texts[offset + i],
                        "true": labels[i],
                        "pred": preds[i]
                    })
                    if len(errors) >= max_errors:
                        return errors  # stop early instead of scanning the rest
            offset += len(preds)
    return errors

# Show a handful of examples the BERT model got wrong on the test set.
print("BERT Error Examples:", analyze_errors(model=bert_model, test_loader=test_loader))


In [None]:
# [10] Visualization
def plot_metrics(model_name, train_loss, val_loss):
    """Draw the training and validation loss curves of one model.

    Args:
        model_name: name used as the prefix of both legend entries.
        train_loss: sequence of training losses, plotted against its index.
        val_loss: sequence of validation losses, plotted against its index.
    """
    _, ax = plt.subplots(figsize=(10, 5))
    curves = (("Training", train_loss), ("Validation", val_loss))
    for kind, values in curves:
        ax.plot(values, label=f"{model_name} {kind} Loss")
    ax.set_xlabel("Epochs")
    ax.set_ylabel("Loss")
    ax.legend()
    plt.show()

# Loss curves for the two recurrent models trained with `train_model`.
plot_metrics(model_name="BiLSTM", train_loss=bilstm_train_loss, val_loss=bilstm_val_loss)
plot_metrics(model_name="GRU", train_loss=gru_train_loss, val_loss=gru_val_loss)