In [None]:
# Optional download step, kept commented out: when uncommented it calls
# `od.download` to fetch the Kaggle competition data at `kaggle_data_url`
# into the `save_to` directory.
# import opendatasets as od

# kaggle_data_url = "https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data"
# save_to = "datasets/jigsaw-toxic-comment"
# od.download(kaggle_data_url, save_to)

In [1]:
import pandas as pd

# Read the zipped training CSV and reduce it to two columns:
#   "text"   - the values of the second CSV column,
#   "labels" - for each row, a list holding every column after the second.
data = pd.read_csv(
    "datasets/jigsaw-toxic-comment/jigsaw-toxic-comment-classification-challenge/train.csv.zip"
).pipe(
    lambda raw_frame: pd.DataFrame(
        {
            "text": raw_frame.iloc[:, 1].values.tolist(),
            "labels": raw_frame.iloc[:, 2:].values.tolist(),
        }
    )
)

In [5]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

# Split the frame 80/10/10 into train / validation / test partitions.
# Both random splits are seeded so the same examples land in the same partition
# after every kernel restart; without a seed, metrics from different runs are
# computed on different data and cannot be compared.
train = Dataset.from_pandas(data).train_test_split(test_size=0.2, seed=42)
test_val = train["test"].train_test_split(test_size=0.5, seed=42)

# Bundle the three partitions of the Jigsaw Toxic Comment data under one DatasetDict.
jigsaw = DatasetDict(
    {
        "train": train["train"],
        "validation": test_val["train"],
        "test": test_val["test"],
    }
)

In [3]:
# Checkpoint name used for both the encoder and its tokenizer.
model_name = "distilbert-base-uncased"

# Pre-trained encoder first, then the tokenizer loaded from the same checkpoint name.
transformer_model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
# Turn off gradient tracking on every encoder weight so that the pre-trained
# parameters are left untouched by backpropagation.
for encoder_weight in transformer_model.parameters():
    encoder_weight.requires_grad = False

In [6]:
# Batching configuration: token limit per example and examples per batch.
batch_size = 32
max_length = 512

# Collator built around the tokenizer; the dataloaders use it as their `collate_fn`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def preprocess_function(examples):
    """Tokenize a batch of examples; intended for `Dataset.map(..., batched=True)`.

    Args:
        examples: Mapping whose "text" entry holds the raw strings of the batch.

    Returns:
        The tokenizer output for those strings, truncated to at most
        `max_length` tokens per example.

    Padding is deliberately NOT requested here. Padding inside a batched `map`
    pads every example to the longest text of the whole map batch, which stores
    and later feeds the model far more pad tokens than necessary. The
    `DataCollatorWithPadding` used by the dataloaders pads each training batch
    to its own longest sequence instead.
    """
    return tokenizer(examples["text"], truncation=True, max_length=max_length)


# Fixed-size subsets: 3000 shuffled training rows, 300 shuffled validation rows,
# and rows 300..599 of the (unshuffled) test partition.
small_train_dataset = jigsaw["train"].shuffle(seed=42).select(range(3000))
small_eval_dataset = jigsaw["validation"].shuffle(seed=42).select(range(300))
small_test_dataset = jigsaw["test"].select(range(300, 600))


def _tokenize(split):
    """Tokenize one split in batches and drop its raw "text" column."""
    return split.map(preprocess_function, batched=True, remove_columns=["text"])


tokenized_train = _tokenize(small_train_dataset)
tokenized_eval = _tokenize(small_eval_dataset)
tokenized_test = _tokenize(small_test_dataset)

# All loaders share the batch size and collator; only the training loader shuffles.
loader_kwargs = {"batch_size": batch_size, "collate_fn": data_collator}
train_dataloader = DataLoader(tokenized_train, shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(tokenized_eval, **loader_kwargs)
test_dataloader = DataLoader(tokenized_test, **loader_kwargs)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [7]:
# Select the compute device (nothing is moved here; modules and tensors are
# sent to `device` where they are created or used).
# Preference order: CUDA GPU, then Apple-silicon MPS, then CPU. `device` is kept
# as a plain string, which `.to(...)` accepts.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: mps


In [10]:
# Define the custom classification head
class ClassificationHead(nn.Module):
    """Encoder followed by dropout and a single linear layer producing raw logits.

    The representation at sequence position 0 of the encoder's first output is
    used as the sentence embedding. No activation is applied to the result, so
    the output is suitable for a logits-based loss.
    """

    def __init__(self, transformer_model, num_classes, dropout=0.1):
        """Build the head on top of `transformer_model`.

        Args:
            transformer_model: Encoder module; it must expose `config.hidden_size`
                and is moved to the module-level `device`.
            num_classes: Number of output logits per example.
            dropout: Dropout probability applied before the linear layer
                (defaults to 0.1).
        """
        super().__init__()
        self.transformer_model = transformer_model.to(device)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(transformer_model.config.hidden_size, num_classes).to(device)

    def forward(self, input_ids, attention_mask):
        """Return one row of `num_classes` logits per input sequence.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            Tensor of logits; its last dimension has size `num_classes`.
        """
        # First element of the encoder output; assumed to be the per-token hidden
        # states shaped (batch, seq_len, hidden) - TODO confirm for other encoders.
        hidden_states = self.transformer_model(input_ids=input_ids, attention_mask=attention_mask)[0]
        # Position 0 along the second axis is taken as the CLS token representation.
        cls_representation = hidden_states[:, 0]
        return self.classifier(self.dropout(cls_representation))

In [11]:
# Hyperparameters for the classification model and its training run.
num_epochs = 3        # number of passes of the training loop
learning_rate = 2e-5  # learning rate handed to the optimizer
num_classes = 6       # number of outputs of the classification model

In [14]:
# Wrap the pre-trained encoder in the classification model.
model = ClassificationHead(transformer_model=transformer_model, num_classes=num_classes)

# AdamW over the model's parameters at the configured learning rate.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)

def loss_fn(outputs, targets):
    """Binary cross-entropy on raw logits.

    Args:
        outputs: Logits produced by the model.
        targets: Targets with the same shape as `outputs`; cast to float here.

    Returns:
        The loss as computed by `BCEWithLogitsLoss` with its default settings.
    """
    criterion = torch.nn.BCEWithLogitsLoss()
    float_targets = targets.float()
    return criterion(outputs, float_targets)

In [16]:
# Train for `num_epochs` epochs; after each epoch, evaluate on the validation loader.
for epoch in range(num_epochs):
    # --- training pass ---
    model.train()
    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device, dtype=torch.float32)
        optimizer.zero_grad()
        output = model(input_ids, attention_mask)
        loss = loss_fn(output, labels)
        loss.backward()
        optimizer.step()

    # --- validation pass ---
    model.eval()
    eval_loss = 0
    correct_predictions = 0
    total_samples = 0
    # Evaluation never calls backward(), so autograd is disabled to avoid building
    # (and holding in memory) a graph for every validation batch.
    with torch.no_grad():
        for batch in eval_dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device, dtype=torch.float32)
            output = model(input_ids, attention_mask)
            loss = loss_fn(output, labels)
            eval_loss += loss.item()
            # Threshold the sigmoid at 0.5 and count matches per (example, label) entry.
            correct_predictions += ((output.sigmoid() > 0.5) == labels).sum().item()
            total_samples += labels.size(0) * labels.size(1)

    # Show metrics
    # NOTE(review): this is element-wise accuracy over every (example, label) entry;
    # if positive labels are rare it can stay high even when nothing is predicted
    # positive - consider adding per-label F1 or ROC-AUC.
    accuracy = correct_predictions / total_samples
    print(
        f"Epoch {epoch+1}/{num_epochs}, Eval Loss: {eval_loss/len(eval_dataloader)}, Accuracy: {accuracy:.4f}"
    )

Epoch 1/3, Eval Loss: 0.4738864004611969, Accuracy: 0.9617
Epoch 2/3, Eval Loss: 0.40580434203147886, Accuracy: 0.9617
Epoch 3/3, Eval Loss: 0.35311581790447233, Accuracy: 0.9617
