# Task 1: Quora Insincere Questions Classification
Fine-tuning DeBERTa-v3-base for binary classification of toxic/insincere questions.

In [None]:
# Install the third-party packages for this notebook with uv.
# transformers and datasets are imported in the import cell below; accelerate and
# sentencepiece are never imported directly — presumably they are runtime
# requirements of Trainer and the DeBERTa-v3 tokenizer (TODO confirm).
# NOTE(review): only accelerate has a version bound; the others are unpinned.
!uv pip install transformers datasets
!uv pip install 'accelerate>=0.26.0'
!uv pip install sentencepiece

[2mUsing Python 3.12.11 environment at: /home/zeus/miniconda3/envs/cloudspace[0m
[2mAudited [1m2 packages[0m [2min 9ms[0m[0m
[2mUsing Python 3.12.11 environment at: /home/zeus/miniconda3/envs/cloudspace[0m
[2K[2mResolved [1m39 packages[0m [2min 269ms[0m[0m                                        [0m
[2K[2mPrepared [1m1 package[0m [2min 68ms[0m[0m                                               
         If the cache and target directories are on different filesystems, hardlinking may not be supported.
[2K[2mInstalled [1m1 package[0m [2min 25ms[0m[0m                                 [0m
 [32m+[39m [1maccelerate[0m[2m==1.12.0[0m


In [None]:
# Extract the challenge archive. -n never overwrites files that already exist,
# so re-running this cell does not stop at unzip's interactive "replace?" prompt.
!unzip -n "Language Challenge.zip"

In [None]:
import numpy as np
import pandas as pd
import torch
from datasets import ClassLabel, Dataset
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.utils.class_weight import compute_class_weight
from torch import nn
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

In [None]:
# Locations of the labelled training file and the unlabelled test file inside
# the extracted challenge archive.
TRAIN_CSV = "Language Challenge/quora-insincere-questions-classification/train.csv"
TEST_CSV = "Language Challenge/quora-insincere-questions-classification/test.csv"

# Read both files into DataFrames, train first.
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

## Data Preprocessing & Tokenization
Compute class weights for imbalanced data and tokenize using DeBERTa tokenizer.

In [None]:
# "balanced" weights are inversely proportional to class frequency, so the rare
# positive class (label 1) gets the larger weight in the loss.
balanced_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.array([0, 1]),
    y=train["target"].values,
)
# Float tensor so the weights can be handed to torch's CrossEntropyLoss.
class_weights = torch.tensor(balanced_weights, dtype=torch.float)
print(f"Class weights: {class_weights}")

# Tokenizer matching the checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")


def tokenize_function(examples):
    """Tokenize a batch of questions with the module-level ``tokenizer``.

    Args:
        examples: Batch mapping containing a ``"question_text"`` entry.

    Returns:
        Whatever the tokenizer returns for that text. Sequences are truncated
        to 256 tokens and left unpadded (padding is deferred to the collator).
    """
    questions = examples["question_text"]
    encoding_options = {"truncation": True, "padding": False, "max_length": 256}
    return tokenizer(questions, **encoding_options)


# Build the Hugging Face dataset in one chain: wrap the two needed columns,
# tokenize in batches, rename the target column to "labels", and cast it to a
# ClassLabel feature so the split below can stratify on it.
dataset = (
    Dataset.from_pandas(train[["question_text", "target"]])
    .map(tokenize_function, batched=True)
    .rename_column("target", "labels")
    .cast_column("labels", ClassLabel(names=["non_toxic", "toxic"]))
)

# Hold out 10% for validation, stratified on the label, with a fixed seed.
split_dataset = dataset.train_test_split(
    test_size=0.1,
    stratify_by_column="labels",
    seed=42,
)

Class weights: tensor([0.5330, 8.0814])




Map:   0%|          | 0/1306122 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1306122 [00:00<?, ? examples/s]

## Model Training
Custom WeightedTrainer with class-weighted cross-entropy loss to handle class imbalance.

In [None]:
class WeightedTrainer(Trainer):
    """Trainer whose loss is class-weighted cross-entropy.

    The per-class weights live in ``self.class_weights``. They can be passed to
    the constructor via ``class_weights=...`` or assigned to the attribute after
    construction. When the weights are ``None`` the loss is plain, unweighted
    cross-entropy.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        """Create the trainer.

        Args:
            *args: Forwarded unchanged to ``Trainer.__init__``.
            class_weights: Optional 1-D tensor with one weight per class.
            **kwargs: Forwarded unchanged to ``Trainer.__init__``.
        """
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(
        self, model, inputs, return_outputs=False, num_items_in_batch=None
    ):
        """Compute class-weighted cross-entropy for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch mapping; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted for signature compatibility; unused.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs`` is
            true.
        """
        labels = inputs.get("labels")
        # Keep the labels out of the forward call so the model does not also
        # compute its own (unweighted) loss; `inputs` itself is left untouched.
        model_inputs = {
            name: value for name, value in inputs.items() if name != "labels"
        }
        outputs = model(**model_inputs)
        logits = outputs.get("logits")

        weight = getattr(self, "class_weights", None)
        if weight is not None:
            # CrossEntropyLoss needs the weights on the same device as the logits.
            weight = weight.to(logits.device)

        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss


# Pretrained encoder plus a two-class sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/deberta-v3-base",
    num_labels=2,
)

training_args = TrainingArguments(
    output_dir="./deberta-tuned",
    # Optimisation schedule
    num_train_epochs=2,
    learning_rate=2e-5,
    warmup_steps=1000,
    weight_decay=0.01,
    fp16=True,
    # Batch sizes
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # Evaluate and checkpoint on the same step interval; the checkpoint with the
    # best "f1" (as reported by compute_metrics) is reloaded when training ends.
    eval_strategy="steps",
    eval_steps=5000,
    save_strategy="steps",
    save_steps=5000,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # Logging
    logging_steps=100,
)


def compute_metrics(eval_pred):
    """Turn evaluation logits and labels into classification metrics.

    Args:
        eval_pred: Pair ``(predictions, labels)`` where ``predictions`` holds
            per-class scores along axis 1.

    Returns:
        Dict with ``"f1"`` (binary), ``"f1_macro"``, ``"precision"`` and
        ``"recall"``, computed on the argmax class of each row.
    """
    scores, labels = eval_pred
    predicted = np.argmax(scores, axis=1)

    metrics = {}
    metrics["f1"] = f1_score(labels, predicted, average="binary")
    metrics["f1_macro"] = f1_score(labels, predicted, average="macro")
    metrics["precision"] = precision_score(labels, predicted)
    metrics["recall"] = recall_score(labels, predicted)
    return metrics


# Wire the model, the stratified splits, the padding collator and the metric
# function into the custom trainer.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    # `tokenizer=` is deprecated on Trainer; `processing_class` is its replacement.
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

# The weighted loss reads `trainer.class_weights`; keep them on the training device.
trainer.class_weights = class_weights.to(trainer.args.device)

train_output = trainer.train()

# training_args sets load_best_model_at_end=True, so the trainer now holds the
# best-F1 checkpoint. Persist it (and the tokenizer) to the directory the
# inference cell loads from, so the notebook runs end to end on a fresh kernel.
trainer.save_model("deberta-task1")
tokenizer.save_pretrained("deberta-task1")

# Keep the training summary as the cell's displayed result.
train_output

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = WeightedTrainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Step,Training Loss,Validation Loss,F1,F1 Macro,Precision,Recall
5000,0.25,0.261422,0.696927,0.836375,0.599483,0.832199
10000,0.223,0.220058,0.662574,0.815779,0.523762,0.901497
15000,0.241,0.246188,0.707477,0.841862,0.601491,0.858805
20000,0.1692,0.255871,0.721434,0.849776,0.627044,0.849276
25000,0.1669,0.250423,0.714278,0.845477,0.605209,0.871303
30000,0.1585,0.267364,0.728048,0.853367,0.633804,0.855216
35000,0.185,0.232511,0.721335,0.849405,0.615103,0.871922


TrainOutput(global_step=36736, training_loss=0.2102616144026198, metrics={'train_runtime': 6630.3565, 'train_samples_per_second': 354.584, 'train_steps_per_second': 5.541, 'total_flos': 5.903708433069824e+16, 'train_loss': 0.2102616144026198, 'epoch': 2.0})

## Inference
Load the fine-tuned model and run predictions over the full training set (`train`), so the predictions can be scored against the known labels.

In [None]:
# Directory holding the fine-tuned model and its tokenizer.
MODEL_DIR = "deberta-task1"

# Prefer the GPU when one is visible to torch.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

# Load the classifier, move it to the target device, then compile it.
model = torch.compile(
    AutoModelForSequenceClassification.from_pretrained(MODEL_DIR).to(device)
)
# Switch to evaluation mode for inference.
model.eval()


The tokenizer you are loading from 'deberta-tuned/checkpoint-30000' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


OptimizedModule(
  (_orig_mod): DebertaV2ForSequenceClassification(
    (deberta): DebertaV2Model(
      (embeddings): DebertaV2Embeddings(
        (word_embeddings): Embedding(128100, 768, padding_idx=0)
        (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): DebertaV2Encoder(
        (layer): ModuleList(
          (0-11): 12 x DebertaV2Layer(
            (attention): DebertaV2Attention(
              (self): DisentangledSelfAttention(
                (query_proj): Linear(in_features=768, out_features=768, bias=True)
                (key_proj): Linear(in_features=768, out_features=768, bias=True)
                (value_proj): Linear(in_features=768, out_features=768, bias=True)
                (pos_dropout): Dropout(p=0.1, inplace=False)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): DebertaV2SelfOutput(
                (dense): Linear(in_featu

In [None]:
# DataLoader, tqdm and torch are imported in the notebook's import cell.
# Despite its name, `test_ds` wraps the questions of the labelled `train` frame
# (not the `test` frame), so its predictions can be compared with train["target"].
test_ds = Dataset.from_pandas(train[["question_text"]])


def tok_fn(batch):
    """Tokenize a batch of questions for inference with the module-level ``tokenizer``.

    Args:
        batch: Batch mapping containing a ``"question_text"`` entry.

    Returns:
        The tokenizer's output for that text, truncated to at most 256 tokens.
    """
    questions = batch["question_text"]
    return tokenizer(questions, truncation=True, max_length=256)


# Tokenize, dropping the raw text column so only tokenizer outputs remain.
test_ds = test_ds.map(tok_fn, batched=True, remove_columns=["question_text"])

# Pad per batch; shuffle=False keeps predictions aligned with the rows of `train`.
collator = DataCollatorWithPadding(tokenizer=tokenizer)
loader = DataLoader(test_ds, batch_size=256, shuffle=False, collate_fn=collator)

all_logits = []
with torch.no_grad():
    for batch in tqdm(loader):
        on_device = {name: tensor.to(device) for name, tensor in batch.items()}
        # Move each batch's logits back to the CPU as soon as it is computed.
        all_logits.append(model(**on_device).logits.cpu())

stacked_logits = torch.cat(all_logits, dim=0)
logits = stacked_logits.numpy()
probs = torch.softmax(stacked_logits, dim=1).numpy()

# Hard label = argmax class; p_toxic = softmax probability of class index 1.
pred_label = np.argmax(logits, axis=1)
p_toxic = probs[:, 1]

# Store both on `train` for the submission and evaluation cells below.
train["pred_label"] = pred_label
train["p_toxic"] = p_toxic


Map:   0%|          | 0/1306122 [00:00<?, ? examples/s]

  0%|          | 0/5103 [00:00<?, ?it/s]W0113 06:23:50.074000 10671 /system/conda/miniconda3/envs/cloudspace/lib/python3.12/site-packages/torch/_inductor/utils.py:1436] [0/0] Not enough SMs to use max_autotune_gemm mode
100%|██████████| 5103/5103 [16:20<00:00,  5.20it/s]


In [None]:
# Two-column prediction file: question id plus the predicted hard label.
# NOTE(review): these are predictions for the rows of `train`; the `test` frame
# loaded earlier is never scored — confirm this is the intended deliverable.
submission = train[["qid", "pred_label"]].rename(columns={"pred_label": "prediction"})
submission.to_csv("task1_pred.csv", index=False)

## Evaluation & Threshold Tuning
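Evaluate model performance on the training-set predictions and find the classification threshold that maximizes F1. Note that most of these rows were also used for fine-tuning, so the scores below are optimistic compared with the held-out validation F1 reported during training.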

In [None]:
# Ground-truth labels and argmax predictions for every row of `train`.
y_true = train["target"].values
y_pred = train["pred_label"].values

# Score the hard predictions with each metric, in print order.
precision, recall, f1 = (
    metric(y_true, y_pred) for metric in (precision_score, recall_score, f1_score)
)

for label, value in (("Precision:", precision), ("Recall:", recall), ("F1:", f1)):
    print(label, value)


Precision: 0.6842662383881153
Recall: 0.9279297116693479
F1: 0.7876846152230089


In [None]:
# Sweep 50 evenly spaced thresholds on the toxic-class probability and keep the
# one with the highest F1, together with its precision and recall.
# NOTE(review): the sweep runs on predictions for `train`, and the chosen
# threshold is only printed — the submission file is built from argmax labels.
y_true = train["target"].values
probs = train["p_toxic"].values

best_f1 = 0
best_t = 0
best_precision = 0
best_recall = 0
for t in np.linspace(0.05, 0.95, 50):
    # Binarise once per threshold instead of once per metric.
    y_hat = probs > t
    candidate_f1 = f1_score(y_true, y_hat)
    if candidate_f1 > best_f1:
        best_t = t
        best_f1 = candidate_f1
        # Precision and recall are only needed for a new best threshold.
        best_precision = precision_score(y_true, y_hat)
        best_recall = recall_score(y_true, y_hat)

print("Best threshold:", best_t)
print("Best F1:", best_f1)
print("Best Precision:", best_precision)
print("Best Recall:", best_recall)

Best threshold: 0.8581632653061224
Best F1: 0.8083354742941965
Best Precision: 0.7503524299114951
Best Recall: 0.8760301942828858
