In [1]:
import os
import gc
import random
import torch
import pandas as pd
from scipy.special import softmax
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import warnings
warnings.filterwarnings('ignore')

2025-09-10 00:43:54.526639: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757465034.719939      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757465034.774793      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Load the pretrained DistilBERT checkpoint: both the sequence-classification
# model (2 output labels) and its matching tokenizer.
model_path = "/kaggle/input/distilbert-uncased/transformers/default/1/distilbert-base-uncased"

# Seed every RNG in play *before* the model is built. The classification head
# (pre_classifier / classifier) is not part of the checkpoint and is randomly
# initialised at load time, so without a fixed seed each kernel restart would
# start training from different weights.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

model = DistilBertForSequenceClassification.from_pretrained(model_path, num_labels=2)
tokenizer = DistilBertTokenizer.from_pretrained(model_path)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/distilbert-uncased/transformers/default/1/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# --- CONFIG ---
# Input CSVs of the competition dataset.
TRAIN_PATH = '/kaggle/input/jigsaw-agile-community-rules/train.csv'
TEST_PATH = '/kaggle/input/jigsaw-agile-community-rules/test.csv'

# Cross-validation / random-search budget.
N_FOLDS = 5
N_RANDOM_SEARCH = 5   # number of random hyperparameter configs to evaluate

# Candidate values per hyperparameter. Each key is forwarded as a keyword
# argument to TrainingArguments by the (currently commented-out) search loop,
# and sample_hparams picks one value per key in this order.
HPARAM_SPACE = dict(
    learning_rate=[5e-6, 1e-5, 2e-5, 3e-5],
    num_train_epochs=[3, 5, 8, 10],
    per_device_train_batch_size=[8, 16],
    per_device_eval_batch_size=[2, 4, 8],
    gradient_accumulation_steps=[1, 2],
    weight_decay=[0.0, 0.01, 0.05],
    lr_scheduler_type=["linear", "cosine"],
)

In [4]:
# Tokenize
def tokenize_function(examples):
    """Tokenize the ``text_pair`` field of a batch with the module-level tokenizer.

    Intended as the callable for ``Dataset.map(..., batched=True)``.
    Truncation is enabled; no explicit ``max_length`` is passed, so the
    tokenizer's own limit applies.
    """
    texts = examples["text_pair"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

def _label_example_columns(df, example_cols, label, sep):
    """Turn per-row example columns into labelled ``text_pair`` rows.

    Args:
        df: Frame containing a ``rule`` column plus every column in ``example_cols``.
        example_cols: Names of the example columns to unpivot.
        label: Integer label assigned to every produced row.
        sep: Separator string placed after the example and after the rule.

    Returns:
        DataFrame with columns ``text_pair`` (``example + sep + rule + sep``,
        both parts stripped) and ``labels``. Rows whose example or rule is
        missing, or whose example is blank after stripping, are omitted.
    """
    long_df = df[["rule"] + example_cols].melt(
        id_vars="rule", value_vars=example_cols, value_name="example"
    )
    # A pair needs both halves; a missing rule would otherwise yield a NaN text_pair.
    long_df = long_df.dropna(subset=["rule", "example"])

    # astype(str) keeps the .str accessor usable even when a column was read
    # as non-string (e.g. an all-NaN column is float and is empty after dropna).
    examples = long_df["example"].astype(str).str.strip()
    rules = long_df["rule"].astype(str).str.strip()

    keep = examples != ""
    pairs = (examples[keep] + sep + rules[keep] + sep).to_frame("text_pair")
    pairs["labels"] = label
    return pairs


def expand(df, tokenizer, train=True):
    """
    Vectorized expansion of original + positive + negative examples.

    Args:
        df: Frame with ``rule``, ``positive_example_1/2`` and
            ``negative_example_1/2`` columns; when ``train`` is true it must
            also contain ``body`` and ``rule_violation``.
        tokenizer: Object exposing ``sep_token``; the token is used as the
            separator inside each ``text_pair``.
        train: When true, the original ``body``/``rule`` rows (labelled with
            ``rule_violation``) are placed first; otherwise only the
            positive/negative example rows are returned.

    Returns:
        DataFrame with columns ``text_pair`` and ``labels`` and a fresh
        0..n-1 index, ordered: original rows (train only), positive examples
        (label 1), negative examples (label 0).

    Rows that cannot form a usable string pair (missing body, rule, label or
    example) are dropped instead of propagating NaN into the tokenizer.
    """
    sep = tokenizer.sep_token

    pos_cols = [f"positive_example_{i}" for i in range(1, 3)]
    neg_cols = [f"negative_example_{i}" for i in range(1, 3)]

    frames = [
        _label_example_columns(df, pos_cols, 1, sep),
        _label_example_columns(df, neg_cols, 0, sep),
    ]

    if train:
        # Only rows with text, rule and label present can be training examples;
        # a missing label would also make astype(int) raise.
        labelled = df.dropna(subset=["body", "rule", "rule_violation"])
        original = pd.DataFrame({
            "text_pair": labelled["body"].str.strip() + sep + labelled["rule"].str.strip() + sep,
            "labels": labelled["rule_violation"].astype(int),
        })
        frames.insert(0, original)

    return pd.concat(frames, ignore_index=True)

def sample_hparams(space, rng=None):
    """Randomly sample a hyperparameter config.

    Args:
        space: Mapping of hyperparameter name -> sequence of candidate values.
        rng: Optional ``random.Random`` instance (or any object with a
            ``choice`` method). When omitted, the module-level ``random``
            generator is used. Passing a seeded instance makes the draw
            reproducible without touching global RNG state.

    Returns:
        Dict with the same keys (in the same order) as ``space``, each mapped
        to one value chosen uniformly from its candidates.

    Raises:
        IndexError: If any candidate sequence is empty (raised by ``choice``).
    """
    chooser = random if rng is None else rng
    return {name: chooser.choice(options) for name, options in space.items()}

In [5]:
# Read the raw train and test CSVs.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

# Expand both frames into (text_pair, labels) rows. With train=False only the
# positive/negative example columns of the test frame are used.
exp_train = expand(train_df, tokenizer, train=True)
exp_test = expand(test_df, tokenizer, train=False)

# Single pool of labelled rows used for both CV and the final fit.
# NOTE(review): if the same example/rule pair occurs in several source rows,
# identical text_pair rows may land in both the train and validation folds —
# consider de-duplicating before cross-validating; confirm against the data.
df = pd.concat((exp_train, exp_test), ignore_index=True)
y = df["labels"]

# --- Stratified K-FOLD SPLIT ---
# skf / y / best_score / best_config are consumed by the hyperparameter
# search loop further down (currently commented out).
skf = StratifiedKFold(shuffle=True, n_splits=N_FOLDS, random_state=42)

# Convert once to a Hugging Face dataset so folds can be taken with .select().
full_dataset = Dataset.from_pandas(df)

best_config = None
best_score = float("-inf")

In [6]:
# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
    
#     # Convert logits to probabilities
#     probabilities = softmax(predictions, axis=1)
    
#     # Column AUC
#     auc_scores = {}    
#     # AUC
#     try:
#         overall_auc = roc_auc_score(labels, probabilities[:, 1])
#         auc_scores['overall_auc'] = overall_auc
#     except ValueError:
#         auc_scores['overall_auc'] = 0.0

#     return auc_scores

# from transformers import EarlyStoppingCallback

# # Define the callback
# early_stopping = EarlyStoppingCallback(
#     early_stopping_patience=2,   # stop after N eval checks with no improvement
#     early_stopping_threshold=0.01, # min improvement to reset patience
# )

In [7]:
# for trial in range(N_RANDOM_SEARCH):
#     hparams = sample_hparams(HPARAM_SPACE)
#     print(f"\n Trial {trial+1}/{N_RANDOM_SEARCH}, hparams={hparams}")

#     fold_scores = []

#     for fold, (train_idx, val_idx) in enumerate(skf.split(df, y)):
#         train_dataset = full_dataset.select(train_idx.tolist())
#         val_dataset = full_dataset.select(val_idx.tolist())
        
#         # Tokenize
#         tokenized_train = train_dataset.map(tokenize_function, batched=True)
#         tokenized_val = val_dataset.map(tokenize_function, batched=True)

#         print(f"Fold {fold}: {len(train_dataset)} train / {len(val_dataset)} val")
        
#         # Model
#         model = DistilBertForSequenceClassification.from_pretrained(model_path, num_labels=2)

#         # Training args
#         training_args = TrainingArguments(
#             output_dir=f"./results/trial{trial}_fold{fold}",
#             eval_strategy="epoch",
#             save_strategy="no",     # don’t save per fold
#             report_to="none",
#             logging_steps=10,
#             metric_for_best_model="overall_auc",  
#             greater_is_better=True,
#             **hparams
#         )

#         trainer = Trainer(
#             model=model,
#             args=training_args,
#             train_dataset=tokenized_train,
#             eval_dataset=tokenized_val,
#             compute_metrics=compute_metrics,
#             processing_class=tokenizer,
#             callbacks=[early_stopping]
#         )
#         result = trainer.train()
#         metrics = trainer.evaluate()
#         fold_scores.append(metrics["eval_overall_auc"])

#         print(f"  Fold {fold} AUC = {metrics['eval_overall_auc']:.4f}")

#     mean_score = np.mean(fold_scores)
#     print(f" Trial {trial+1} mean AUC = {mean_score:.4f}")

#     # Track best
#     if mean_score > best_score:
#         best_score = mean_score
#         best_config = hparams

# print("\n Best config:", best_config, "with mean AUC:", best_score)

In [8]:
# # Best config
# config_df = pd.DataFrame([best_config])
# config_df.to_csv('best_params.csv', index=False)

In [9]:
# Tokenize every text_pair in the pooled dataset (batched for speed).
full_dataset = full_dataset.map(tokenize_function, batched=True)

# Fixed hyperparameters for the final fit on the full dataset.
# NOTE(review): warmup_steps=500 is an absolute step count. With the 10185
# examples shown in the map output, batch 16 and accumulation 2, three epochs
# look like roughly 950 optimizer steps on a single device, so warmup would
# span about half the run — confirm this is intended.
training_args = TrainingArguments(
    # Optimisation schedule
    learning_rate=3e-5,
    lr_scheduler_type="cosine",
    warmup_steps=500,
    weight_decay=0.0,
    # Batching / duration
    num_train_epochs=3,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    # Logging / checkpointing: log every 10 steps, no external reporter,
    # no intermediate checkpoints.
    logging_steps=10,
    report_to="none",
    save_strategy="no",
)

# No eval_dataset is passed: this run only trains.
trainer = Trainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    train_dataset=full_dataset,
)

# Train
trainer_output = trainer.train()

Map:   0%|          | 0/10185 [00:00<?, ? examples/s]

Step,Training Loss
10,0.6921
20,0.6987
30,0.6912
40,0.6886
50,0.688
60,0.6861
70,0.6769
80,0.6679
90,0.6556
100,0.6127


In [10]:
# Persist the fine-tuned weights and the tokenizer files side by side so the
# directory can be reloaded with from_pretrained.
output_dir = "/kaggle/working/bert_finetuned-jigsaw"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('/kaggle/working/bert_finetuned-jigsaw/tokenizer_config.json',
 '/kaggle/working/bert_finetuned-jigsaw/special_tokens_map.json',
 '/kaggle/working/bert_finetuned-jigsaw/vocab.txt',
 '/kaggle/working/bert_finetuned-jigsaw/added_tokens.json')

In [11]:
# test_df['text_pair'] = test_df['body'] + tokenizer.sep_token + test_df['rule'] + tokenizer.sep_token
# ds_test = Dataset.from_pandas(test_df)
# ds_test = ds_test.map(tokenize_function, batched=True)
# ds_test = ds_test.remove_columns(["body", "rule", "subreddit", "positive_example_1", "positive_example_2", "negative_example_1", "negative_example_2"])

In [12]:
# model.eval()

# # make preds
# predictions = trainer.predict(ds_test)
# probabilities = torch.nn.functional.softmax(torch.from_numpy(predictions.predictions), dim=1)[:, 1].numpy()

# # save csv
# submission_df = pd.DataFrame({'row_id': test_df['row_id'], 'rule_violation': probabilities})
# submission_df.to_csv('submission.csv', index=False)