In [1]:
!pip install transformers datasets accelerate torch

Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-1

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jigsaw-agile-community-rules/sample_submission.csv
/kaggle/input/jigsaw-agile-community-rules/train.csv
/kaggle/input/jigsaw-agile-community-rules/test.csv


In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig, TrainingArguments, Trainer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import shutil
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from accelerate import PartialState
from transformers import EarlyStoppingCallback

2025-09-28 13:54:56.986876: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759067697.325496      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759067697.420897      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
distributed_state = PartialState()

In [5]:
# Load data
train_df = pd.read_csv('/kaggle/input/jigsaw-agile-community-rules/train.csv')

# Combine text fields for each sample
def combine_text_fields(row):
    return f"Comment: {row['body']}\nRule: {row['rule']}\nPositive Example 1: {row['positive_example_1']}\nPositive Example 2: {row['positive_example_2']}\nNegative Example 1: {row['negative_example_1']}\nNegative Example 2: {row['negative_example_2']}"

train_df['combined_text'] = train_df.apply(combine_text_fields, axis=1)

# Prepare data for k-fold
texts = train_df['combined_text'].tolist()
labels = train_df['rule_violation'].tolist()

In [6]:
class RedditDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=1024):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        inputs = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [7]:
# Initialize tokenizer
model_name = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

In [8]:
config = AutoConfig.from_pretrained(model_name)
config.hidden_dropout_prob = 0.3
config.attention_probs_dropout_prob = 0.3
config.num_labels = 2
config.problem_type = "single_label_classification"

config.json: 0.00B [00:00, ?B/s]

In [9]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = torch.softmax(torch.tensor(logits), dim=-1)[:, 1].numpy()
    auc = roc_auc_score(labels, probs)
    return {"auc": auc}

In [10]:
class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.training_losses = []
        self.validation_losses = []
    
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        
        labels = labels.long()
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, 2), labels.view(-1))
        
        self.training_losses.append(loss.item())
        
        return (loss, outputs) if return_outputs else loss
    
    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
        eval_output = super().evaluate(eval_dataset, ignore_keys, metric_key_prefix)
        
        if "eval_loss" in eval_output:
            self.validation_losses.append(eval_output["eval_loss"])
        
        return eval_output

In [11]:
# Analysis function
def analyze_kfold_performance(all_fold_results, train_texts_fold, training_args):
    """
    Analyze model performance across k-fold cross-validation to detect underfitting/overfitting
    """
    print("\n" + "="*60)
    print("K-FOLD CROSS-VALIDATION ANALYSIS")
    print("="*60)
    
    # Extract AUC scores
    auc_scores = [result['auc'] for result in all_fold_results]
    mean_auc = np.mean(auc_scores)
    std_auc = np.std(auc_scores)
    
    print(f"Mean AUC: {mean_auc:.4f} ± {std_auc:.4f}")
    print(f"Individual Fold AUCs: {[f'{auc:.4f}' for auc in auc_scores]}")
    
    # Analyze loss curves
    max_epochs = max(len(result['val_loss']) for result in all_fold_results)
    avg_train_loss = np.zeros(max_epochs)
    avg_val_loss = np.zeros(max_epochs)
    epoch_counts = np.zeros(max_epochs)
    
    # Calculate average losses per epoch
    for result in all_fold_results:
        train_losses = result['train_loss']
        val_losses = result['val_loss']
        
        # Group training losses by epoch
        epoch_length = len(train_texts_fold) // training_args.per_device_train_batch_size
        train_losses_per_epoch = [train_losses[i:i+epoch_length] for i in range(0, len(train_losses), epoch_length)]
        
        for epoch, (train_epoch_losses, val_loss) in enumerate(zip(train_losses_per_epoch, val_losses)):
            if epoch < max_epochs:
                avg_train_loss[epoch] += np.mean(train_epoch_losses)
                avg_val_loss[epoch] += val_loss
                epoch_counts[epoch] += 1
    
    # Calculate averages
    for epoch in range(max_epochs):
        if epoch_counts[epoch] > 0:
            avg_train_loss[epoch] /= epoch_counts[epoch]
            avg_val_loss[epoch] /= epoch_counts[epoch]
    
    # Find best epoch
    best_epoch = np.argmin(avg_val_loss)
    best_train_loss = avg_train_loss[best_epoch]
    best_val_loss = avg_val_loss[best_epoch]
    
    print(f"\nBest Performance at Epoch {best_epoch+1}:")
    print(f"Training Loss: {best_train_loss:.4f}")
    print(f"Validation Loss: {best_val_loss:.4f}")
    print(f"Loss Gap (Val - Train): {best_val_loss - best_train_loss:.4f}")
    
    # Analyze underfitting/overfitting
    print("\n" + "-"*60)
    print("MODEL STATUS ANALYSIS")
    print("-"*60)
    
    # Check for underfitting
    if best_train_loss > 0.5 and best_val_loss > 0.5:
        print("Status: UNDERFITTING")
        print("Reasons: Both training and validation losses are high")
        print("Recommendations:")
        print("- Increase model capacity (use larger model)")
        print("- Train for more epochs")
        print("- Reduce regularization")
        print("- Increase learning rate")
    
    # Check for overfitting
    elif (best_val_loss - best_train_loss) > 0.1:
        print("Status: OVERFITTING")
        print("Reasons: Validation loss is significantly higher than training loss")
        print("Recommendations:")
        print("- Add more dropout")
        print("- Increase weight decay")
        print("- Use early stopping with lower patience")
        print("- Add data augmentation")
        print("- Reduce model complexity")
    
    # Check for good fit
    elif best_train_loss < 0.4 and best_val_loss < 0.4 and abs(best_val_loss - best_train_loss) < 0.05:
        print("Status: GOOD FIT")
        print("Reasons: Both losses are low and close to each other")
        print("Recommendations:")
        print("- Continue monitoring for overfitting")
        print("- Consider fine-tuning hyperparameters")
        print("- Try ensemble methods for further improvement")
    
    # Check for under-optimized
    else:
        print("Status: UNDER-OPTIMIZED")
        print("Reasons: Model has potential but needs more optimization")
        print("Recommendations:")
        print("- Train for more epochs")
        print("- Adjust learning rate")
        print("- Try different optimizers")
        print("- Tune hyperparameters more carefully")
    
    # Analyze consistency across folds
    print("\n" + "-"*60)
    print("FOLD CONSISTENCY ANALYSIS")
    print("-"*60)
    
    if std_auc < 0.02:
        print("Consistency: VERY HIGH")
        print("The model performs consistently across all folds")
    elif std_auc < 0.05:
        print("Consistency: HIGH")
        print("The model performs quite consistently across folds")
    elif std_auc < 0.1:
        print("Consistency: MODERATE")
        print("There's some variation in performance across folds")
    else:
        print("Consistency: LOW")
        print("Performance varies significantly across folds")
        print("Recommendations:")
        print("- Check for data distribution issues")
        print("- Consider stratified sampling")
        print("- Increase training data")
    
    # Plot loss curves
    plt.figure(figsize=(12, 6))
    sns.set(style="whitegrid")
    
    epochs = range(1, max_epochs + 1)
    sns.lineplot(x=epochs, y=avg_train_loss, label='Average Training Loss', marker='o', linewidth=2)
    sns.lineplot(x=epochs, y=avg_val_loss, label='Average Validation Loss', marker='s', linewidth=2)
    
    # Mark best epoch
    plt.axvline(x=best_epoch+1, color='red', linestyle='--', alpha=0.7, label=f'Best Epoch ({best_epoch+1})')
    
    plt.title('Average Training vs Validation Loss Across Folds', fontsize=16)
    plt.xlabel('Epoch', fontsize=14)
    plt.ylabel('Loss', fontsize=14)
    plt.xticks(epochs)
    plt.legend(fontsize=12)
    plt.tight_layout()
    plt.savefig('./loss_analysis.png')
    plt.show()
    
    # Plot individual fold performances
    plt.figure(figsize=(12, 6))
    sns.set(style="whitegrid")
    
    fold_nums = [result['fold'] for result in all_fold_results]
    fold_aucs = [result['auc'] for result in all_fold_results]
    
    sns.barplot(x=fold_nums, y=fold_aucs, palette='viridis')
    plt.axhline(y=mean_auc, color='red', linestyle='--', label=f'Mean AUC ({mean_auc:.4f})')
    
    plt.title('AUC Scores Across Folds', fontsize=16)
    plt.xlabel('Fold', fontsize=14)
    plt.ylabel('AUC', fontsize=14)
    plt.ylim(0.5, 1.0)  # AUC range
    plt.legend(fontsize=12)
    plt.tight_layout()
    plt.savefig('./fold_performance.png')
    plt.show()
    
    return {
        'mean_auc': mean_auc,
        'std_auc': std_auc,
        'best_epoch': best_epoch,
        'best_train_loss': best_train_loss,
        'best_val_loss': best_val_loss,
        'loss_gap': best_val_loss - best_train_loss
    }

In [12]:
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Track best model
best_auc = 0
best_fold = None
best_model_path = None

# Store results for analysis
all_fold_results = []

# Create output directory
os.makedirs("./results", exist_ok=True)

In [None]:
for fold, (train_idx, val_idx) in enumerate(skf.split(texts, labels)):
    print(f"\n{'='*50}")
    print(f"Training Fold {fold+1}/{n_splits}")
    print(f"{'='*50}")
    
    # Split data for this fold
    train_texts_fold = [texts[i] for i in train_idx]
    train_labels_fold = [labels[i] for i in train_idx]
    val_texts_fold = [texts[i] for i in val_idx]
    val_labels_fold = [labels[i] for i in val_idx]
    
    # Create datasets
    train_dataset = RedditDataset(train_texts_fold, train_labels_fold, tokenizer)
    val_dataset = RedditDataset(val_texts_fold, val_labels_fold, tokenizer)
    
    # Initialize model for this fold
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        config=config
    )
    
    # Set up output directory for this fold
    output_dir = f"./results/fold_{fold}"
    
    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        gradient_accumulation_steps=2,
        learning_rate=1e-5,
        warmup_ratio=0.1,
        weight_decay=0.1,
        max_grad_norm=1.0,
        lr_scheduler_type="cosine",
        logging_dir=f"{output_dir}/logs",
        logging_steps=100,
        eval_strategy="epoch",
        save_strategy="best",
        save_total_limit=1,   
        load_best_model_at_end=True,
        metric_for_best_model="auc",
        greater_is_better=True,
        report_to="none",
        fp16=True,
        bf16=False,
        local_rank=-1,
        ddp_find_unused_parameters=False,
    )
    
    # Initialize trainer
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
    )
    
    # Train the model
    trainer.train()
    
    # Evaluate to get the AUC for this fold
    eval_result = trainer.evaluate()
    fold_auc = eval_result["eval_auc"]
    print(f"Fold {fold+1} AUC: {fold_auc:.4f}")
    
    # Store results
    all_fold_results.append({
        'fold': fold+1,
        'auc': fold_auc,
        'train_loss': trainer.training_losses,
        'val_loss': trainer.validation_losses
    })
    
    # If this fold's AUC is the best, save this model
    if fold_auc > best_auc:
        best_auc = fold_auc
        best_fold = fold
        
        # Remove previous best model if exists
        if best_model_path and os.path.exists(best_model_path):
            shutil.rmtree(best_model_path)
        
        # Save the best model
        best_model_path = "./best_model"
        trainer.save_model(best_model_path)
        tokenizer.save_pretrained(best_model_path)
        
        print(f"New best model found! Fold {fold+1} with AUC: {fold_auc:.4f}")
    
    # Clean up fold directory to save space
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)

print(f"\n{'='*50}")
print(f"K-Fold Cross-Validation Complete")
print(f"{'='*50}")
print(f"Best Fold: {best_fold+1}")
print(f"Best AUC: {best_auc:.4f}")
print(f"Best model saved to: {best_model_path}")


Training Fold 1/5


model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)
W0928 13:55:32.499000 36 torch/_inductor/utils.py:1137] [1/0] Not enough SMs to use max_autotune_gemm mode


Epoch,Training Loss,Validation Loss,Auc
1,0.7005,0.678283,0.679769
2,0.6723,0.614264,0.772646
3,0.5305,0.559631,0.801383
4,0.4317,0.71915,0.824697
5,0.2624,1.094878,0.826456
6,0.1771,1.570823,0.817621


Fold 1 AUC: 0.8176
New best model found! Fold 1 with AUC: 0.8176


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training Fold 2/5


  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Auc
1,0.6993,0.685863,0.649417
2,0.6138,0.592927,0.739867
3,0.4849,0.598645,0.775085
4,0.3465,0.731315,0.785813
5,0.2364,1.483401,0.774333


Fold 2 AUC: 0.7743


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training Fold 3/5


  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Auc
1,0.7056,0.675268,0.617913
2,0.6296,0.588386,0.774612
3,0.5059,0.552506,0.812779
4,0.3525,0.71807,0.829854


In [None]:
# Analyze performance
analysis_results = analyze_kfold_performance(all_fold_results, train_texts_fold, training_args)

# Print summary
print("\n" + "="*60)
print("FINAL SUMMARY")
print("="*60)
print(f"Best Model: Fold {best_fold+1}")
print(f"Best AUC: {best_auc:.4f}")
print(f"Mean AUC across folds: {analysis_results['mean_auc']:.4f}")
print(f"Best Epoch: {analysis_results['best_epoch']+1}")
print(f"Loss Gap: {analysis_results['loss_gap']:.4f}")

In [None]:
# Save analysis results
with open('./analysis_results.txt', 'w') as f:
    f.write(f"Best Fold: {best_fold+1}\n")
    f.write(f"Best AUC: {best_auc:.4f}\n")
    f.write(f"Mean AUC: {analysis_results['mean_auc']:.4f}\n")
    f.write(f"Std AUC: {analysis_results['std_auc']:.4f}\n")
    f.write(f"Best Epoch: {analysis_results['best_epoch']+1}\n")
    f.write(f"Best Training Loss: {analysis_results['best_train_loss']:.4f}\n")
    f.write(f"Best Validation Loss: {analysis_results['best_val_loss']:.4f}\n")
    f.write(f"Loss Gap: {analysis_results['loss_gap']:.4f}\n")