In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the input directory,
# then print the paths one per line in the order os.walk yields them.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/modernbert-base-trained/pytorch/default/1/model/config.json
/kaggle/input/modernbert-base-trained/pytorch/default/1/model/trainer_state.json
/kaggle/input/modernbert-base-trained/pytorch/default/1/model/training_args.bin
/kaggle/input/modernbert-base-trained/pytorch/default/1/model/tokenizer.json
/kaggle/input/modernbert-base-trained/pytorch/default/1/model/tokenizer_config.json
/kaggle/input/modernbert-base-trained/pytorch/default/1/model/scaler.pt
/kaggle/input/modernbert-base-trained/pytorch/default/1/model/scheduler.pt
/kaggle/input/modernbert-base-trained/pytorch/default/1/model/model.safetensors
/kaggle/input/modernbert-base-trained/pytorch/default/1/model/special_tokens_map.json
/kaggle/input/modernbert-base-trained/pytorch/default/1/model/optimizer.pt
/kaggle/input/modernbert-base-trained/pytorch/default/1/model/rng_state.pth
/kaggle/input/jigsaw-agile-community-rules/sample_submission.csv
/kaggle/input/jigsaw-agile-community-rules/train.csv
/kaggle/input/jigsaw-a

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm

In [3]:
# Load the test split that predictions are generated for
test_csv_path = '/kaggle/input/jigsaw-agile-community-rules/test.csv'
test_df = pd.read_csv(test_csv_path)

# Combine text fields for each sample
def combine_text_fields(row):
    """Build the single model-input string for one sample.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by the keys
            'body', 'rule', 'positive_example_1', 'positive_example_2',
            'negative_example_1' and 'negative_example_2'.

    Returns:
        str: Six "<label>: <value>" lines joined by newlines, in the order
        comment, rule, positive examples 1-2, negative examples 1-2.
    """
    # (label shown in the prompt, key read from the row), in output order
    labelled_columns = (
        ("Comment", 'body'),
        ("Rule", 'rule'),
        ("Positive Example 1", 'positive_example_1'),
        ("Positive Example 2", 'positive_example_2'),
        ("Negative Example 1", 'negative_example_1'),
        ("Negative Example 2", 'negative_example_2'),
    )
    return "\n".join(f"{label}: {row[column]}" for label, column in labelled_columns)

# Apply the row-wise formatter to every sample and store the result as a new column
combined_text = test_df.apply(combine_text_fields, axis='columns')
test_df['combined_text'] = combined_text

In [4]:
# Create test dataset without labels for prediction
class TestDataset(Dataset):
    """Label-free dataset that tokenizes one text per item, for prediction only.

    Args:
        texts: Sized, indexable collection of samples; each item is converted
            with ``str()`` before being tokenized.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding='max_length',
            max_length=..., return_tensors='pt')``. Its result must be
            indexable by 'input_ids' and 'attention_mask', and those entries
            must provide ``.flatten()``.
        max_length: Length forwarded to the tokenizer for both truncation and
            padding (default 512).
    """

    def __init__(self, texts, tokenizer, max_length=512):
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.texts = texts

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return its flattened model inputs.

        Returns:
            dict: 'input_ids' and 'attention_mask', each the corresponding
            tokenizer output after ``.flatten()``.
        """
        encoded = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        # Only these two entries are forwarded; anything else the tokenizer
        # returns is dropped.
        return {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}

In [5]:
# The classifier weights and the tokenizer are both read from the same directory
model_dir = "/kaggle/input/modernbert-base-trained/pytorch/default/1/model"
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

2025-09-27 14:48:35.758717: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758984515.969364      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758984516.032260      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [6]:
# Wrap the combined texts (as a plain list) in the label-free dataset
combined_texts = test_df['combined_text'].tolist()
test_dataset = TestDataset(combined_texts, tokenizer)

# Custom prediction function
def predict_with_model(model, test_dataset, batch_size=16):
    """Run batched inference and return sigmoid probabilities in dataset order.

    The model is moved to CUDA when available (CPU otherwise) and switched to
    eval mode; both changes persist on the passed-in ``model`` after the call.

    Args:
        model: Callable accepting ``input_ids`` and ``attention_mask`` keyword
            arguments and returning an object with a ``logits`` tensor. It
            must also provide ``.to(device)`` and ``.eval()``.
        test_dataset: Dataset whose items are dicts containing 'input_ids'
            and 'attention_mask' tensors.
        batch_size: Number of samples per forward pass (default 16).

    Returns:
        list: One value per logit element, i.e. the sigmoid of every batch's
        logits flattened and concatenated; batches are read without shuffling.
    """
    target = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(target)
    model.eval()

    loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    all_predictions = []

    # Gradients are not needed for prediction
    with torch.no_grad():
        for batch in tqdm(loader, desc="Predicting"):
            outputs = model(
                input_ids=batch['input_ids'].to(target),
                attention_mask=batch['attention_mask'].to(target),
            )
            # Bring the probabilities back to host memory as a NumPy array
            batch_probs = torch.sigmoid(outputs.logits).cpu().numpy()
            all_predictions += list(batch_probs.flatten())

    return all_predictions

# Score every test sample with the loaded model (default batch size)
predictions = predict_with_model(model=model, test_dataset=test_dataset)

Predicting: 100%|██████████| 1/1 [00:18<00:00, 18.69s/it]


In [7]:
# Pair each row_id with its predicted probability and write the submission CSV
submission_columns = {
    'row_id': test_df['row_id'],
    'rule_violation': predictions,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)  # index is left out of the file
print("Submission file created successfully!")

Submission file created successfully!
