In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree lazily and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm
import re

In [None]:
# URL keyword extraction function (same as in training)
def url_to_semantics(text):
    """
    Summarise the URLs found in ``text`` as a short keyword string.

    For every regex match this emits ``domain:<label>`` (the host text before
    its first dot, unless that label is ``www``/``http``/``https``) and, when
    usable, ``path:<segment>`` for the first path segment that is longer than
    two characters, not purely numeric and not a bare extension word such as
    ``html``. Duplicate keywords are dropped, keeping first-seen order.

    Note: the pattern requires a ``/`` somewhere after the host, so a bare
    ``https://example.com`` with no later slash produces no keywords. The
    notebook states this function matches the one used in training, so its
    output should stay in sync with that preprocessing.

    Args:
        text: Raw comment text; non-string values yield an empty result.

    Returns:
        ``"URL Keywords: "`` followed by the space-joined keywords, or ``""``
        when nothing was extracted.
    """
    if not isinstance(text, str):
        return ""

    link_re = re.compile(
        r'https?://'  # scheme
        r'(?:www\.)?'  # a leading "www." is consumed, not captured
        r'([^/?]+)'  # group 1: host text
        r'(?:[^/]*)'  # anything else before the first slash
        r'(/[^/?]*)'  # group 2: first path segment, slash included
    )
    ignored_hosts = ('www', 'http', 'https')
    ignored_segments = ('jpg', 'jpeg', 'png', 'gif', 'html', 'php', 'asp', 'aspx')

    found = []
    for host, first_path in link_re.findall(text):
        label = host.split('.')[0]
        if label and label not in ignored_hosts:
            found.append(f"domain:{label}")

        # A path of just "/" carries no information.
        if len(first_path) <= 1:
            continue

        # At most one path keyword per URL: the first segment that qualifies.
        segment = next(
            (
                piece
                for piece in first_path.strip('/').split('/')
                if len(piece) > 2
                and not piece.isdigit()
                and piece.lower() not in ignored_segments
            ),
            None,
        )
        if segment is not None:
            found.append(f"path:{segment}")

    # dict keys keep insertion order, giving an order-preserving de-duplication.
    distinct = list(dict.fromkeys(found))
    return ("URL Keywords: " + " ".join(distinct)) if distinct else ""

In [None]:
# Load test data
test_df = pd.read_csv('/kaggle/input/jigsaw-agile-community-rules/test.csv')


def _body_with_url_keywords(body):
    """Return the comment body followed by a space and its URL keywords.

    A missing body (NaN/None read from the CSV) is treated as an empty string
    so that a single blank row cannot abort inference with a TypeError on
    ``body + " "``. String bodies produce exactly the same text as before.
    """
    body = "" if pd.isna(body) else str(body)
    return body + " " + url_to_semantics(body)


# Apply URL keyword extraction
test_df["body_with_url"] = test_df["body"].apply(_body_with_url_keywords)

# Create combined text with rule and body_with_url.
# fillna("") keeps a missing rule from turning the whole combined string into NaN.
test_df['combined_text'] = test_df["rule"].fillna("") + "[SEP]" + test_df["body_with_url"]

In [None]:
# Create test dataset
class TestDataset(Dataset):
    """Dataset that tokenizes one text per item, on access.

    Every item is truncated and padded to ``max_length`` and returned as a
    dict holding flattened ``input_ids`` and ``attention_mask`` tensors.
    """

    def __init__(self, texts, tokenizer, max_length=512):
        """Store the texts, the tokenizer callable and the padded length."""
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.texts = texts

    def __len__(self):
        """Number of texts available."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text (coerced to ``str``) into 1-D tensors."""
        encoded = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        # The tokenizer output is flattened so each field is 1-D per sample.
        wanted_fields = ('input_ids', 'attention_mask')
        return {field: encoded[field].flatten() for field in wanted_fields}

In [None]:
# Model weights and tokenizer are loaded from the same saved directory.
MODEL_DIR = "/kaggle/input/modernbert-base-trained/pytorch/default/3/model"
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

In [None]:
# Wrap the combined rule/body strings in a dataset for batched inference
combined_texts = test_df['combined_text'].tolist()
test_dataset = TestDataset(combined_texts, tokenizer)

# Prediction function
def predict_with_model(model, test_dataset, batch_size=16):
    """Run batched inference and collect the class-1 probability per sample.

    The model is moved to CUDA when available (otherwise CPU) and switched to
    eval mode; batches are read in dataset order with gradients disabled.

    Args:
        model: Model called with ``input_ids``/``attention_mask`` whose output
            exposes ``.logits``.
        test_dataset: Dataset yielding dicts with ``input_ids`` and
            ``attention_mask`` tensors.
        batch_size: Number of samples per forward pass.

    Returns:
        A list with one softmax probability (column index 1 of the logits)
        per sample, in dataset order.
    """
    use_gpu = torch.cuda.is_available()
    device = torch.device("cuda" if use_gpu else "cpu")
    model.to(device)
    model.eval()

    loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    positive_probs = []

    with torch.no_grad():
        for batch in tqdm(loader, desc="Predicting"):
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            ).logits

            # Softmax over the class axis; keep only the class-1 column.
            class_probs = torch.softmax(logits, dim=-1)
            positive_probs.extend(class_probs[:, 1].cpu().numpy())

    return positive_probs

# Score every test row with the loaded model
predictions = predict_with_model(model=model, test_dataset=test_dataset)

In [None]:
# Assemble the two submission columns, then write them without the index
submission_columns = {
    'row_id': test_df['row_id'],
    'rule_violation': predictions,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)
print("Submission file created successfully!")