In [1]:
# Debug aid for the submission environment: print the version string of each
# core library, one "<name>: <version>" line per library.
import torch, torchvision, transformers

for label, module in (
    ("PyTorch", torch),
    ("Torchvision", torchvision),
    ("Transformers", transformers),
):
    print(f"{label}: {module.__version__}")

PyTorch: 2.6.0+cu124
Torchvision: 0.21.0+cu124
Transformers: 4.57.0.dev0


In [2]:
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from cleantext import clean

2025-09-10 05:06:26.927032: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757480787.133787      44 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757480787.192757      44 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Config
# Locations of the two competition CSV files. Both are loaded with
# pd.read_csv in the data-loading cell: the train file is expanded with its
# `body` / `rule_violation` columns, the test file only with its example columns.
TRAIN_PATH = '/kaggle/input/jigsaw-agile-community-rules/train.csv'
TEST_PATH = '/kaggle/input/jigsaw-agile-community-rules/test.csv'

In [4]:
# Helper function
def _labelled_text(text, labels):
    """Build a ``(text, labels)`` frame, dropping rows whose text is missing or blank.

    Parameters
    ----------
    text : pd.Series
        Raw text values; may contain NaN/None and whitespace-only strings.
    labels : int or pd.Series
        Either one label applied to every surviving row, or a Series aligned
        positionally with ``text`` from which the surviving rows' labels are taken.

    Returns
    -------
    pd.DataFrame
        Columns ``text`` (stripped strings) and ``labels``, with a fresh
        0..n-1 index.
    """
    text = text.reset_index(drop=True)
    # Drop NaN first, then coerce to str: `.str` raises on a float column,
    # which is what an entirely-missing column looks like.
    text = text[text.notna()].astype(str).str.strip()
    text = text[text != ""]

    if isinstance(labels, pd.Series):
        # Select labels only for the rows that survived the text filter, so a
        # missing label on a discarded row cannot break the int conversion.
        labels = labels.reset_index(drop=True).loc[text.index].astype(int).to_numpy()

    frame = pd.DataFrame({"text": text.to_numpy()})
    frame["labels"] = labels
    return frame


def _stack_examples(df, kind, label, n_examples):
    """Stack the ``{kind}_example_1..n`` columns of ``df`` into one labelled text frame."""
    cols = [f"{kind}_example_{i}" for i in range(1, n_examples + 1)]
    # melt concatenates the columns in order: all of *_1, then all of *_2, ...
    stacked = df.melt(value_vars=cols, value_name="example")["example"]
    return _labelled_text(stacked, label)


def expand(df, train=True, n_examples=2):
    """
    Vectorized expansion of original + positive + negative examples.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``positive_example_{i}`` and ``negative_example_{i}``
        columns for ``i`` in ``1..n_examples``. When ``train`` is true it must
        also contain ``body`` and ``rule_violation``.
    train : bool, default True
        If true, the ``body`` texts labelled with ``rule_violation`` are placed
        in front of the example rows; if false only the example rows are returned.
    n_examples : int, default 2
        Number of positive / negative example columns to read.

    Returns
    -------
    pd.DataFrame
        Columns ``text`` and ``labels`` (1 for positive examples, 0 for
        negative examples, ``rule_violation`` for bodies), re-indexed from 0.
        Rows whose text is missing or blank after stripping are dropped; this
        applies to ``body`` as well as to the example columns.
    """
    parts = [
        _stack_examples(df, "positive", 1, n_examples),
        _stack_examples(df, "negative", 0, n_examples),
    ]
    if train:
        # Original examples go first, matching the historical row order.
        parts.insert(0, _labelled_text(df["body"], df["rule_violation"]))

    return pd.concat(parts, ignore_index=True)

def cleaner(text):
    """
    Run a single text through ``clean`` with this notebook's fixed option set.

    URL, e-mail and phone-number replacement is switched on, using the
    placeholders ``<URL>``, ``<EMAIL>`` and ``<PHONE>``; unicode fixing and
    ASCII conversion are switched on; lower-casing, line-break removal and the
    number / digit / currency-symbol / punctuation options are switched off.
    Returns whatever ``clean`` returns for ``text``.
    """
    options = {
        # normalisation
        "fix_unicode": True,
        "to_ascii": True,
        "lower": False,
        "no_line_breaks": False,
        # what gets replaced or removed
        "no_urls": True,
        "no_emails": True,
        "no_phone_numbers": True,
        "no_numbers": False,
        "no_digits": False,
        "no_currency_symbols": False,
        "no_punct": False,
        # placeholder tokens
        "replace_with_url": "<URL>",
        "replace_with_email": "<EMAIL>",
        "replace_with_phone_number": "<PHONE>",
        "lang": "en",
    }
    return clean(text, **options)

In [5]:
# Load and melt data
df1, df2 = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

# The train file contributes bodies plus examples; the test file is expanded
# with train=False, so only its example columns are used.
exp_train, exp_test = expand(df1), expand(df2, False)
df = pd.concat([exp_train, exp_test], ignore_index=True)

# Clean data
tqdm.pandas(desc="cleaner")  # registers progress_apply with a "cleaner" bar
df['text'] = df['text'].progress_apply(cleaner)

# Split data
# 80/20 hold-out with a fixed seed.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

cleaner:   0%|          | 0/10185 [00:00<?, ?it/s]

In [6]:
# Load model
# The embedding model is read from a local directory rather than by hub name.
MODEL_PATH = '/kaggle/input/embeddinggemma/transformers/embeddinggemma-300m/1'
model = SentenceTransformer(MODEL_PATH)

# # Encode training examples as 'documents'
# train_embs = model.encode(
#     train_df["text"].tolist(),
#     task="document",
#     normalize_embeddings=True,
#     convert_to_numpy=True
# )

# # Encode test/validation as 'queries'
# val_embs = model.encode(
#     val_df["text"].tolist(),
#     task="query",
#     normalize_embeddings=True,
#     convert_to_numpy=True
# )

In [7]:
# # Fit Nearest Neighbors on training embeddings
# k = 8
# nn = NearestNeighbors(n_neighbors=k, metric="cosine")
# nn.fit(train_embs)

# # Predict probabilities for test set
# distances, indices = nn.kneighbors(val_embs, n_neighbors=k, return_distance=True)
# sims = 1 - distances
# neighbor_labels = train_df.iloc[indices.flatten()]["labels"].values.reshape(indices.shape)
# probs = (neighbor_labels * sims).sum(axis=1) / sims.sum(axis=1)

# # Evaluate
# y_true = val_df["labels"].values
# auc = roc_auc_score(y_true, probs)

# print("Predicted probs:", probs[:5])
# print("AUC:", auc)

In [8]:
# # Tune k
# candidate_k = [1,3,5,10,15,20,30]
# results = []
# for k in tqdm(candidate_k, desc='k-values'):
#     nn = NearestNeighbors(n_neighbors=k, metric="cosine")
#     nn.fit(train_embs)

#     distances, indices = nn.kneighbors(val_embs, n_neighbors=k, return_distance=True)
#     sims = 1 - distances
#     neighbor_labels = train_df.iloc[indices.flatten()]["labels"].values.reshape(indices.shape)
#     probs = (neighbor_labels * sims).sum(axis=1) / sims.sum(axis=1)

#     auc = roc_auc_score(val_df["labels"].values, probs)
#     results.append((k, auc))
#     print(f"  k={k}: AUC={auc:.4f}")

# best_k, best_auc = max(results, key=lambda x: x[1])
# print(f"\nBest k: {best_k} | AUC: {best_auc:.4f}")

In [9]:
# Prep test data
tqdm.pandas(desc="cleaner")
df2['text'] = df2['body'].progress_apply(cleaner)

# Generate full embeddings
# Every labelled text in `df` becomes an indexed vector; each test body is a query.
# NOTE(review): `task=` is forwarded to `encode` as-is. Confirm against the
# installed sentence-transformers version that it really applies the model's
# query/document prompts; the documented alternatives are `encode_query` /
# `encode_document` or `prompt_name=`.
full_embs = model.encode(
    df["text"].tolist(),
    task="document",
    normalize_embeddings=True,
    convert_to_numpy=True
)
test_embs = model.encode(
    df2['text'].tolist(),
    task="query",
    normalize_embeddings=True,
    convert_to_numpy=True
)

# knn init
# Cap k at the number of indexed rows: kneighbors raises when asked for more
# neighbours than were fitted.
k = min(10, len(full_embs))  # number of neighbors to consider
nn = NearestNeighbors(n_neighbors=k, metric="cosine")
nn.fit(full_embs)

distances, indices = nn.kneighbors(test_embs, n_neighbors=k, return_distance=True)
sims = 1 - distances  # cosine distance -> cosine similarity
# `indices` holds row positions into the fitted matrix, which was built from
# `df` in order, so positional indexing of the label array lines up.
neighbor_labels = df["labels"].to_numpy()[indices]

# Similarity-weighted vote over the k neighbours.
# Cosine similarity can be negative; a negative weight would push the score
# outside [0, 1], so weights are clipped at 0. If every neighbour ends up with
# zero weight, fall back to the plain mean of the neighbour labels instead of
# dividing by zero.
weights = np.clip(sims, 0, None)
weight_totals = weights.sum(axis=1)
has_weight = weight_totals > 0
probs = np.where(
    has_weight,
    (neighbor_labels * weights).sum(axis=1) / np.where(has_weight, weight_totals, 1),
    neighbor_labels.mean(axis=1),
)

cleaner:   0%|          | 0/10 [00:00<?, ?it/s]

Batches:   0%|          | 0/319 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
# Pair each test row_id with its predicted score and write the submission file.
submission_df = df2[['row_id']].assign(rule_violation=probs)
submission_df.to_csv('submission.csv', index=False)