In [1]:
# %pip install clean-text 
# %pip install git+https://github.com/huggingface/transformers@v4.56.0-Embedding-Gemma-preview

In [2]:
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split, GroupKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import re
import torch
import pandas as pd
from tqdm.auto import tqdm
from cleantext import clean
from urllib.parse import urlparse

2025-09-16 15:52:20.747694: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758037940.978615      45 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758037941.045192      45 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Config
TRAIN_PATH = '/kaggle/input/jigsaw-agile-community-rules/train.csv'
TEST_PATH = '/kaggle/input/jigsaw-agile-community-rules/test.csv'

In [4]:
# Helper function
def make_input(rule, example):
    return f"Rule: {rule}\nComment: {example}"
    
# spam_words = pd.read_csv('/kaggle/input/spam-words/spam-words-EN.txt', header=None, names=['word'], index_col=False)
# words = spam_words['word'].values.tolist()

# SPAM_WORDS = [
#     "sex", "escort", "dating", "hookup", "porn", "cam", "xxx", "nude", "meet", "chat",
#     "buy", "cheap", "discount", "offer", "deal", "price", "order",
#     "free", "join", "sign up", "click", "visit"
# ]

# SPAM_WORDS += words
# URL_RE = re.compile(r'(https?://[^\s]+|www\.[^\s]+)', re.IGNORECASE)

# def extract_domain(text):
#     matches = URL_RE.findall(text)
#     domains = []
#     for m in matches:
#         try:
#             netloc = urlparse(m).netloc
#             if not netloc:  # if parsing fails, skip
#                 continue
#             if netloc.startswith("www."):
#                 netloc = netloc[4:]
#             domains.append(netloc.lower())
#         except Exception:
#             continue
#     return domains
    
# def extract_features(text):
#     text_lower = text.lower()
    
#     feats = {}
#     # feats["has_url"] = int("<URL>" in text or "http" in text)
#     # feats["num_urls"] = text.count("<URL>")
#     # feats["has_email"] = int("<EMAIL>" in text)
#     # feats["has_phone"] = int("<PHONE>" in text)
#     # feats["num_exclaims"] = text.count("!")
#     # feats["num_caps"] = sum(1 for w in text.split() if w.isupper() and len(w) > 1)
#     # feats["len_chars"] = len(text)
#     # feats["len_words"] = len(text.split())
#     # Spam word flags
#     feats["num_spam_words"] = sum(1 for w in SPAM_WORDS if w in text_lower)
#     # feats["has_spam_word"] = int(feats["num_spam_words"] > 0)
#     return feats
    
def expand(df, train=True):
    """
    Vectorized expansion of original + positive + negative examples
    """
    # Positive examples
    pos_cols = [f"positive_example_{i}" for i in range(1, 3)]
    pos_df = df[["rule", "subreddit"] + pos_cols].copy()
    
    # Melt to long format
    pos_df = pos_df.melt(id_vars=["rule", "subreddit"], value_vars=pos_cols, value_name="example")
    pos_df = pos_df.dropna(subset=["example"])
    pos_df = pos_df[pos_df["example"].str.strip() != ""]
    pos_df["text"] = pos_df.apply(lambda r: make_input(r["rule"], r["example"]), axis=1)
    pos_df["label"] = 1
    pos_df = pos_df[["text", "label", "subreddit"]]

    # Negative examples
    neg_cols = [f"negative_example_{i}" for i in range(1, 3)]
    neg_df = df[["rule", "subreddit"] + neg_cols].copy()
    neg_df = neg_df.melt(id_vars=["rule", "subreddit"], value_vars=neg_cols, value_name="example")
    neg_df = neg_df.dropna(subset=["example"])
    neg_df = neg_df[neg_df["example"].str.strip() != ""]
    neg_df["text"] = neg_df.apply(lambda r: make_input(r["rule"], r["example"]), axis=1)
    neg_df["label"] = 0
    neg_df = neg_df[["text", "label", "subreddit"]]

    if train:
        # Original examples
        original = pd.DataFrame({
            "text": df.apply(
                lambda r: f"Rule: {r['rule'].strip()}\nComment: {r['body'].strip()}",
                axis=1
            ),
            "label": df["rule_violation"].astype(int),
            "subreddit": df["subreddit"],
        })
    
        # Concatenate all
        expanded_df = pd.concat([original, pos_df, neg_df], ignore_index=True)

    else:
        expanded_df = pd.concat([pos_df, neg_df], ignore_index=True)

    # # Scan for domains
    # tqdm.pandas(desc="Extracting domains")
    # expanded_df["domains"] = expanded_df["text"].progress_apply(extract_domain)
    
    # Clean text
    tqdm.pandas(desc="Cleaner")
    expanded_df['text'] = expanded_df['text'].progress_apply(cleaner)
    
    # # --- Add engineered features ---
    # feat_dicts = expanded_df["text"].apply(extract_features)
    # feat_df = pd.DataFrame(list(feat_dicts))
    # expanded_df = pd.concat([expanded_df.reset_index(drop=True), feat_df.reset_index(drop=True)], axis=1)

    return expanded_df

def cleaner(text):
    return clean(
        text,
        fix_unicode=True,
        to_ascii=True,
        lower=False,
        no_line_breaks=False,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=False,
        no_punct=False,
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONE>",
        lang="en",
    )

In [5]:
# Load and melt data
df1 = pd.read_csv(TRAIN_PATH)
df2 = pd.read_csv(TEST_PATH)

exp_train = expand(df1)
exp_test = expand(df2, False)

df = pd.concat([exp_train, exp_test], ignore_index=True)

Cleaner:   0%|          | 0/10145 [00:00<?, ?it/s]

Cleaner:   0%|          | 0/40 [00:00<?, ?it/s]

In [6]:
df.loc[0 ,'text']

"Rule: No Advertising: Spam, referral links, unsolicited advertising, and promotional content are not allowed.\nComment: Banks don't want you to know this! Click here to know more!"

In [7]:
# # Explode the domains into long form
# exploded = df[["label", "domains"]].explode("domains").dropna()

# # Count pos/neg per domain
# domain_stats = (
#     exploded.groupby("domains")["label"]
#     .agg(["sum", "count"])
#     .rename(columns={"sum": "pos", "count": "total"})
# )
# domain_stats["spam_ratio"] = domain_stats["pos"] / (domain_stats["total"] + 1e-6)

In [8]:
# def compute_domain_ratios(df, domain_stats):
#     # Explode for lookup
#     exploded = df[["domains"]].explode("domains")

#     # Map each domain → spam_ratio
#     exploded["spam_ratio"] = exploded["domains"].map(domain_stats["spam_ratio"]).fillna(0.5)

#     # Aggregate back
#     agg = exploded.groupby(level=0)["spam_ratio"].agg(
#         avg_domain_spam_ratio="mean",
#         max_domain_spam_ratio="max"
#     ).fillna(0)

#     return df.join(agg, how="left").fillna(0)

# df = compute_domain_ratios(df, domain_stats)

In [9]:
# # Assume expanded_df has ["labels", "has_url", "num_exclaims", "len_words", "num_spam_words", "has_spam_word"]
# feature_cols = [c for c in df.columns if c not in ["text", "label", "domains", "rule", "subreddit", 'example']]

# corrs = df[["label"] + feature_cols].corr()["label"].sort_values(ascending=False)
# print(corrs)

In [10]:
# Split data
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [11]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model_id = "/kaggle/input/embeddinggemma/transformers/embeddinggemma-300m/1"
model = SentenceTransformer(model_id).to(device=device)

print(f"Device: {model.device}")
print(model)
print("Total number of parameters in the model:", sum([p.numel() for _, p in model.named_parameters()]))

Device: cuda:0
SentenceTransformer(
  (0): Transformer({'max_seq_length': 2048, 'do_lower_case': False}) with Transformer model: Gemma3TextModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Dense({'in_features': 768, 'out_features': 3072, 'bias': False, 'activation_function': 'torch.nn.modules.linear.Identity'})
  (3): Dense({'in_features': 3072, 'out_features': 768, 'bias': False, 'activation_function': 'torch.nn.modules.linear.Identity'})
  (4): Normalize()
)
Total number of parameters in the model: 307581696


In [12]:
# Encode train and val sets
train_embs = model.encode(
    train_df["text"].tolist(),
    truncate_dim=256,
    prompt_name='Classification',
    normalize_embeddings=True,
    convert_to_numpy=True
)
train_labels = train_df["label"].values

val_embs = model.encode(
    val_df["text"].tolist(),
    truncate_dim=256,
    prompt_name='Classification',
    normalize_embeddings=True,
    convert_to_numpy=True
)
val_labels = val_df["label"].values

# Encode YES/NO labels
label_texts = ["YES", "NO"]
label_embs = model.encode(
    label_texts,
    truncate_dim=256,
    prompt_name='Classification',
    normalize_embeddings=True,
    convert_to_numpy=True
)

Batches:   0%|          | 0/255 [00:00<?, ?it/s]

Batches:   0%|          | 0/64 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [13]:
# sims = model.similarity(train_embs, label_embs)
# p_yes = torch.softmax(sims, dim=1)[:, 0].numpy()

# auc_zero_shot = roc_auc_score(train_df["label"].values, p_yes)
# print("Zero-shot cosine AUC:", auc_zero_shot)

# # print("Shape:", sims.shape)
# # print("Sample values:\n", sims)
# # print("Range: min =", sims.min(), ", max =", sims.max())

In [14]:
from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import GridSearchCV

# #hparam-grid
# param_grid = [
#     # L1 penalty
#     {
#         'penalty': ['l1'],
#         'C': [0.001, 0.01, 0.1, 1.0],
#         'solver': ['liblinear', 'saga']  # Required for L1
#     },
#     # L2 penalty  
#     {
#         'penalty': ['l2'],
#         'C': [0.001, 0.01, 0.1, 1.0],
#         'solver': ['liblinear', 'lbfgs', 'saga']  # L2 works with more solvers
#     },
#     # ElasticNet penalty
#     {
#         'penalty': ['elasticnet'],
#         'C': [0.001, 0.01, 0.1, 1.0],
#         'l1_ratio': [0.1, 0.5, 0.9],
#         'solver': ['saga']  # Only solver that supports elasticnet
#     }
# ]

# grid_search = GridSearchCV(
#     LogisticRegression(max_iter=1000),
#     param_grid, 
#     cv=5, 
#     scoring='roc_auc',
#     n_jobs=-1
# )

# # Fit on training data - GridSearchCV handles CV internally
# grid_search.fit(train_embs, train_labels)

# # Get best model
# best_model = grid_search.best_estimator_
# print(f"Best params: {grid_search.best_params_}")
# print(f"Best CV score: {grid_search.best_score_}")

# # Evaluate on your separate validation set
# val_preds = best_model.predict_proba(val_embs)[:, 1]
# val_auc = roc_auc_score(val_labels, val_preds)
# print(f"Validation AUC: {val_auc}")

In [15]:
# # Best LR
# best_model = LogisticRegression(
#     penalty='l1',  # Combines L1 + L2
#     C=1.0,               # Strong regularization (lower = stronger)
#     max_iter=2000,        # Increase for convergence
#     solver='saga'         # Required for elasticnet
# )
# best_model.fit(train_embs, train_labels)

In [16]:
# Strong regularization setup
lr = LogisticRegression(
    penalty='elasticnet',  # Combines L1 + L2
    C=1,               # Strong regularization (lower = stronger)
    l1_ratio=0.5,         # Balance between L1 and L2
    max_iter=2000,        # Increase for convergence
    solver='saga'         # Required for elasticnet
)

# # CV score
# from sklearn.model_selection import cross_val_score

# cv_score = cross_val_score(lr, train_embs, train_labels, cv=5, scoring='roc_auc', n_jobs=-1)
# print(f"Average CV score: {np.mean(cv_score):.4f}")

In [17]:
# Fit strict model
lr.fit(train_embs, train_labels)

In [18]:
# import optuna
# import lightgbm as lgb
# from lightgbm import LGBMClassifier
# from sklearn.model_selection import StratifiedKFold
# import numpy as np

# def objective(trial):
#     param = {
#         "objective": "binary",
#         "metric": "auc",
#         "boosting_type": "gbdt",  # could try 'dart' later
#         "verbosity": -1,
#         "n_jobs": -1,
        
#         # Regularization knobs
#         "num_leaves": trial.suggest_int("num_leaves", 16, 128),  # smaller → less overfit
#         "max_depth": trial.suggest_int("max_depth", 3, 8),
#         "min_child_samples": trial.suggest_int("min_child_samples", 20, 100),
#         "min_child_weight": trial.suggest_float("min_child_weight", 1e-3, 10.0, log=True),
        
#         # Sampling
#         "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
#         "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
#         "bagging_freq": trial.suggest_int("bagging_freq", 1, 5),
        
#         # Learning
#         "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.2, log=True),
#         "n_estimators": trial.suggest_int("n_estimators", 500, 2000),
        
#         # Regularization penalties
#         "lambda_l1": trial.suggest_float("lambda_l1", 0.0, 10.0),
#         "lambda_l2": trial.suggest_float("lambda_l2", 0.0, 10.0),
#     }

#     # CV setup
#     cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#     aucs = []

#     for train_idx, val_idx in cv.split(train_embs, train_labels):
#         X_train, X_val = train_embs[train_idx], train_embs[val_idx]
#         y_train, y_val = train_labels[train_idx], train_labels[val_idx]

#         dtrain = lgb.Dataset(X_train, label=y_train)
#         dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)

#         model = LGBMClassifier(**param)
#         model.fit(
#             X_train, y_train,
#             eval_set=[(X_val, y_val)],
#             eval_metric="auc",
#             callbacks=[lgb.early_stopping(50, first_metric_only=True, verbose=False)],
#         )
#         aucs.append(model.best_score_["valid_0"]["auc"])

#     return np.mean(aucs)

# # --- Run study ---
# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=10, show_progress_bar=True)

# print("Best trial:")
# trial = study.best_trial
# print("  Value: ", trial.value)
# print("  Params: ", trial.params)

In [19]:
# Test embedding
test_df = df2[['row_id', 'body', 'rule']]
tqdm.pandas(desc="Craft Prompt")
test_df['text'] = test_df.progress_apply(lambda r: make_input(r["rule"], r["body"]), axis=1)
tqdm.pandas(desc='Cleaner')
test_df['text'] = test_df['text'].progress_apply(cleaner)
test_df = test_df[['row_id', 'text']]

Craft Prompt:   0%|          | 0/10 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['text'] = test_df.progress_apply(lambda r: make_input(r["rule"], r["body"]), axis=1)


Cleaner:   0%|          | 0/10 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['text'] = test_df['text'].progress_apply(cleaner)


In [20]:
test_embs = model.encode(
    test_df['text'].tolist(),
    prompt_name="Classification",
    truncate_dim=256,
    normalize_embeddings=True,
    convert_to_numpy=True
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [21]:
# Generate probs
probs_test = lr.predict_proba(test_embs)[:, 1]   # probability of YES

submission = pd.DataFrame({
    "row_id": test_df["row_id"],       # or whatever ID column exists
    "rule_violation": probs_test   # or binary (0/1) if required
})
submission.to_csv("submission.csv", index=False)