# model1 


In [None]:
# Install the pinned dependencies offline: --no-index disables the package index and
# --find-links points uv at the wheel directory of the attached Kaggle dataset.
!uv pip install --system --no-index --find-links='/kaggle/input/jigsaw-packages2/whls/' 'trl==0.21.0' 'optimum==1.27.0' 'auto-gptq==0.7.1' 'bitsandbytes==0.46.1' 'logits-processor-zoo==0.2.1' 'vllm==0.10.0'
!uv pip install --system --no-index --find-links='/kaggle/input/jigsaw-packages2/whls/' 'deepspeed==0.17.4' -q
!uv pip install --system --no-index --find-links='/kaggle/input/jigsaw-packages2/whls/' 'triton==3.2.0'
!uv pip install --system --no-index --find-links='/kaggle/input/jigsaw-packages2/whls/' 'clean-text'
# Upgrade (-U) peft/accelerate/datasets from the same wheel directory without touching
# their dependencies (--no-deps).
!uv pip install --system --no-index -U --no-deps --find-links='/kaggle/input/jigsaw-packages2/whls/' 'peft' 'accelerate' 'datasets'

In [None]:
%%writefile constants.py

# Shared configuration for utils.py, train.py and inference.py (all of them do
# `from constants import *`).

# Passed to deterministic.init_all() below and used as random_state when
# utils.get_df() samples test rows.
seed = 0

# Base model directory; train.py loads it for fine-tuning and inference.py serves it with vLLM.
base_model_path = "/kaggle/input/jigsaw-pretrain-public/pytorch/llama-3.2-3b-instruct/1"
# Optional LoRA adapter that train.py merges into the base model before training (None = skip).
pretrain_lora_path = None
# Where train.py saves the trained adapter and where inference.py loads it from.
lora_path = "/kaggle/working/pseudo_lora"
# True when the model path contains "gptq"; switches both loading and vLLM quantization mode.
use_gptq = "gptq" in base_model_path

# Completion strings for rule_violation == 1 / 0; also the two choices the
# inference logits processor is restricted to.
positive = "Yes"
negative = "No"
# Marker placed before each answer in the prompt (see utils.build_prompt).
judge_words = "Violation:"
# First lines of every prompt built by utils.build_prompt.
system_prompt = '''You are given a comment from reddit and a rule. 
Your task is to classify whether the comment violates the rule. 
Only respond Yes/No.'''

# Fraction of test.csv rows sampled per rule in utils.get_df().
frac = 0.05
# When True, utils.get_df() also includes the labelled rows of train.csv.
use_train = True

import kagglehub

# NOTE(review): `deterministic` is an external Kaggle package; presumably init_all()
# seeds the RNGs used downstream -- confirm against that package.
deterministic = kagglehub.package_import('wasupandceacar/deterministic').deterministic
deterministic.init_all(seed)

In [None]:
%%writefile utils.py

import numpy as np
import pandas as pd
from datasets import Dataset
from constants import *

def build_prompt(row):
    """Render the few-shot classification prompt for one example.

    Args:
        row: Mapping (e.g. a DataFrame row) providing ``subreddit``, ``rule``,
            ``positive_example``, ``negative_example`` and ``body``.

    Returns:
        The prompt text. It ends with ``judge_words`` so that the next generated
        token is the answer.
    """
    # The example answers use the `positive` / `negative` constants rather than
    # literal "Yes"/"No", so they always match the training completions and the
    # choices allowed at inference time.
    return f"""{system_prompt}
Subreddit: r/{row["subreddit"]}
Rule: {row["rule"]}
Examples:
1) {row["positive_example"]}
{judge_words} {positive}
2) {row["negative_example"]}
{judge_words} {negative}
Comment: {row["body"]}
{judge_words}"""

def _pick_random_example(frame, kind):
    """Choose, per row, between ``{kind}_example_1`` and ``{kind}_example_2`` by coin flip."""
    take_first = np.random.rand(len(frame)) < 0.5
    return np.where(take_first, frame[f"{kind}_example_1"], frame[f"{kind}_example_2"])


def get_df():
    """Assemble the training frame from train.csv and pseudo-labelled test examples.

    When ``use_train`` is set, the labelled rows of train.csv are included with one
    randomly chosen positive and negative example each. In addition, a ``frac``
    sample of test.csv (per rule) is expanded: every few-shot example becomes a
    training body, labelled 1 for positive examples and 0 for negative ones, with
    its sibling example of the same kind used as the in-prompt example.

    Returns:
        A de-duplicated DataFrame with columns ``body``, ``rule``, ``subreddit``,
        ``rule_violation``, ``positive_example`` and ``negative_example``.
    """
    example_cols = ["positive_example_1", "positive_example_2",
                    "negative_example_1", "negative_example_2"]
    frames = []

    if use_train:
        raw_train = pd.read_csv("/kaggle/input/jigsaw-agile-community-rules/train.csv")
        train_part = raw_train[["body", "rule", "subreddit", "rule_violation", *example_cols]].copy()
        train_part["positive_example"] = _pick_random_example(train_part, "positive")
        train_part["negative_example"] = _pick_random_example(train_part, "negative")
        train_part.drop(columns=example_cols, inplace=True)
        frames.append(train_part)

    raw_test = pd.read_csv("/kaggle/input/jigsaw-agile-community-rules/test.csv")
    sampled = (
        raw_test.groupby('rule', group_keys=False)
        .apply(lambda x: x.sample(frac=frac, random_state=seed))
        .reset_index(drop=True)
    )
    print(f"Select {len(sampled)} test data")

    for kind, label in (("positive", 1), ("negative", 0)):
        opposite = "negative" if kind == "positive" else "positive"
        for idx in (1, 2):
            part = sampled[["rule", "subreddit", *example_cols]].copy()
            # Example `idx` becomes the comment to classify; its sibling (3 - idx)
            # stays in the prompt as the same-kind example.
            part["body"] = part[f"{kind}_example_{idx}"]
            part[f"{kind}_example"] = part[f"{kind}_example_{3 - idx}"]
            part[f"{opposite}_example"] = _pick_random_example(part, opposite)
            part["rule_violation"] = label
            part.drop(columns=example_cols, inplace=True)
            frames.append(part)

    return pd.concat(frames, axis=0).drop_duplicates(ignore_index=True)

def build_dataset(df):
    """Turn a frame of examples into a ``datasets.Dataset`` of prompts.

    Adds a ``prompt`` column to ``df`` (in place) via ``build_prompt``. If the
    frame has a ``rule_violation`` column, a ``completion`` column is added too,
    mapping 1 to ``positive`` and 0 to ``negative``.

    Returns:
        A Dataset with ``prompt`` and, when labels exist, ``completion``.
    """
    df["prompt"] = df.apply(build_prompt, axis=1)

    has_labels = "rule_violation" in df
    if has_labels:
        label_to_text = {1: positive, 0: negative}
        df["completion"] = df["rule_violation"].map(label_to_text)

    selected = ["prompt", "completion"] if has_labels else ["prompt"]
    return Dataset.from_pandas(df[selected])

In [None]:
%%writefile train.py

import torch
import pandas as pd
from trl import SFTTrainer, SFTConfig
from peft import PeftModel, LoraConfig, get_peft_model
from tqdm.auto import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers.utils import is_torch_bf16_gpu_available

from utils import *
from constants import *

def main():
    """Fine-tune a LoRA adapter on the prompt/completion dataset and save it.

    Steps: build the dataset with ``build_dataset(get_df())``, load the base model
    (unquantized config when ``use_gptq`` is set, otherwise 4-bit NF4 through
    bitsandbytes), optionally merge ``pretrain_lora_path`` into it, then train
    with ``SFTTrainer`` and save to ``lora_path``. If the dataset is empty, an
    untrained adapter and the tokenizer are saved to ``lora_path`` instead.
    """
    train_dataset = build_dataset(get_df())
    lora_config = LoraConfig(
        r=64,
        lora_alpha=128,
        lora_dropout=0.1,
        bias="none",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        task_type="CAUSAL_LM",
    )

    # Decide the precision once so the trainer flags and the 4-bit compute dtype
    # agree (the compute dtype used to be bfloat16 even when training ran in fp16).
    use_bf16 = is_torch_bf16_gpu_available()

    training_args = SFTConfig(
        num_train_epochs=1,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        optim="paged_adamw_8bit",
        learning_rate=1e-4,
        weight_decay=0.01,
        max_grad_norm=1.0,
        lr_scheduler_type="cosine",
        warmup_ratio=0.03,
        bf16=use_bf16,
        fp16=not use_bf16,
        dataloader_pin_memory=True,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={"use_reentrant": False},
        save_strategy="no",
        report_to="none",
        completion_only_loss=True,
        packing=False,
        remove_unused_columns=False,
    )

    # Arguments shared by both loading paths; only the non-GPTQ path adds
    # on-the-fly 4-bit quantization.
    model_kwargs = {
        "device_map": "balanced_low_0",
        "trust_remote_code": True,
        "use_cache": False,
    }
    if not use_gptq:
        model_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16 if use_bf16 else torch.float16,
            bnb_4bit_use_double_quant=True,
        )
    model = AutoModelForCausalLM.from_pretrained(base_model_path, **model_kwargs)

    tokenizer = AutoTokenizer.from_pretrained(base_model_path)
    # Reuse EOS as the padding token.
    tokenizer.pad_token = tokenizer.eos_token
    if pretrain_lora_path:
        model = PeftModel.from_pretrained(model, pretrain_lora_path)
        model = model.merge_and_unload()

    if len(train_dataset) > 0:
        trainer = SFTTrainer(
            model=model,
            processing_class=tokenizer,
            args=training_args,
            train_dataset=train_dataset,
            peft_config=lora_config,
        )
        trainer.train()
        trainer.save_model(lora_path)
    else:
        # Nothing to train on: still write an (untrained) adapter and the tokenizer.
        peft_model = get_peft_model(model, lora_config)
        peft_model.save_pretrained(lora_path)
        tokenizer.save_pretrained(lora_path)

if __name__ == "__main__":
    main()

In [None]:
%%writefile accelerate_config.yaml
# Accelerate launch configuration, consumed by
# `accelerate launch --config_file accelerate_config.yaml train.py`.
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
  # These three values mirror gradient_accumulation_steps, max_grad_norm and
  # per_device_train_batch_size in train.py's SFTConfig.
  gradient_accumulation_steps: 4
  gradient_clipping: 1.0
  train_micro_batch_size_per_gpu: 4
  
  zero_stage: 2
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: false
  
  # NOTE(review): the stage3_* keys presumably only matter for ZeRO stage 3,
  # while zero_stage above is 2 -- confirm whether they can be dropped.
  stage3_gather_16bit_weights_on_model_save: false
  stage3_max_live_parameters: 1e8
  stage3_max_reuse_distance: 1e8
  stage3_prefetch_bucket_size: 5e7
  stage3_param_persistence_threshold: 1e5
  
  zero_allow_untested_optimizer: true
  zero_force_ds_cpu_optimizer: false
  
  # fp16:
  #   enabled: true
  #   loss_scale: 0
  #   initial_scale_power: 16
  #   loss_scale_window: 1000
  #   hysteresis: 2
  #   min_loss_scale: 1
  # NOTE(review): bf16 is enabled here while mixed_precision below is fp16 --
  # these look inconsistent; confirm which setting is actually honored.
  bf16:
    enabled: true
  
distributed_type: DEEPSPEED
downcast_bf16: 'yes'
dynamo_config:
  dynamo_backend: INDUCTOR
  dynamo_use_fullgraph: false
  dynamo_use_dynamic: false
enable_cpu_affinity: false
machine_rank: 0
main_training_function: main
mixed_precision: fp16
num_machines: 1
# Two training processes on a single machine.
num_processes: 2
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false

In [None]:
%%writefile inference.py

import os
os.environ["VLLM_USE_V1"] = "0"

import random
import vllm
import torch
import numpy as np
import pandas as pd
from logits_processor_zoo.vllm import MultipleChoiceLogitsProcessor
from vllm.lora.request import LoRARequest
from utils import build_dataset
from constants import *
import multiprocessing as mp

def run_inference_on_device(df_slice):
    """Score one shard of test rows with vLLM and the trained LoRA adapter.

    Generates a single token per prompt, restricted to the ``positive`` /
    ``negative`` choices, and converts the returned log-probabilities into
    probabilities.

    Args:
        df_slice: Test rows with a ``row_id`` column plus the columns that
            ``build_dataset`` needs.

    Returns:
        DataFrame with one column per choice (``positive``, ``negative``) and
        ``row_id``.
    """
    engine = vllm.LLM(
        base_model_path,
        quantization="gptq" if use_gptq else None,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.98,
        trust_remote_code=True,
        dtype="half",
        enforce_eager=True,
        max_model_len=2048,
        disable_log_stats=True,
        enable_prefix_caching=True,
        enable_lora=True,
        max_lora_rank=64,
    )
    choice_processor = MultipleChoiceLogitsProcessor(
        engine.get_tokenizer(), choices=[positive, negative]
    )
    sampling = vllm.SamplingParams(
        skip_special_tokens=True,
        max_tokens=1,
        logits_processors=[choice_processor],
        logprobs=2,
    )
    prompts = build_dataset(df_slice)["prompt"]
    results = engine.generate(
        prompts,
        sampling,
        use_tqdm=True,
        lora_request=LoRARequest("lora1", 1, lora_path),
    )

    # For every prompt: decoded token -> probability of the first generated token.
    token_probs = []
    for result in results:
        first_step = result.outputs[0].logprobs[0]
        token_probs.append(
            {entry.decoded_token: np.exp(entry.logprob) for entry in first_step.values()}
        )

    predictions = pd.DataFrame(token_probs)[[positive, negative]]
    predictions["row_id"] = df_slice["row_id"].values
    return predictions

def worker(device_id, df_slice, return_dict):
    """Process entry point: pin this process to one GPU and score its shard.

    The resulting predictions are stored in ``return_dict`` under ``device_id``.
    """
    visible_gpu = str(device_id)
    os.environ["CUDA_VISIBLE_DEVICES"] = visible_gpu
    print(f"[Worker {device_id}] Running on GPU {device_id}, data size={len(df_slice)}")
    return_dict[device_id] = run_inference_on_device(df_slice)

def main():
    """Run inference on test.csv across GPUs 0 and 1 and write the submission.

    Each row gets one randomly chosen positive and negative few-shot example.
    The rows are split in half, each half is scored in its own process, and the
    probability of ``positive`` is written as ``rule_violation``.

    Raises:
        ValueError: if test.csv has no rows.
        RuntimeError: if a worker process exits abnormally or returns no result.
    """
    test_df = pd.read_csv("/kaggle/input/jigsaw-agile-community-rules/test.csv")
    test_df["positive_example"] = test_df.apply(lambda row: random.choice([row["positive_example_1"], row["positive_example_2"]]), axis=1)
    test_df["negative_example"] = test_df.apply(lambda row: random.choice([row["negative_example_1"], row["negative_example_2"]]), axis=1)
    test_df = test_df.drop(columns=["positive_example_1", "positive_example_2", "negative_example_1", "negative_example_2"], errors="ignore")

    mid = len(test_df) // 2
    halves = {
        0: test_df.iloc[:mid].reset_index(drop=True),
        1: test_df.iloc[mid:].reset_index(drop=True),
    }
    # An empty shard (e.g. a one-row test file) would only make its worker fail.
    shards = {device: shard for device, shard in halves.items() if len(shard) > 0}
    if not shards:
        raise ValueError("test.csv contains no rows to predict")

    manager = mp.Manager()
    return_dict = manager.dict()
    processes = {
        device: mp.Process(target=worker, args=(device, shard, return_dict))
        for device, shard in shards.items()
    }
    for process in processes.values():
        process.start()
    for process in processes.values():
        process.join()

    # Report a crashed worker explicitly instead of a bare KeyError on return_dict.
    failed = [
        device for device, process in processes.items()
        if process.exitcode != 0 or device not in return_dict
    ]
    if failed:
        raise RuntimeError(f"Inference worker(s) for GPU(s) {failed} failed; no submission written")

    predictions = pd.concat([return_dict[device] for device in sorted(processes)], ignore_index=True)
    submission = predictions[["row_id", positive]].rename(columns={positive: "rule_violation"})
    submission.to_csv("/kaggle/working/submission5.csv", index=False)

if __name__ == "__main__":
    main()

In [None]:
# Train the LoRA adapter with the accelerate config written above, then run
# inference.py, which writes /kaggle/working/submission5.csv.
!accelerate launch --config_file accelerate_config.yaml train.py
!python inference.py

# Display the submission file produced by inference.py.
import pandas as pd
pd.read_csv('/kaggle/working/submission5.csv')

# model2 


In [None]:
#!/usr/bin/env python3

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import pandas as pd
import numpy as np
import random
from datasets import Dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    models
)
from sentence_transformers.losses import TripletLoss
from sklearn.metrics.pairwise import cosine_similarity
import re
from urllib.parse import urlparse
import faiss
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')


# Compiled once at import time. Matches an http(s) URL up to the first whitespace
# or one of the characters < > " { } | \ ^ ` [ ].
_URL_PATTERN = re.compile(r'https?://[^\s<>"{}|\\^`\[\]]+')


def _summarize_url(match):
    """Return the ``<url>: (domain/path)`` replacement for one regex match."""
    try:
        parsed = urlparse(match.group(0))
    except ValueError:
        # urlparse rejects some malformed network locations with ValueError.
        return "<url>: (unknown)"

    domain = parsed.netloc.lower()
    # Remove www. prefix if present
    if domain.startswith('www.'):
        domain = domain[4:]

    # Keep at most the first two non-empty path segments.
    path_parts = [part for part in parsed.path.split('/') if part]
    if path_parts:
        return f"<url>: ({domain}/{'/'.join(path_parts[:2])})"
    return f"<url>: ({domain})"


def cleaner(text):
    """Replace every URL in ``text`` with ``<url>: (domain/important-path)``.

    The domain is lower-cased and stripped of a leading ``www.``; only the first
    two path segments are kept, and query strings and fragments are dropped.

    Args:
        text: Value to clean. Falsy values (``None``, ``""``, ``0``) are returned
            unchanged; anything else is converted with ``str()`` first.

    Returns:
        The cleaned string, or ``text`` itself when it is falsy.
    """
    if not text:
        return text
    return _URL_PATTERN.sub(_summarize_url, str(text))


def load_test_data():
    """Read the competition test CSV and report its row and rule counts."""
    print("Loading test data...")
    frame = pd.read_csv('/kaggle/input/jigsaw-agile-community-rules/test.csv')
    n_rows = len(frame)
    n_rules = frame['rule'].nunique()
    print(f"Loaded {n_rows} test examples")
    print(f"Unique rules: {n_rules}")
    return frame


def collect_all_texts(test_df):
    """Gather every distinct cleaned text (bodies and few-shot examples) in ``test_df``.

    Missing values are skipped; each remaining value is passed through
    ``cleaner(str(value))`` before de-duplication.

    Returns:
        List of unique cleaned strings.
    """
    print("\nCollecting all texts for embedding...")

    # Bodies first, then the four example columns, in this order.
    text_columns = ('body',
                    'positive_example_1', 'positive_example_2',
                    'negative_example_1', 'negative_example_2')

    unique_texts = set()
    for column in text_columns:
        unique_texts.update(
            cleaner(str(value)) for value in test_df[column] if pd.notna(value)
        )

    texts = list(unique_texts)
    print(f"Collected {len(texts)} unique texts")
    return texts


def generate_embeddings(texts, model, batch_size=64):
    """Encode ``texts`` with ``model.encode`` and return its result.

    The encoder is asked for normalized embeddings, not converted to a tensor,
    with a progress bar shown.
    """
    print(f"Generating embeddings for {len(texts)} texts...")

    encode_options = {
        "batch_size": batch_size,
        "show_progress_bar": True,
        "convert_to_tensor": False,
        "normalize_embeddings": True,
    }
    return model.encode(sentences=texts, **encode_options)


def create_test_triplet_dataset(test_df, augmentation_factor=2, random_seed=42, subsample_fraction=1.0):
    """Build a triplet dataset (anchor, positive, negative) from the test few-shot examples.

    The anchor is the cleaned rule text. Note the naming inversion: the triplet
    *positive* is a compliant comment (``negative_example_*`` columns) and the
    triplet *negative* is a violating comment (``positive_example_*`` columns).

    Args:
        test_df: Frame with ``rule`` and the four ``*_example_*`` columns.
        augmentation_factor: When > 0, adds per rule up to
            ``augmentation_factor * len(compliant pool)`` extra triplets drawn at
            random from that rule's pooled examples.
        random_seed: Seed for ``random`` and ``numpy.random``.
        subsample_fraction: Fraction of the shuffled triplets to keep when < 1.0.

    Returns:
        ``datasets.Dataset`` with ``anchor``, ``positive`` and ``negative`` columns.
    """
    random.seed(random_seed)
    np.random.seed(random_seed)

    compliant_cols = ['negative_example_1', 'negative_example_2']  # Compliant -> triplet positive
    violating_cols = ['positive_example_1', 'positive_example_2']  # Violating -> triplet negative

    def cleaned_examples(row, columns):
        # Cleaned, non-missing values of `columns` for one row.
        return [cleaner(str(row[col])) for col in columns if pd.notna(row[col])]

    anchors = []
    positives = []
    negatives = []

    print("Creating rule-aligned triplets from test data...")

    for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Processing test rows"):
        rule = cleaner(str(row['rule']))
        pos_examples = cleaned_examples(row, compliant_cols)
        neg_examples = cleaned_examples(row, violating_cols)

        # Every compliant x violating pair of the row becomes one triplet.
        for pos_ex in pos_examples:
            for neg_ex in neg_examples:
                anchors.append(rule)
                positives.append(pos_ex)
                negatives.append(neg_ex)

    if augmentation_factor > 0:
        print(f"Adding {augmentation_factor}x augmentation...")

        for rule in test_df['rule'].unique():
            rule_df = test_df[test_df['rule'] == rule]

            pos_pool = []
            neg_pool = []
            for _, row in rule_df.iterrows():
                pos_pool.extend(cleaned_examples(row, compliant_cols))
                neg_pool.extend(cleaned_examples(row, violating_cols))

            # De-duplicate while keeping first-seen order. list(set(...)) orders
            # strings by their per-process randomized hash, which made the seeded
            # random.choice below pick different examples on every run.
            pos_pool = list(dict.fromkeys(pos_pool))
            neg_pool = list(dict.fromkeys(neg_pool))
            if not pos_pool or not neg_pool:
                continue

            clean_rule = cleaner(str(rule))
            n_samples = min(augmentation_factor * len(pos_pool), len(pos_pool) * len(neg_pool))

            for _ in range(n_samples):
                anchors.append(clean_rule)
                positives.append(random.choice(pos_pool))
                negatives.append(random.choice(neg_pool))

    combined = list(zip(anchors, positives, negatives))
    random.shuffle(combined)

    # Apply subsampling if requested
    original_count = len(combined)
    if subsample_fraction < 1.0:
        n_samples = int(len(combined) * subsample_fraction)
        combined = combined[:n_samples]
        print(f"Subsampled {original_count} -> {len(combined)} triplets ({subsample_fraction*100:.1f}%)")

    anchors, positives, negatives = zip(*combined) if combined else ([], [], [])

    print(f"Created {len(anchors)} triplets from test data")

    dataset = Dataset.from_dict({
        'anchor': list(anchors),
        'positive': list(positives),
        'negative': list(negatives)
    })

    return dataset


def fine_tune_model(model, train_dataset, epochs=3, batch_size=32, learning_rate=2e-5, margin=0.25, output_dir="./models/test-finetuned-bge"):
    """Train ``model`` on ``train_dataset`` with triplet loss and save it.

    The number of optimizer steps is fixed explicitly to
    ``max(1, len(train_dataset) // batch_size) * epochs``.

    Returns:
        Tuple ``(model, final_model_path)`` where ``final_model_path`` is
        ``{output_dir}/final``.
    """
    n_triplets = len(train_dataset)
    print(f"Fine-tuning model on {n_triplets} triplets...")

    triplet_loss = TripletLoss(model=model, triplet_margin=margin)

    # At least one step per epoch, even when the dataset is smaller than a batch.
    total_steps = max(1, n_triplets // batch_size) * epochs

    training_kwargs = {
        "output_dir": output_dir,
        "num_train_epochs": epochs,
        "per_device_train_batch_size": batch_size,
        "warmup_steps": 0,
        "learning_rate": learning_rate,
        "logging_steps": max(1, total_steps // 4),
        "save_strategy": "epoch",
        "save_total_limit": 1,
        "fp16": True,
        "max_grad_norm": 1.0,
        "dataloader_drop_last": False,
        "gradient_checkpointing": True,
        "gradient_accumulation_steps": 1,
        "max_steps": total_steps,
        "report_to": "none",
    }

    trainer = SentenceTransformerTrainer(
        model=model,
        args=SentenceTransformerTrainingArguments(**training_kwargs),
        train_dataset=train_dataset,
        loss=triplet_loss,
    )
    trainer.train()

    final_model_path = f"{output_dir}/final"
    print(f"Saving fine-tuned model to {final_model_path}...")
    model.save_pretrained(final_model_path)

    return model, final_model_path


def load_or_create_finetuned_model(test_df):
    """Load the fine-tuned model if it exists, otherwise fine-tune the base model.

    A cached model is loaded with explicit mean pooling (falling back to the
    default SentenceTransformer loader). Otherwise the base BGE model is loaded
    from the Kaggle input path (falling back to the Hugging Face Hub) and
    fine-tuned on triplets built from ``test_df``.

    Returns:
        The SentenceTransformer model, converted to half precision.
    """

    def build_mean_pooled(path):
        # Transformer encoder (max 128 tokens, lower-cased) followed by mean pooling.
        word_embedding_model = models.Transformer(path, max_seq_length=128, do_lower_case=True)
        pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode="mean")
        return SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # NOTE(review): fine_tune_model() saves to ".../final" while this checks
    # ".../final2", so a model saved by this script is never picked up here.
    # Confirm whether that is intentional (forcing a retrain on every run).
    fine_tuned_path = "./models/test-finetuned-bge/final2"

    if os.path.exists(fine_tuned_path):
        print(f"Loading existing fine-tuned model from {fine_tuned_path}...")
        try:
            model = build_mean_pooled(fine_tuned_path)
            print("Loaded fine-tuned model with explicit pooling")
        except Exception as exc:
            # Report why the explicit construction failed before trying the default loader.
            print(f"Explicit pooling load failed ({exc!r}); trying default configuration")
            model = SentenceTransformer(fine_tuned_path)
            print("Loaded fine-tuned model with default configuration")
        model.half()
        return model

    print("Fine-tuned model not found. Creating new one...")

    print("Loading base BGE embedding model...")
    # Try Kaggle path first, fallback to HuggingFace
    kaggle_path = "/kaggle/input/baai/transformers/bge-large-en-v1.5/1"
    hub_model_id = "BAAI/bge-large-en-v1.5"
    try:
        base_model = build_mean_pooled(kaggle_path)
        print("Loaded base model from Kaggle path with explicit pooling")
    except Exception as exc:
        # The fallback used to be an empty path, which can never load; use the
        # Hub id of the same model instead.
        print(f"Could not load base model from {kaggle_path} ({exc!r}); trying {hub_model_id}")
        base_model = build_mean_pooled(hub_model_id)
        print("Loaded base model from Hugging Face Hub with explicit pooling")

    triplet_dataset = create_test_triplet_dataset(test_df, augmentation_factor=2, subsample_fraction=1.)

    fine_tuned_model, model_path = fine_tune_model(
        model=base_model,
        train_dataset=triplet_dataset,
        epochs=1,
        batch_size=32,
        learning_rate=2e-5,
        margin=0.25
    )

    print(f"Fine-tuning completed. Model saved to: {model_path}")
    fine_tuned_model.half()
    return fine_tuned_model


def generate_rule_embeddings(test_df, model):
    """Encode each distinct rule in ``test_df`` (after URL cleaning).

    Returns:
        Dict mapping the original rule text to its normalized embedding.
    """
    print("Generating rule embeddings...")

    rule_embeddings = {
        rule: model.encode(
            cleaner(str(rule)),
            convert_to_tensor=False,
            normalize_embeddings=True
        )
        for rule in test_df['rule'].unique()
    }

    print(f"Generated embeddings for {len(rule_embeddings)} rules")
    return rule_embeddings


def create_rule_centroids(test_df, text_to_embedding, rule_embeddings):
    """Compute one normalized mean centroid per rule for positive and negative examples.

    Rules that lack embeddings for either kind of example are left out.

    Returns:
        Dict mapping rule -> ``{'positive', 'negative', 'pos_count', 'neg_count',
        'rule_embedding'}``.
    """
    print("\nCreating rule centroids (single mean centroid per type)...")

    def gather(rule_rows, columns):
        # Embeddings of all non-missing examples in `columns` that were embedded.
        found = []
        for _, row in rule_rows.iterrows():
            for col in columns:
                if pd.isna(row[col]):
                    continue
                key = cleaner(str(row[col]))
                if key in text_to_embedding:
                    found.append(text_to_embedding[key])
        return found

    def unit_mean(vectors):
        # Mean vector rescaled to unit length.
        center = vectors.mean(axis=0)
        return center / np.linalg.norm(center)

    rule_centroids = {}
    for rule in test_df['rule'].unique():
        rule_rows = test_df[test_df['rule'] == rule]
        pos_list = gather(rule_rows, ['positive_example_1', 'positive_example_2'])
        neg_list = gather(rule_rows, ['negative_example_1', 'negative_example_2'])

        if not (pos_list and neg_list):
            continue

        pos_matrix = np.array(pos_list)
        neg_matrix = np.array(neg_list)
        rule_centroids[rule] = {
            'positive': unit_mean(pos_matrix),
            'negative': unit_mean(neg_matrix),
            'pos_count': len(pos_matrix),
            'neg_count': len(neg_matrix),
            'rule_embedding': rule_embeddings[rule]
        }
        print(f"  Rule: {rule[:50]}... - Pos: {len(pos_matrix)}, Neg: {len(neg_matrix)}")

    print(f"Created centroids for {len(rule_centroids)} rules")
    return rule_centroids


def predict_test_set(test_df, text_to_embedding, rule_centroids):
    """Score each test body by its Euclidean distance to the rule's two centroids.

    The score is ``distance_to_negative - distance_to_positive``, so a body
    closer to the positive centroid scores higher. Rows whose rule has no
    centroids, or whose cleaned body has no embedding, are skipped.

    Returns:
        Tuple ``(row_ids, scores)`` with ``scores`` as a NumPy array.
    """
    print("\nMaking predictions on test set with Euclidean distance...")

    row_ids = []
    predictions = []

    for rule in test_df['rule'].unique():
        print(f"  Processing rule: {rule[:50]}...")
        if rule not in rule_centroids:
            continue

        centroids = rule_centroids[rule]
        rule_rows = test_df[test_df['rule'] == rule]

        # Pair each embedded body with its row_id, in row order.
        scored = []
        for _, row in rule_rows.iterrows():
            key = cleaner(str(row['body']))
            if key in text_to_embedding:
                scored.append((text_to_embedding[key], row['row_id']))
        if not scored:
            continue

        query = np.array([vector for vector, _ in scored])
        to_positive = np.linalg.norm(query - centroids['positive'], axis=1)
        to_negative = np.linalg.norm(query - centroids['negative'], axis=1)

        row_ids.extend(row_id for _, row_id in scored)
        predictions.extend(to_negative - to_positive)

    print(f"Made predictions for {len(predictions)} test examples")
    return row_ids, np.array(predictions)




def main():
    """Run the embedding pipeline end to end and write ``submission1.csv``.

    Loads the test data, obtains the (fine-tuned) model, embeds all texts,
    builds per-rule centroids, scores every test body and min-max normalizes
    the scores before saving them.
    """
    def section(title):
        # Blank line followed by a 50-character framed heading.
        print("\n" + "="*50)
        print(title)
        print("="*50)

    print("="*70)
    print("SIMPLE SIMILARITY CLASSIFIER - INFERENCE")
    print("="*70)

    test_df = load_test_data()

    section("MODEL PREPARATION PHASE")
    model = load_or_create_finetuned_model(test_df)

    all_texts = collect_all_texts(test_df)

    section("EMBEDDING GENERATION PHASE")
    all_embeddings = generate_embeddings(all_texts, model)

    # Look-up table from cleaned text to its embedding.
    text_to_embedding = dict(zip(all_texts, all_embeddings))

    rule_embeddings = generate_rule_embeddings(test_df, model)
    rule_centroids = create_rule_centroids(test_df, text_to_embedding, rule_embeddings)

    section("PREDICTION PHASE")
    row_ids, predictions = predict_test_set(test_df, text_to_embedding, rule_centroids)

    # Min-max scale the scores to [0, 1]; a zero range is replaced by 1 to avoid
    # dividing by zero.
    lowest = predictions.min(axis=0, keepdims=True)
    highest = predictions.max(axis=0, keepdims=True)
    spread = highest - lowest
    spread = np.where(spread == 0, 1, spread)
    normalized = (predictions - lowest) / spread

    submission_df = pd.DataFrame({
        'row_id': row_ids,
        'rule_violation': normalized
    })
    submission_df.to_csv('submission1.csv', index=False)
    print(f"\nSaved predictions for {len(submission_df)} test examples to submission1.csv")

    print(f"\n{'='*70}")
    print("FINE-TUNED EUCLIDEAN DISTANCE INFERENCE COMPLETED")
    print("Model: Fine-tuned BGE on test data triplets")
    print("Method: Single centroid with Euclidean distance")
    print(f"Predicted on {len(test_df)} test examples")
    print(f"Prediction stats: min={predictions.min():.4f}, max={predictions.max():.4f}, mean={predictions.mean():.4f}")
    print(f"{'='*70}")


if __name__ == "__main__":
    main()


# model3 

In [None]:
#!/usr/bin/env python3

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import pandas as pd
import numpy as np
import random
from datasets import Dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    models
)
from sentence_transformers.losses import TripletLoss
from sklearn.metrics.pairwise import cosine_similarity
import re
from urllib.parse import urlparse
import faiss
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')


# Compiled once at import time. Matches an http(s) URL up to the first whitespace
# or one of the characters < > " { } | \ ^ ` [ ].
_URL_PATTERN = re.compile(r'https?://[^\s<>"{}|\\^`\[\]]+')


def _summarize_url(match):
    """Return the ``<url>: (domain/path)`` replacement for one regex match."""
    try:
        parsed = urlparse(match.group(0))
    except ValueError:
        # urlparse rejects some malformed network locations with ValueError.
        return "<url>: (unknown)"

    domain = parsed.netloc.lower()
    # Remove www. prefix if present
    if domain.startswith('www.'):
        domain = domain[4:]

    # Keep at most the first two non-empty path segments.
    path_parts = [part for part in parsed.path.split('/') if part]
    if path_parts:
        return f"<url>: ({domain}/{'/'.join(path_parts[:2])})"
    return f"<url>: ({domain})"


def cleaner(text):
    """Replace every URL in ``text`` with ``<url>: (domain/important-path)``.

    The domain is lower-cased and stripped of a leading ``www.``; only the first
    two path segments are kept, and query strings and fragments are dropped.

    Args:
        text: Value to clean. Falsy values (``None``, ``""``, ``0``) are returned
            unchanged; anything else is converted with ``str()`` first.

    Returns:
        The cleaned string, or ``text`` itself when it is falsy.
    """
    if not text:
        return text
    return _URL_PATTERN.sub(_summarize_url, str(text))


def load_test_data():
    """Read the competition test CSV and report its row and rule counts."""
    print("Loading test data...")
    frame = pd.read_csv('/kaggle/input/jigsaw-agile-community-rules/test.csv')
    n_rows = len(frame)
    n_rules = frame['rule'].nunique()
    print(f"Loaded {n_rows} test examples")
    print(f"Unique rules: {n_rules}")
    return frame


def collect_all_texts(test_df):
    """Gather every distinct cleaned text (bodies and few-shot examples) in ``test_df``.

    Missing values are skipped; each remaining value is passed through
    ``cleaner(str(value))`` before de-duplication.

    Returns:
        List of unique cleaned strings.
    """
    print("\nCollecting all texts for embedding...")

    # Bodies first, then the four example columns, in this order.
    text_columns = ('body',
                    'positive_example_1', 'positive_example_2',
                    'negative_example_1', 'negative_example_2')

    unique_texts = set()
    for column in text_columns:
        unique_texts.update(
            cleaner(str(value)) for value in test_df[column] if pd.notna(value)
        )

    texts = list(unique_texts)
    print(f"Collected {len(texts)} unique texts")
    return texts


def generate_embeddings(texts, model, batch_size=64):
    """Encode ``texts`` with ``model.encode`` and return its result.

    The encoder is asked for normalized embeddings, not converted to a tensor,
    with a progress bar shown.
    """
    print(f"Generating embeddings for {len(texts)} texts...")

    encode_options = {
        "batch_size": batch_size,
        "show_progress_bar": True,
        "convert_to_tensor": False,
        "normalize_embeddings": True,
    }
    return model.encode(sentences=texts, **encode_options)


def _split_row_examples(row):
    """Return (compliant, violating) cleaned example texts found in one test row."""
    # negative_example_* columns hold compliant comments -> triplet positives.
    compliant = [
        cleaner(str(row[col]))
        for col in ('negative_example_1', 'negative_example_2')
        if pd.notna(row[col])
    ]
    # positive_example_* columns hold violating comments -> triplet negatives.
    violating = [
        cleaner(str(row[col]))
        for col in ('positive_example_1', 'positive_example_2')
        if pd.notna(row[col])
    ]
    return compliant, violating


def create_test_triplet_dataset(test_df, augmentation_factor=2, random_seed=42, subsample_fraction=1.0):
    """Build a rule-anchored triplet dataset from the test frame.

    Each triplet uses the cleaned rule text as the anchor, a compliant comment
    (``negative_example_*`` column) as the triplet *positive*, and a violating
    comment (``positive_example_*`` column) as the triplet *negative*.

    Two sources of triplets are combined and then shuffled:

    1. For every row, the cross product of that row's compliant and violating
       examples.
    2. If ``augmentation_factor > 0``, extra triplets per rule obtained by
       randomly pairing examples pooled over all rows sharing that rule.

    Args:
        test_df: DataFrame with ``rule`` and the four ``*_example_*`` columns.
        augmentation_factor: Multiplier on the compliant pool size that bounds
            the number of random cross-row triplets drawn per rule.
        random_seed: Seed applied to the global ``random`` and ``numpy.random``
            generators.
        subsample_fraction: When below 1.0, keep only this leading fraction of
            the shuffled triplets.

    Returns:
        A ``Dataset`` with ``anchor``, ``positive`` and ``negative`` columns.
    """
    random.seed(random_seed)
    np.random.seed(random_seed)

    anchors = []
    positives = []
    negatives = []

    print("Creating rule-aligned triplets from test data...")

    for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Processing test rows"):
        rule = cleaner(str(row['rule']))
        pos_examples, neg_examples = _split_row_examples(row)

        for pos_ex in pos_examples:
            for neg_ex in neg_examples:
                anchors.append(rule)
                positives.append(pos_ex)
                negatives.append(neg_ex)

    if augmentation_factor > 0:
        print(f"Adding {augmentation_factor}x augmentation...")

        for rule in test_df['rule'].unique():
            rule_df = test_df[test_df['rule'] == rule]

            pos_pool = []
            neg_pool = []
            for _, row in rule_df.iterrows():
                compliant, violating = _split_row_examples(row)
                pos_pool.extend(compliant)
                neg_pool.extend(violating)

            # De-duplicate while keeping first-seen order. A set of strings
            # iterates in an order that depends on per-process hash
            # randomization, which would make the seeded random.choice draws
            # below differ from run to run.
            pos_pool = list(dict.fromkeys(pos_pool))
            neg_pool = list(dict.fromkeys(neg_pool))

            if not pos_pool or not neg_pool:
                continue

            clean_rule = cleaner(str(rule))
            # Never draw more samples than there are distinct pairs.
            n_samples = min(int(augmentation_factor * len(pos_pool)), len(pos_pool) * len(neg_pool))

            for _ in range(n_samples):
                anchors.append(clean_rule)
                positives.append(random.choice(pos_pool))
                negatives.append(random.choice(neg_pool))

    combined = list(zip(anchors, positives, negatives))
    random.shuffle(combined)

    # Apply subsampling if requested
    original_count = len(combined)
    if subsample_fraction < 1.0:
        n_samples = int(len(combined) * subsample_fraction)
        combined = combined[:n_samples]
        print(f"Subsampled {original_count} -> {len(combined)} triplets ({subsample_fraction*100:.1f}%)")

    # zip(*[]) cannot be unpacked into three names, hence the explicit fallback.
    anchors, positives, negatives = zip(*combined) if combined else ([], [], [])

    print(f"Created {len(anchors)} triplets from test data")

    dataset = Dataset.from_dict({
        'anchor': list(anchors),
        'positive': list(positives),
        'negative': list(negatives)
    })

    return dataset


def fine_tune_model(model, train_dataset, epochs=3, batch_size=32, learning_rate=2e-5, margin=0.25, output_dir="./models/test-finetuned-bge"):
    """Fine-tune a sentence-transformer model with triplet loss.

    Args:
        model: The model to train; it is updated in place and returned.
        train_dataset: Triplet dataset passed to the trainer; only its length
            is inspected here.
        epochs: Number of passes over ``train_dataset``.
        batch_size: Per-device training batch size.
        learning_rate: Learning rate passed to the training arguments.
        margin: Triplet margin passed to ``TripletLoss``.
        output_dir: Directory for trainer checkpoints; the final model is
            saved to ``{output_dir}/final``.

    Returns:
        ``(model, final_model_path)``.
    """
    print(f"Fine-tuning model on {len(train_dataset)} triplets...")

    loss = TripletLoss(model=model, triplet_margin=margin)

    # Calculate max_steps for small datasets.
    # Ceiling division: dataloader_drop_last=False below keeps the final
    # partial batch, so flooring would stop before `epochs` full passes.
    # NOTE(review): this assumes a single device; with several devices the
    # effective batch is larger and max_steps overshoots. Confirm the setup.
    dataset_size = len(train_dataset)
    steps_per_epoch = max(1, (dataset_size + batch_size - 1) // batch_size)
    max_steps = steps_per_epoch * epochs

    args = SentenceTransformerTrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        warmup_steps=0,
        learning_rate=learning_rate,
        # Log roughly four times over the whole run, at least every step.
        logging_steps=max(1, max_steps // 4),
        save_strategy="epoch",
        save_total_limit=1,
        fp16=True,
        max_grad_norm=1.0,
        dataloader_drop_last=False,
        gradient_checkpointing=True,
        gradient_accumulation_steps=1,
        max_steps=max_steps,
        report_to="none"
    )

    trainer = SentenceTransformerTrainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        loss=loss,
    )

    trainer.train()

    final_model_path = f"{output_dir}/final"
    print(f"Saving fine-tuned model to {final_model_path}...")
    model.save_pretrained(final_model_path)

    return model, final_model_path


def _build_mean_pooled_model(model_path):
    """Assemble a SentenceTransformer from ``model_path`` with explicit mean pooling."""
    word_embedding_model = models.Transformer(model_path, max_seq_length=128, do_lower_case=True)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode="mean")
    return SentenceTransformer(modules=[word_embedding_model, pooling_model])


def load_or_create_finetuned_model(test_df):
    """Load the cached fine-tuned model if present, otherwise fine-tune one.

    When the cache directory exists the model is loaded from it. Otherwise the
    base model is loaded, a triplet dataset is built from ``test_df`` and the
    model is fine-tuned for one epoch.

    Args:
        test_df: Test DataFrame used to build the triplet dataset.

    Returns:
        The model, converted to half precision via ``.half()``.
    """
    # NOTE(review): fine_tune_model, as called below with its default
    # output_dir, saves to "./models/test-finetuned-bge/final", not "final3",
    # so this cache path does not appear to be written by this function.
    # Confirm whether the mismatch is intentional.
    fine_tuned_path = "./models/test-finetuned-bge/final3"

    if os.path.exists(fine_tuned_path):
        print(f"Loading existing fine-tuned model from {fine_tuned_path}...")
        try:
            model = _build_mean_pooled_model(fine_tuned_path)
            print("Loaded fine-tuned model with explicit pooling")
        except Exception as exc:
            # Catch Exception, not a bare except, so Ctrl-C and SystemExit
            # still propagate; report why the explicit load failed.
            print(f"Explicit pooling load failed ({exc!r}); falling back to default configuration")
            model = SentenceTransformer(fine_tuned_path)
            print("Loaded fine-tuned model with default configuration")
        model.half()
        return model

    print("Fine-tuned model not found. Creating new one...")

    print("Loading base BGE embedding model...")
    # Try Kaggle path first, fallback to HuggingFace
    try:
        model_path = "/kaggle/input/baai/transformers/bge-base-en-v1.5/1"
        base_model = _build_mean_pooled_model(model_path)
        print("Loaded base model from Kaggle path with explicit pooling")
    except Exception as exc:
        print(f"Could not load base model from Kaggle path ({exc!r}); trying fallback")
        # NOTE(review): the fallback path is an empty string, which looks like
        # a placeholder for a model id (the trailing comment names one).
        # Confirm and fill in; as written this load is expected to fail.
        model_path = ""  # BAAI/bge-small-en-v1.5
        base_model = _build_mean_pooled_model(model_path)
        print("Loaded base model from local path with explicit pooling")

    triplet_dataset = create_test_triplet_dataset(test_df, augmentation_factor=2, subsample_fraction=1.)

    fine_tuned_model, model_path = fine_tune_model(
        model=base_model,
        train_dataset=triplet_dataset,
        epochs=1,
        batch_size=32,
        learning_rate=2e-5,
        margin=0.25
    )

    print(f"Fine-tuning completed. Model saved to: {model_path}")
    fine_tuned_model.half()
    return fine_tuned_model


def generate_rule_embeddings(test_df, model):
    """Embed every distinct rule, one ``model.encode`` call per rule.

    Args:
        test_df: DataFrame with a ``rule`` column.
        model: Object exposing ``encode`` (presumably a SentenceTransformer;
            confirm against caller).

    Returns:
        Dict mapping each raw (uncleaned) rule value to the normalized
        embedding of its cleaned text.
    """
    print("Generating rule embeddings...")

    def embed(rule_text):
        # The cleaned text is encoded; the raw rule stays the dict key.
        return model.encode(
            cleaner(str(rule_text)),
            convert_to_tensor=False,
            normalize_embeddings=True,
        )

    embeddings_by_rule = {
        rule_text: embed(rule_text)
        for rule_text in test_df['rule'].unique()
    }

    print(f"Generated embeddings for {len(embeddings_by_rule)} rules")
    return embeddings_by_rule


def create_rule_centroids(test_df, text_to_embedding, rule_embeddings):
    """Compute one unit-length mean embedding per example type for each rule.

    For every rule, the embeddings of all ``positive_example_*`` texts and of
    all ``negative_example_*`` texts (over every row with that rule) are
    averaged separately and rescaled to unit norm. Rules lacking embeddings
    for either type are left out.

    Args:
        test_df: DataFrame with ``rule`` and the four example columns.
        text_to_embedding: Mapping from cleaned text to its embedding.
        rule_embeddings: Mapping from raw rule value to its embedding.

    Returns:
        Dict keyed by raw rule with ``positive``/``negative`` centroids,
        ``pos_count``/``neg_count`` and the ``rule_embedding``.
    """
    print(f"\nCreating rule centroids (single mean centroid per type)...")

    def lookup_embeddings(rows, columns):
        # Embeddings of every non-null, known example text in `columns`.
        found = []
        for _, record in rows.iterrows():
            for column in columns:
                value = record[column]
                if pd.notna(value):
                    key = cleaner(str(value))
                    if key in text_to_embedding:
                        found.append(text_to_embedding[key])
        return found

    def unit_mean(vectors):
        # Mean vector rescaled to unit Euclidean norm.
        centroid = np.array(vectors).mean(axis=0)
        return centroid / np.linalg.norm(centroid)

    centroids_by_rule = {}

    for rule in test_df['rule'].unique():
        subset = test_df[test_df['rule'] == rule]

        pos_vectors = lookup_embeddings(subset, ['positive_example_1', 'positive_example_2'])
        neg_vectors = lookup_embeddings(subset, ['negative_example_1', 'negative_example_2'])

        # Both example types are required to form a usable pair of centroids.
        if not (pos_vectors and neg_vectors):
            continue

        centroids_by_rule[rule] = {
            'positive': unit_mean(pos_vectors),
            'negative': unit_mean(neg_vectors),
            'pos_count': len(pos_vectors),
            'neg_count': len(neg_vectors),
            'rule_embedding': rule_embeddings[rule],
        }

        print(f"  Rule: {rule[:50]}... - Pos: {len(pos_vectors)}, Neg: {len(neg_vectors)}")

    print(f"Created centroids for {len(centroids_by_rule)} rules")
    return centroids_by_rule


def predict_test_set(test_df, text_to_embedding, rule_centroids):
    """Score test rows by Euclidean distance to their rule's two centroids.

    A row's score is ``dist(body, negative centroid) - dist(body, positive
    centroid)``, so a body closer to the positive centroid scores higher.

    Rows that cannot be scored (the rule has no entry in ``rule_centroids``,
    or the cleaned body has no entry in ``text_to_embedding``) receive the
    neutral score 0.0 instead of being dropped, so the output is not missing
    those ``row_id`` values. The number of such rows is printed.

    Args:
        test_df: DataFrame with ``rule``, ``body`` and ``row_id`` columns.
        text_to_embedding: Mapping from cleaned text to its embedding.
        rule_centroids: Mapping from raw rule to a dict holding ``positive``
            and ``negative`` centroid vectors.

    Returns:
        ``(row_ids, predictions)``: a list of row ids grouped by rule in order
        of first appearance, and a 1-D array of raw scores aligned with it.
    """
    print("\nMaking predictions on test set with Euclidean distance...")

    row_ids = []
    predictions = []
    fallback_count = 0

    for rule in test_df['rule'].unique():
        print(f"  Processing rule: {rule[:50]}...")
        rule_data = test_df[test_df['rule'] == rule]

        # Start from the neutral score (equidistant from both centroids) and
        # overwrite it for every row that can actually be scored.
        scores = np.zeros(len(rule_data), dtype=float)
        centroids = rule_centroids.get(rule)

        if centroids is None:
            fallback_count += len(rule_data)
        else:
            bodies = [cleaner(str(body)) for body in rule_data['body']]
            known = [i for i, body in enumerate(bodies) if body in text_to_embedding]
            fallback_count += len(bodies) - len(known)

            if known:
                query_embeddings = np.array([text_to_embedding[bodies[i]] for i in known])

                # Compute Euclidean distances
                pos_distances = np.linalg.norm(query_embeddings - centroids['positive'], axis=1)
                neg_distances = np.linalg.norm(query_embeddings - centroids['negative'], axis=1)

                # Score: closer to positive (lower distance) = higher violation score
                scores[known] = neg_distances - pos_distances

        row_ids.extend(rule_data['row_id'].tolist())
        predictions.extend(scores)

    if fallback_count:
        print(f"  Warning: {fallback_count} rows had no embedding or centroid and were given a neutral score of 0.0")

    print(f"Made predictions for {len(predictions)} test examples")
    return row_ids, np.array(predictions)




def main():
    """Run the inference pipeline and write ``submission3.csv``.

    Steps: load the test data, load or fine-tune the embedding model, embed
    all texts and rules, build per-rule centroids, score every test row,
    min-max scale the scores and save them.
    """
    print("="*70)
    print("SIMPLE SIMILARITY CLASSIFIER - INFERENCE")
    print("="*70)

    # Step 1: Load test data
    test_df = load_test_data()

    # Step 2: Load or create fine-tuned model
    print("\n" + "="*50)
    print("MODEL PREPARATION PHASE")
    print("="*50)
    model = load_or_create_finetuned_model(test_df)

    # Step 3: Collect all texts
    all_texts = collect_all_texts(test_df)

    # Step 4: Generate embeddings with fine-tuned model
    print("\n" + "="*50)
    print("EMBEDDING GENERATION PHASE")
    print("="*50)
    all_embeddings = generate_embeddings(all_texts, model)

    # Step 5: Create text to embedding mapping
    text_to_embedding = {text: emb for text, emb in zip(all_texts, all_embeddings)}

    # Step 6: Generate rule embeddings
    rule_embeddings = generate_rule_embeddings(test_df, model)

    # Step 7: Create rule centroids from test examples
    rule_centroids = create_rule_centroids(test_df, text_to_embedding, rule_embeddings)

    # Step 8: Predict test set
    print("\n" + "="*50)
    print("PREDICTION PHASE")
    print("="*50)
    row_ids, predictions = predict_test_set(test_df, text_to_embedding, rule_centroids)

    # Min-max scale the raw scores into [0, 1]. A zero range (all scores
    # equal) is replaced by 1 so the division yields zeros, not NaN.
    min_vals = predictions.min(axis=0, keepdims=True)
    max_vals = predictions.max(axis=0, keepdims=True)
    range_vals = max_vals - min_vals
    range_vals = np.where(range_vals == 0, 1, range_vals)
    normalized = (predictions - min_vals) / range_vals

    # Step 9: Create submission with rule-conditioned scores
    submission_df = pd.DataFrame({
        'row_id': row_ids,
        'rule_violation': normalized
    })

    # One variable feeds both the write and the log line so the reported
    # file name cannot drift from the file actually written.
    submission_path = 'submission3.csv'
    submission_df.to_csv(submission_path, index=False)
    print(f"\nSaved predictions for {len(submission_df)} test examples to {submission_path}")

    print(f"\n{'='*70}")
    print(f"FINE-TUNED EUCLIDEAN DISTANCE INFERENCE COMPLETED")
    print(f"Model: Fine-tuned BGE on test data triplets")
    print(f"Method: Single centroid with Euclidean distance")
    # Report the rows actually predicted, not the size of the input frame.
    print(f"Predicted on {len(submission_df)} test examples")
    print(f"Prediction stats: min={predictions.min():.4f}, max={predictions.max():.4f}, mean={predictions.mean():.4f}")
    print(f"{'='*70}")


# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

# Combined Submission

In [None]:
import pandas as pd

# Load the three per-model submissions that will be blended.
q = pd.read_csv('submission1.csv')
l = pd.read_csv('submission5.csv')
m = pd.read_csv('submission3.csv')

# Sort by row_id so the three frames line up row for row.
q = q.sort_values('row_id').reset_index(drop=True)
l = l.sort_values('row_id').reset_index(drop=True)
m = m.sort_values('row_id').reset_index(drop=True)

# Compare as plain lists: this also catches a length mismatch, where an
# elementwise array comparison would fail before any message is shown.
# Raised explicitly because assert statements are stripped under -O.
if q['row_id'].tolist() != l['row_id'].tolist():
    raise ValueError("row_id 不匹配：submission1 和 submission5")
if q['row_id'].tolist() != m['row_id'].tolist():
    raise ValueError("row_id 不匹配：submission1 和 submission3")

def rank_normalize(series):
    """Replace values by their average rank divided by ``len(series) + 1``.

    Tied values share the mean of their ranks.
    """
    denominator = len(series) + 1
    average_ranks = series.rank(method='average')
    return average_ranks / denominator

# Put the three score columns on a common rank scale before mixing them.
rq, rl, rm = (
    rank_normalize(frame['rule_violation'])
    for frame in (q, l, m)
)

# Weighted blend: submission5 gets half the weight, the other two a quarter each.
blend = 0.25 * rq + 0.25 * rm + 0.5 * rl

# Reuse the first frame (already sorted by row_id) as the output container.
q['rule_violation'] = blend
q.to_csv('/kaggle/working/submission.csv', index=False)