# E means Embedding model only
This notebook uses only the Qwen3 0.6B embedding model and classifies comments via semantic search.

**Note: This submission uses only the 0.6B Qwen3 embedding model for standalone classification.**

In [None]:
!uv pip install --system --no-index --find-links='/kaggle/input/jigsaw-packages2/whls/' 'trl==0.21.0' 'optimum==1.27.0' 'auto-gptq==0.7.1' 'bitsandbytes==0.46.1' 'deepspeed==0.17.4' 'logits-processor-zoo==0.2.1' 'vllm==0.10.0'
!uv pip install --system --no-index --find-links='/kaggle/input/jigsaw-packages2/whls/' 'triton==3.2.0'
!uv pip install --system --no-index --find-links='/kaggle/input/jigsaw-packages2/whls/' 'clean-text'
!uv pip install --system --no-index -U --no-deps --find-links='/kaggle/input/jigsaw-packages2/whls/' 'peft' 'accelerate' 'datasets'

# 1. Qwen3 0.6B Embedding Model

In [None]:
import os
import pandas as pd

In [None]:
%%writefile constants.py
# Choose your embedding model (uncomment one):
# Option 1: Original Qwen3 0.6B (larger, potentially better)
# Base model directory; semantic.py loads it when USE_CUSTOM_EMBEDDING is True.
EMBEDDING_MODEL_PATH = "/kaggle/input/qwen-3-embedding/transformers/0.6b/1"
# Directory of the PEFT adapter that semantic.py merges into the base model.
# NOTE(review): the dataset name says "8b" while the base model above is 0.6B —
# confirm the adapter was trained for this base model.
MODEL_OUTPUT_PATH = '/kaggle/input/qwen3-8b-embedding'
# True  -> base model + adapter (Option 1).
# False -> the SentenceTransformer named by EMBEDDING_MODEL_NAME (Option 2).
USE_CUSTOM_EMBEDDING = True

# Option 2: Lightweight alternatives (much smaller, faster) - uncomment to use
# To switch, uncomment exactly one EMBEDDING_MODEL_NAME line plus the
# USE_CUSTOM_EMBEDDING = False line. Keep the Option 1 names defined:
# semantic.py imports them unconditionally.
# EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # 22.7M params, ~90MB
# EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"  # 109M params, ~420MB
# EMBEDDING_MODEL_NAME = "BAAI/bge-small-en-v1.5"  # 33.4M params, ~130MB
# EMBEDDING_MODEL_NAME = "intfloat/e5-small-v2"  # 33.4M params, ~130MB
# USE_CUSTOM_EMBEDDING = False

# Directory holding the competition's train.csv and test.csv.
DATA_PATH = "/kaggle/input/jigsaw-agile-community-rules"

# Embedding search parameters
# Instruction prefix passed as `prompt=` when encoding queries with the custom
# model; corpus documents are encoded without it.
EMBEDDING_MODEL_QUERY = "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery:"
# When True, utils.prepare_dataframe runs cleaner() over every prompt.
CLEAN_TEXT = True
# Maximum number of corpus hits kept per query by semantic_search.
TOP_K = 2000
# Batch size passed to SentenceTransformer.encode.
BATCH_SIZE = 128

In [None]:
%%writefile utils.py
import pandas as pd
import torch.distributed as dist

from datasets import Dataset
from cleantext import clean
from tqdm.auto import tqdm

from constants import CLEAN_TEXT


def build_prompt(row):
    """Render one row as the text that gets embedded.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing the
            ``"subreddit"`` and ``"body"`` keys.

    Returns:
        A two-line string: ``r/<subreddit>`` followed by ``Comment: <body>``.
    """
    subreddit = row["subreddit"]
    comment = row["body"]
    return f"r/{subreddit}\nComment: {comment}"


def cleaner(text):
    """Normalise *text* with ``cleantext.clean`` using a fixed option set.

    The options request unicode fixing and ASCII transliteration, and ask for
    URLs, e-mail addresses and phone numbers to be replaced by the
    ``<URL>``, ``<EMAIL>`` and ``<PHONE>`` placeholders. Lower-casing and the
    removal of line breaks, numbers, digits, currency symbols and punctuation
    are all switched off.

    Args:
        text: The string to clean.

    Returns:
        Whatever ``clean`` returns for *text* under these options.
    """
    options = {
        # Character-level normalisation.
        "fix_unicode": True,
        "to_ascii": True,
        "lower": False,
        "no_line_breaks": False,
        # Entities that are swapped for placeholders.
        "no_urls": True,
        "no_emails": True,
        "no_phone_numbers": True,
        # Content that is deliberately left untouched.
        "no_numbers": False,
        "no_digits": False,
        "no_currency_symbols": False,
        "no_punct": False,
        # Placeholder tokens.
        "replace_with_url": "<URL>",
        "replace_with_email": "<EMAIL>",
        "replace_with_phone_number": "<PHONE>",
        "lang": "en",
    }
    return clean(text, **options)



def get_dataframe_to_train(data_path):
    """Assemble the labelled corpus from train.csv plus test.csv examples.

    The corpus is the labelled rows of ``train.csv`` followed by the
    positive/negative example comments attached to a fixed 60% sample
    (``random_state=42``) of ``test.csv`` rows. Positive examples are
    labelled 1 and negative examples 0. Exact duplicate rows are removed.

    Args:
        data_path: Directory containing ``train.csv`` and ``test.csv``.

    Returns:
        DataFrame with columns ``body``, ``rule``, ``subreddit`` and
        ``rule_violation`` and a fresh 0..n-1 index.
    """
    labelled = pd.read_csv(f"{data_path}/train.csv")
    sampled = pd.read_csv(f"{data_path}/test.csv")
    sampled = sampled.sample(frac=0.6, random_state=42).reset_index(drop=True)

    frames = [labelled[["body", "rule", "subreddit", "rule_violation"]]]

    # Iteration order (positive before negative, example 1 before 2) fixes
    # the row order of the concatenated corpus.
    label_by_kind = {"positive": 1, "negative": 0}
    for kind, label in label_by_kind.items():
        for slot in (1, 2):
            example_column = f"{kind}_example_{slot}"
            examples = (
                sampled[[example_column, "rule", "subreddit"]]
                .rename(columns={example_column: "body"})
                .assign(rule_violation=label)
            )
            frames.append(examples)

    return pd.concat(frames, axis=0).drop_duplicates(ignore_index=True)


def prepare_dataframe(dataframe):
    """Add the ``prompt`` column and convert labels to signed weights.

    The frame is modified in place and also returned.

    Args:
        dataframe: Frame with at least the ``subreddit`` and ``body``
            columns; ``rule_violation`` is optional.

    Returns:
        The same frame with a ``prompt`` column (cleaned when ``CLEAN_TEXT``
        is set) and, if present, ``rule_violation`` remapped from 1/0 to
        +1/-1.
    """
    prompts = dataframe.apply(build_prompt, axis=1)
    dataframe["prompt"] = prompts

    if CLEAN_TEXT:
        tqdm.pandas(desc="cleaner")
        dataframe["prompt"] = dataframe["prompt"].progress_apply(cleaner)

    has_labels = "rule_violation" in dataframe.columns
    if has_labels:
        # Signed labels let similarity scores be summed directly: violating
        # neighbours add to the score, non-violating ones subtract from it.
        signed_label = {1: 1, 0: -1}
        dataframe["rule_violation"] = dataframe["rule_violation"].map(signed_label)

    return dataframe

In [None]:
%%writefile semantic.py
import pandas as pd
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import semantic_search, dot_score
from tqdm.auto import tqdm
from peft import PeftModel, PeftConfig


from utils import get_dataframe_to_train, prepare_dataframe
from constants import DATA_PATH, EMBEDDING_MODEL_PATH, EMBEDDING_MODEL_QUERY, TOP_K, BATCH_SIZE, MODEL_OUTPUT_PATH, USE_CUSTOM_EMBEDDING



def _load_embedding_model():
    """Build the SentenceTransformer used for both queries and corpus documents."""
    if not USE_CUSTOM_EMBEDDING:
        # Use lightweight model. Imported lazily because constants.py only
        # defines this name when a lightweight model has been selected.
        from constants import EMBEDDING_MODEL_NAME
        return SentenceTransformer(EMBEDDING_MODEL_NAME, device="cuda")

    # Load base model
    model = AutoModelForCausalLM.from_pretrained(EMBEDDING_MODEL_PATH)
    tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_PATH)

    # Load adapter configuration and model, then fold the adapter weights
    # into the base model so it can be saved as a plain checkpoint.
    adapter_config = PeftConfig.from_pretrained(MODEL_OUTPUT_PATH)
    lora_model = PeftModel.from_pretrained(model, MODEL_OUTPUT_PATH, config=adapter_config)
    merged_model = lora_model.merge_and_unload()
    tokenizer.save_pretrained("Qwen3Emb_Finetuned")
    merged_model.save_pretrained("Qwen3Emb_Finetuned")

    # Create SentenceTransformer from merged encoder
    return SentenceTransformer(model_name_or_path="Qwen3Emb_Finetuned", device="cuda")


def _signed_similarity_sum(hits, labels):
    """Sum ``score * label`` over one query's hits (``corpus_id`` indexes *labels*)."""
    if not hits:
        return 0.0
    frame = pd.DataFrame(hits)
    signs = labels[frame["corpus_id"].to_numpy()]
    # Series.sum skips NaN, so corpus rows with an unmapped label add nothing.
    return (frame["score"] * signs).sum()


def get_scores(test_dataframe):
    """Score every test row by signed similarity to labelled corpus comments.

    For each distinct rule, the test prompts and the corpus prompts for that
    rule are embedded, the top ``TOP_K`` corpus hits per test prompt are
    retrieved, and each hit's similarity is multiplied by the corpus row's
    signed label before summing. Embeddings are normalised, so the dot-product
    score is the cosine similarity.

    Args:
        test_dataframe: Prepared test frame with ``row_id``, ``rule`` and
            ``prompt`` columns.

    Returns:
        DataFrame with ``row_id`` and ``rule_violation`` (the signed
        similarity sum), concatenated rule by rule. Rules with no corpus rows
        receive a neutral score of 0.0.
    """
    corpus_dataframe = get_dataframe_to_train(DATA_PATH)
    corpus_dataframe = prepare_dataframe(corpus_dataframe)

    embedding_model = _load_embedding_model()
    print('Done loading model!')

    # Encoding options are identical for every rule, so build them once.
    document_options = {
        "batch_size": BATCH_SIZE,
        "show_progress_bar": True,
        "convert_to_tensor": True,
        "device": "cuda",
        "normalize_embeddings": True,
    }
    query_options = dict(document_options)
    if USE_CUSTOM_EMBEDDING:
        # Only the custom model gets the instruction prefix, and only for queries.
        query_options["prompt"] = EMBEDDING_MODEL_QUERY

    result = []
    for rule in tqdm(test_dataframe["rule"].unique(), desc="Generate scores for each rule"):
        test_dataframe_part = test_dataframe.query("rule == @rule").reset_index(drop=True)
        corpus_dataframe_part = corpus_dataframe.query("rule == @rule").reset_index(drop=True)

        if corpus_dataframe_part.empty:
            # The corpus only holds train rows plus a 60% sample of test
            # examples, so a rule can end up with nothing to compare against.
            # Emit a neutral score instead of failing the whole run.
            test_dataframe_part["rule_violation"] = 0.0
            result.append(test_dataframe_part[["row_id", "rule_violation"]].copy())
            continue

        # Encode embeddings
        query_embeddings = embedding_model.encode(
            sentences=test_dataframe_part["prompt"].tolist(),
            **query_options,
        )
        document_embeddings = embedding_model.encode(
            sentences=corpus_dataframe_part["prompt"].tolist(),
            **document_options,
        )

        hits_per_query = semantic_search(
            query_embeddings,
            document_embeddings,
            top_k=TOP_K,
            score_function=dot_score,
        )

        # corpus_id is the position of the document in the encoded list, which
        # matches the row position after reset_index, so labels can be looked
        # up by position instead of merging a DataFrame per query.
        labels = corpus_dataframe_part["rule_violation"].to_numpy()
        test_dataframe_part["rule_violation"] = [
            _signed_similarity_sum(hits, labels)
            for hits in tqdm(hits_per_query, desc=f"Add label for {rule=}")
        ]
        result.append(test_dataframe_part[["row_id", "rule_violation"]].copy())

    submission = pd.concat(result, axis=0)
    return submission


def generate_submission():
    """Score test.csv and write the rank-normalised submission file.

    Reads ``test.csv`` from ``DATA_PATH``, scores it with ``get_scores``,
    replaces the raw scores by their average rank divided by ``n + 1``, and
    writes ``row_id`` / ``rule_violation`` to
    ``/kaggle/working/submission.csv``.
    """
    test_dataframe = prepare_dataframe(pd.read_csv(f"{DATA_PATH}/test.csv"))

    scores = get_scores(test_dataframe)
    # Left-merge onto the test row ids so the output follows test.csv's order.
    submission = test_dataframe[["row_id"]].merge(scores, on="row_id", how="left")

    # Rank normalize the predictions
    ranks = submission["rule_violation"].rank(method="average")
    submission["rule_violation"] = ranks / (len(submission) + 1)

    submission.to_csv("/kaggle/working/submission.csv", index=False)
    print("✅ Saved submission.csv using Qwen3 0.6B embedding model only")


# Script entry point: the notebook runs this file with `!python semantic.py`.
if __name__ == "__main__":
    generate_submission()

In [None]:
!python semantic.py

In [None]:
!head /kaggle/working/submission.csv

In [None]:
import pandas as pd
pd.read_csv('/kaggle/working/submission.csv')