## **Kaggle Submission Generation**

In [3]:
import json
import csv
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import gdown
import torch

# Progress banner shown when the data-loading cell starts running.
print("Loading data")

def download_and_load_json(file_id, filename):
    """Download a file from Google Drive and parse it as JSON.

    Args:
        file_id: Google Drive file ID; interpolated into the download URL.
        filename: Local path the download is written to and then read back from.

    Returns:
        The object produced by ``json.load`` on the downloaded file.

    Raises:
        RuntimeError: If ``gdown.download`` signals failure by returning ``None``.
    """
    url = f'https://drive.google.com/uc?id={file_id}'
    # gdown can return None instead of raising when a download fails
    # (version-dependent). Without this check a stale local copy of `filename`
    # left over from an earlier run would be parsed silently.
    if gdown.download(url, filename, quiet=False) is None:
        raise RuntimeError(
            f"Failed to download Google Drive file {file_id!r} to {filename!r}"
        )
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

def download_and_load_npz(file_id, filename):
    """Download a file from Google Drive and load it with numpy.

    Args:
        file_id: Google Drive file ID; interpolated into the download URL.
        filename: Local path the download is written to and then loaded from.

    Returns:
        Whatever ``np.load(filename)`` returns for the downloaded file.

    Raises:
        RuntimeError: If ``gdown.download`` signals failure by returning ``None``.
    """
    url = f'https://drive.google.com/uc?id={file_id}'
    # gdown can return None instead of raising when a download fails
    # (version-dependent). Without this check a stale local copy of `filename`
    # left over from an earlier run would be loaded silently.
    if gdown.download(url, filename, quiet=False) is None:
        raise RuntimeError(
            f"Failed to download Google Drive file {file_id!r} to {filename!r}"
        )
    return np.load(filename)


# Google Drive file IDs of the three inputs fetched below
TEST_QUERIES_ID = "1JWALu22TVYqsjgZVu3-osV4MkHHSzTpm"
DOCS_ID = "1k-1lwZG6j0tRN1w-fdfF7XL3TAeP06Dj"
DOC_EMB_ID = "1ZWMQnVRFAwvt7RFW5lPpqXs6Cox8tXLO"

# Fetch the document list, the test queries, and the document embedding matrix
documents = download_and_load_json(DOCS_ID, 'docs_processed.json')
queries_test = download_and_load_json(TEST_QUERIES_ID, 'queries_test.json')
doc_embeddings = download_and_load_npz(DOC_EMB_ID, 'embeddings.npz')['doc_embeddings']

# Two-way lookup between a document's position in `documents` and its 'id'.
# NOTE(review): assumes row i of doc_embeddings corresponds to documents[i] — confirm.
doc_idx_to_id = dict(enumerate(doc['id'] for doc in documents))
doc_id_to_idx = {doc_id: idx for idx, doc_id in doc_idx_to_id.items()}

AttributeError: partially initialized module 'torch' has no attribute '_subclasses' (most likely due to a circular import)

In [44]:
# Prepare test query contents
# Apply same preprocessing as training queries

def merge_query_fields(query):
    """Merge a query's title, text and tags into one space-separated string.

    Args:
        query: Mapping that may contain 'title', 'text' and 'tags'. Title and
            text are used when they are non-empty after stripping; 'tags' is
            used only when it is a list, keeping its non-empty stripped items.

    Returns:
        The stripped title, stripped text and space-joined tags, joined by
        single spaces in that order. When none of them contributes, the raw
        'text' value is returned, or '' when 'text' is missing or None, so the
        result is always a string.
    """
    parts = []

    if query.get('title') and query['title'].strip():
        parts.append(query['title'].strip())

    if query.get('text') and query['text'].strip():
        parts.append(query['text'].strip())

    if query.get('tags') and isinstance(query['tags'], list):
        valid_tags = [tag.strip() for tag in query['tags'] if tag and tag.strip()]
        if valid_tags:
            parts.append(' '.join(valid_tags))

    if parts:
        return ' '.join(parts)

    # `query.get('text', '')` would hand back None when the key exists with a
    # None value; coerce that to '' so callers never receive a non-string.
    return query.get('text') or ''



# Attach the merged text to every test query (stored under 'content') and
# collect the same strings, in query order, for the encoder.
query_test_contents = []
for query in queries_test:
    query['content'] = merge_query_fields(query)
    query_test_contents.append(query['content'])

print(f"Preprocessed {len(query_test_contents)} test queries")


Preprocessed 141 test queries


In [45]:
# Encode the merged test-query strings into dense vectors
# (original note: embeddings were used since they performed best)
print("\nGenerating embeddings for test queries...")

# Run on the GPU when torch reports one, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# NOTE(review): this checkpoint must be the one that produced doc_embeddings,
# otherwise the cosine similarities computed later are meaningless — confirm.
model = SentenceTransformer('all-MiniLM-L6-v2')
model = model.to(device)

encode_options = dict(
    batch_size=128,
    show_progress_bar=True,
    convert_to_numpy=True,
    device=device,
)
query_test_embeddings = model.encode(query_test_contents, **encode_options)

print(f"Test query embeddings shape : {query_test_embeddings.shape}")



Generating embeddings for test queries...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Test query embeddings shape : (141, 384)


In [46]:
# Rank documents for every test query and keep the k best
k = 100  # NOTE(review): original comment says this matches the k used in evaluation — confirm

print(f"Retrieving top-{k} documents for each test query :")

# Queries are scored in small batches so only a (batch x num_docs) similarity
# matrix is materialised at a time.
batch_size_queries = 10
num_test_queries = len(query_test_embeddings)

ranked_index_rows = []
ranked_score_rows = []

for batch_start in tqdm(range(0, num_test_queries, batch_size_queries),
                        desc="Processing test queries"):
    # Slicing clamps at the end of the array, so the last batch may be shorter
    query_batch = query_test_embeddings[batch_start:batch_start + batch_size_queries]

    # One row of cosine similarities per query in the batch
    similarities = cosine_similarity(query_batch, doc_embeddings)

    for sim_scores in similarities:
        # Ascending argsort reversed -> document positions from best to worst
        descending_order = np.argsort(sim_scores)[::-1]
        top_k_idx = descending_order[:k]

        ranked_index_rows.append(top_k_idx)
        ranked_score_rows.append(sim_scores[top_k_idx])

topk_indices_test = np.array(ranked_index_rows)
topk_scores_test = np.array(ranked_score_rows)

print(f"\nRetrieval complete")
print(f"  Shape: {topk_indices_test.shape}")


Retrieving top-100 documents for each test query :


Processing test queries: 100%|██████████| 15/15 [00:08<00:00,  1.73it/s]


Retrieval complete
  Shape: (141, 100)





In [47]:
print("Generating Submission CSV")

submission_filename = 'solutions.csv'

with open(submission_filename, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # Header row
    writer.writerow(['query_id', 'relevant_doc_ids', 'category'])

    # One row per test query, in the same order as queries_test
    for query_idx, query in enumerate(queries_test):
        # Translate the ranked document positions back into document IDs
        retrieved_doc_ids = []
        for doc_position in topk_indices_test[query_idx]:
            retrieved_doc_ids.append(doc_idx_to_id[doc_position])

        # The ID list is stored as a JSON string; category stays empty in Phase 1
        writer.writerow([query['id'], json.dumps(retrieved_doc_ids), ""])

print(f"\nSubmission file created: {submission_filename}")
print(f"  Total queries: {len(queries_test)}")
print(f"  Documents per query: {k}")


Generating Submission CSV

Submission file created: solutions.csv
  Total queries: 141
  Documents per query: 100


In [48]:
print("Validating Submission Format")

# Column names must match the header row written to the submission file
# ('query_id', 'relevant_doc_ids', 'category'). The previous check looked for
# 'queryID' / 'relevantIDs', which never exist in this file, so every row was
# flagged as missing columns and the JSON/category checks never ran.
required_columns = ('query_id', 'relevant_doc_ids', 'category')

# Read back and validate (newline='' is what the csv module expects for files)
with open(submission_filename, 'r', newline='', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    rows = list(reader)

print(f"\nSubmission has {len(rows)} rows")

# Check first few rows
print("\nFirst 3 rows:")
for i, row in enumerate(rows[:3], 1):
    query_id = row['query_id']
    relevant_ids = json.loads(row['relevant_doc_ids'])
    category = row['category']

    print(f"  Query {i}:")
    print(f"    ID: {query_id}")
    print(f"    Num docs: {len(relevant_ids)}")
    print(f"    First 3 docs: {relevant_ids[:3]}")
    print(f"    Category: '{category}' (empty for Phase 1)")

# Validate format
errors = []
for i, row in enumerate(rows, 1):
    # DictReader fills fields missing from a short row with None, so test the
    # value rather than only key presence.
    if any(row.get(column) is None for column in required_columns):
        errors.append(f"Row {i} : Missing required columns")
        continue

    # Check relevant_doc_ids is a JSON-encoded list
    try:
        doc_ids = json.loads(row['relevant_doc_ids'])
        if not isinstance(doc_ids, list):
            errors.append(f"Row {i} : relevant_doc_ids is not a list")
    except json.JSONDecodeError:
        errors.append(f"Row {i} : relevant_doc_ids is not valid JSON")

    # Check category is empty for Phase 1
    if row['category'] != "":
        errors.append(f"Row {i} : category should be empty for Phase 1")

if errors:
    print(f"\nFound {len(errors)} validation errors:")
    for error in errors[:10]:  # Show first 10 errors
        print(f"  - {error}")
else:
    print("\nSubmission file is valid")
    print(f"   Ready to upload to Kaggle")


Validating Submission Format

Submission has 141 rows

First 3 rows:
  Query 1:
    ID: 4ffe16bc-5235-418d-9bf3-22d1f2c5796e_145437
    Num docs: 100
    First 3 docs: ['66e23752-4a83-40b9-a86f-56b00315f908_229423', 'bb3946aa-b30f-434a-a261-9c49ca4bb415_220048', 'eea32dee-6c8b-4ae5-882c-66fe3ae6c2c7_213799']
    Category: '' (empty for Phase 1)
  Query 2:
    ID: 1bb2bb20-7f45-4dcf-a94a-420c454f87b8_56473
    Num docs: 100
    First 3 docs: ['ff80cc70-4393-44de-a57c-e0323838616b_139126', '87b74f66-8610-4c15-b383-be713e20bd62_72231', '0a48b7e7-fc35-4cb3-868b-d31268e22951_146153']
    Category: '' (empty for Phase 1)
  Query 3:
    ID: 6a9a342c-1275-4bb3-a818-8bcce53fac4f_34507
    Num docs: 100
    First 3 docs: ['43fa6e5a-6ad6-4701-ba44-68b513409ffc_17053', 'c0f28c96-6697-4941-99e1-e015509eaf07_57255', '1d571334-0977-4506-bb3c-43c0c100f116_28030']
    Category: '' (empty for Phase 1)

Submission file is valid
   Ready to upload to Kaggle


In [49]:
print("Submission Statistics")

# Best (rank-1) similarity of every test query
top1_scores = topk_scores_test[:, 0]

# Mean similarity over the first column, the first ten columns, and all columns
mean_score_top1 = np.mean(top1_scores)
mean_score_top10 = np.mean(topk_scores_test[:, :10])
mean_score_all = np.mean(topk_scores_test)

print(f"\nRetrieval scores (cosine similarity) :")
print(f"  Mean top-1 score : {mean_score_top1:.4f}")
print(f"  Mean top-10 score : {mean_score_top10:.4f}")
print(f"  Mean top-100 score : {mean_score_all:.4f}")

# Five-number summary of the rank-1 scores
print(f"\nScore distribution (top-1) :")
print(f"  Min : {np.min(top1_scores):.4f}")
print(f"  25th percentile : {np.percentile(top1_scores, 25):.4f}")
print(f"  Median : {np.median(top1_scores):.4f}")
print(f"  75th percentile : {np.percentile(top1_scores, 75):.4f}")
print(f"  Max : {np.max(top1_scores):.4f}")


Submission Statistics

Retrieval scores (cosine similarity) :
  Mean top-1 score : 0.6248
  Mean top-10 score : 0.5596
  Mean top-100 score : 0.4717

Score distribution (top-1) :
  Min : 0.4790
  25th percentile : 0.5719
  Median : 0.6190
  75th percentile : 0.6665
  Max : 0.8090
