In [1]:
import importlib

import numpy as np
import pandas as pd

from src.preprocess.fasttext_vectors import load_fasttext_vectors, tokenize
from src.models.fasttext import FastTextModelSpec, load_fasttext_model
from src.config.paths import (
    poleval2022_questions_path,
    poleval2022_subdataset_dir,
)
from src.preprocess.tf_idf_vectors import read_jsonl

import src.eval.retrieval_eval as retrieval_eval
# Re-execute the already-imported module so that edits made to it on disk are
# picked up in this kernel session.
importlib.reload(retrieval_eval)

# Bind the helpers only after the reload, so these notebook-level names point at
# the reloaded definitions rather than at the ones from the first import.
build_faiss_ivfpq_ip_index = retrieval_eval.build_faiss_ivfpq_ip_index
embed_fasttext_avg = retrieval_eval.embed_fasttext_avg
retrieve_dense_faiss_topk = retrieval_eval.retrieve_dense_faiss_topk


In [2]:
import logging
import sys

# Send log records to stdout so they show up inline with the cell output.
stdout_handler = logging.StreamHandler(sys.stdout)

# force=True replaces any handlers already attached to the root logger, which
# keeps re-running this cell from stacking up duplicate handlers.
logging.basicConfig(
    format="%(levelname)s:%(name)s:%(message)s",
    level=logging.INFO,
    handlers=[stdout_handler],
    force=True,
)


In [3]:
# Load the FastText vector bundle registered under "wiki-trivia".
# Later cells index it as a mapping: ft["vectors"] (2-D array, see the shape
# printout in the next cell) and ft["passage_ids"].
ft = load_fasttext_vectors("wiki-trivia")

In [4]:
# The vectors are a float32 memmap; report how much RAM they would occupy if
# they were fully materialized in memory.
vectors = ft["vectors"]
n, d = vectors.shape
raw_size_gb = n * d * 4 / 1e9  # 4 bytes per float32 element, decimal gigabytes

print("shape:", (n, d))
print("dtype:", vectors.dtype)
print("raw size (float32):", round(raw_size_gb, 2), "GB")


shape: (6639839, 300)
dtype: float32
raw size (float32): 7.97 GB


In [None]:
# Dense FastText retrieval via FAISS (cosine = inner product on L2-normalized vectors)
#
# NOTE(review): the saved notebook shows no execution count for this cell --
# confirm it runs to completion on a fresh kernel (Restart & Run All).

# Load the same FastText model used for passage embeddings
fasttext_spec = FastTextModelSpec(lang="pl", dim=300)
model = load_fasttext_model(spec=fasttext_spec)

# Settings for a RAM-friendly FAISS index (IVF-PQ)
ivfpq_params = dict(
    nlist=4096,
    m=30,  # 300 dims / 30 = 10 dims per subquantizer
    nbits=8,
    train_size=200_000,
    chunk_size=200_000,
)
index = build_faiss_ivfpq_ip_index(passage_vectors=ft["vectors"], **ivfpq_params)

# Tune recall/speed via `index.nprobe` (higher = better recall, slower)
index.nprobe = 32


INFO:src.models.fasttext:Loading fastText model from: /home/mateusz/dev/inl_pjatk_project/.cache/fasttext/cc.pl.300.bin
INFO:faiss.loader:Loading faiss with AVX2 support.
INFO:faiss.loader:Successfully loaded faiss with AVX2 support.


: 

In [None]:
def retrieve_topk_fasttext(question_texts: list[str], k: int = 10) -> np.ndarray:
    """Retrieve the top-``k`` passages for each question with FastText + FAISS.

    The questions are embedded with ``embed_fasttext_avg`` using the
    notebook-level ``model`` and ``tokenize``, then looked up in the
    notebook-level ``index``. ``ft["passage_ids"]`` is handed to
    ``retrieve_dense_faiss_topk`` alongside the query vectors.

    Args:
        question_texts: Question strings, one result row per question.
        k: Number of passages to retrieve per question.

    Returns:
        The value produced by ``retrieve_dense_faiss_topk``; callers index it
        by question position (``result[0]`` is the ranking for the first
        question).
    """
    query_vectors = embed_fasttext_avg(
        texts=question_texts,
        model=model,
        tokenize=tokenize,
    )
    topk_per_question = retrieve_dense_faiss_topk(
        query_vectors=query_vectors,
        index=index,
        passage_ids=ft["passage_ids"],
        k=k,
    )
    return topk_per_question


INFO:src.models.fasttext:Loading fastText model from: /home/mateusz/dev/inl_pjatk_project/.cache/fasttext/cc.pl.300.bin
INFO:faiss.loader:Loading faiss with AVX2 support.
INFO:faiss.loader:Successfully loaded faiss with AVX2 support.


: 

In [5]:
# Which PolEval 2022 slice to work with.
dataset_id = "piotr-rybak__poleval2022-passage-retrieval-dataset"
subdataset = "wiki-trivia"
split = "train"  # "train" or "test"

# Resolve on-disk locations through the project's path helpers.
subdataset_dir = poleval2022_subdataset_dir(dataset_id, subdataset)
questions_path = poleval2022_questions_path(dataset_id, subdataset, split)

# Index the questions by their "id" column so a question can be looked up
# with questions_df.loc[<id>, ...].
questions_raw = read_jsonl(questions_path)
questions_df = questions_raw.set_index("id")

In [6]:
# Example: top-k for a single question (FastText cosine)
k = 10
qid = 12
question_text = str(questions_df.loc[qid, "text"])

# The retriever works on batches, so wrap the question in a one-element list
# and take the first (only) row of the result.
batch_rankings = retrieve_topk_fasttext([question_text], k=k)
top_ids = batch_rankings[0]

# Show the retrieved ids labelled with their 1-based rank.
ranks = np.arange(1, k + 1)
pd.Series(ranks, index=top_ids, name="rank")

3606778-4     1
116612-0      2
1525522-1     3
1266498-6     4
1266498-5     5
42520-9       6
3606778-0     7
4453578-0     8
9824-3        9
148099-8     10
Name: rank, dtype: int64

In [7]:
# Full evaluation + TSV (same helper as TF-IDF)
from src.eval.retrieval_eval import evaluate_and_write_submission

dataset_id = "piotr-rybak__poleval2022-passage-retrieval-dataset"
subdataset = "wiki-trivia"
split = "test"
k = 10


def fasttext_retriever(texts, k):
    """Adapter exposing ``retrieve_topk_fasttext`` with a ``(texts, k)`` signature."""
    return retrieve_topk_fasttext(texts, k=k)


# NOTE(review): with out_path=None the recorded run wrote to a file named
# "tfidf_wiki-trivia_questions-test.tsv" -- presumably the helper's default
# name. Confirm this does not overwrite the TF-IDF submission, or pass an
# explicit FastText-specific out_path.
result = evaluate_and_write_submission(
    dataset_id=dataset_id,
    subdataset=subdataset,
    questions_split=split,
    pairs_split=split,  # set None if you want "submission only"
    k=k,
    out_path=None,  # or set your own path
    retriever=fasttext_retriever,
)

print("Wrote:", result.out_path)

# Metrics are only reported when hits_at_k was computed (i.e. is not None).
metrics_available = result.hits_at_k is not None
if metrics_available:
    print(f"Hits@{k}:      {result.hits_at_k:.4f}")
    print(f"Recall@{k}:    {result.recall_at_k:.4f}")
    print(f"Precision@{k}: {result.precision_at_k:.4f}")
    print(f"MRR@{k}:       {result.mrr_at_k:.4f}")
    print(f"nDCG@{k}:      {result.ndcg_at_k:.4f}")

result

INFO:src.eval.retrieval_eval:Starting retrieval: subdataset=wiki-trivia questions_split=test k=10
INFO:src.eval.retrieval_eval:Loaded 1291 questions
INFO:src.eval.retrieval_eval:Wrote TSV: /home/mateusz/dev/inl_pjatk_project/.cache/submissions/tfidf_wiki-trivia_questions-test.tsv
INFO:src.eval.retrieval_eval:Loaded relevance labels for 1291 questions
INFO:src.eval.retrieval_eval:Metrics@10: Hits=0.0294 Recall=0.0119 Precision=0.0030 MRR=0.0138 nDCG=0.0091


Wrote: /home/mateusz/dev/inl_pjatk_project/.cache/submissions/tfidf_wiki-trivia_questions-test.tsv
Hits@10:      0.0294
Recall@10:    0.0119
Precision@10: 0.0030
MRR@10:       0.0138
nDCG@10:      0.0091


EvalResult(out_path=PosixPath('/home/mateusz/dev/inl_pjatk_project/.cache/submissions/tfidf_wiki-trivia_questions-test.tsv'), k=10, n_questions=1291, hits_at_k=0.02943454686289698, recall_at_k=0.011851278079008517, precision_at_k=0.0030209140201394287, mrr_at_k=0.013778540075983918, ndcg_at_k=0.009109027092527448, n_labeled=1291)