In [1]:
import json
import sys
import heapq
import math
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import pandas as pd
import numpy as np

In [2]:
# Ensure repo root is on sys.path (notebook usually runs from notebooks/).
# If the working directory has no `src/` folder but its parent does, the parent
# is used as the repository root instead.
REPO_ROOT = Path.cwd().resolve()
if not (REPO_ROOT / "src").is_dir() and (REPO_ROOT.parent / "src").is_dir():
    REPO_ROOT = REPO_ROOT.parent
# Guarded so that re-running this cell does not stack duplicate sys.path entries.
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

from src.config.paths import (
    poleval2022_pairs_path,
    poleval2022_passages_path,
    poleval2022_questions_path,
    poleval2022_subdataset_dir,
)

def read_jsonl(path: Path, max_rows: int | None = None) -> pd.DataFrame:
    rows: list[dict] = []
    with path.open("r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if max_rows is not None and i >= max_rows:
                break
            line = line.strip()
            if not line:
                continue
            rows.append(json.loads(line))
    return pd.DataFrame(rows)

In [3]:
# Which dataset / subset / split this notebook analyses.
dataset_id = "piotr-rybak__poleval2022-passage-retrieval-dataset"
subdataset = "wiki-trivia"
split = "train"  # "train" or "test"

# Resolve all input locations through the project's path helpers.
subdataset_dir = poleval2022_subdataset_dir(dataset_id, subdataset)
questions_path = poleval2022_questions_path(dataset_id, subdataset, split)
pairs_path = poleval2022_pairs_path(dataset_id, subdataset, split=split)
passages_path = poleval2022_passages_path(dataset_id, subdataset)

print("Subdataset dir:", subdataset_dir)
print("Questions:", questions_path)
print("Pairs:", pairs_path)
print("Passages:", passages_path)

questions_df = read_jsonl(questions_path)

# Question/passage pairs come from a tab-separated file; the hyphenated headers
# are renamed to snake_case and `score` is forced to numeric (unparseable
# values become NaN because of errors="coerce").
pairs_df = (
    pd.read_csv(pairs_path, sep="\t")
    .rename(columns={"question-id": "question_id", "passage-id": "passage_id"})
    .assign(score=lambda frame: pd.to_numeric(frame["score"], errors="coerce"))
)

# Quick sanity summary of what was loaded.
print("questions:", len(questions_df), "unique ids:", questions_df["id"].nunique())
print(
    "pairs:", len(pairs_df),
    "unique questions:", pairs_df["question_id"].nunique(),
    "unique passages:", pairs_df["passage_id"].nunique(),
)

questions_df.head(3)

Subdataset dir: /home/mateusz/dev/inl_pjatk_project/.cache/data/piotr-rybak__poleval2022-passage-retrieval-dataset/wiki-trivia
Questions: /home/mateusz/dev/inl_pjatk_project/.cache/data/piotr-rybak__poleval2022-passage-retrieval-dataset/wiki-trivia/questions-train.jl
Pairs: /home/mateusz/dev/inl_pjatk_project/.cache/data/piotr-rybak__poleval2022-passage-retrieval-dataset/wiki-trivia/pairs-train.tsv
Passages: /home/mateusz/dev/inl_pjatk_project/.cache/data/piotr-rybak__poleval2022-passage-retrieval-dataset/wiki-trivia/passages.jl
questions: 5000 unique ids: 5000
pairs: 16389 unique questions: 5000 unique passages: 15320


Unnamed: 0,id,text
0,1,Jak nazywa się pierwsza litera alfabetu grecki...
1,2,Jak nazywa się dowolny odcinek łączący dwa pun...
2,4,Czy w państwach starożytnych powoływani byli p...


In [4]:
# Load every passage, then drop rows whose text starts with "REDIRECT" or
# "PATRZ". Rows with missing text are kept: na=False marks them as
# "does not start with", and the mask is the negation of that.
passages_df = read_jsonl(passages_path)
mask = ~passages_df["text"].str.startswith(("REDIRECT", "PATRZ"), na=False)
passages_df = passages_df.loc[mask].reset_index(drop=True)

In [5]:
# Fit TF-IDF on "<title> <text>" of every retained passage (missing values are
# treated as empty strings). `passages` is the resulting document-term matrix.
vectorizer = TfidfVectorizer(min_df=5, max_df=0.9, max_features=500_000, dtype=np.float32)
corpus = passages_df["title"].fillna("").add(" ").add(passages_df["text"].fillna(""))
passages = vectorizer.fit_transform(corpus)

In [6]:
# Index questions by "id" so later cells can look a question up with .loc[qid].
# Guarded so that re-running this cell does not raise KeyError once "id" has
# already been moved from the columns into the index.
if questions_df.index.name != "id":
    questions_df = questions_df.set_index("id")

In [7]:
# Cast the index labels to int so that lookups with integer question ids
# (e.g. questions_df.loc[12, "text"]) match.
questions_df.index = questions_df.index.astype(int)

In [8]:
def topk_cosine_sparse_rows(q_vec, X, topk=5, chunk_size=50_000, assume_l2_normalized=True, eps=1e-12):
    """
    Score every row of X against one query vector and keep the best `topk`.

    X is scanned in row chunks of `chunk_size`; each chunk contributes its local
    top candidates to a bounded min-heap, so only one chunk of dense scores is
    held in memory at a time.

    q_vec: (1, D) sparse
    X:     (N, D) sparse
    topk:  number of rows to return; values <= 0 yield an empty list
    chunk_size: number of rows of X scored per step (must be positive)
    assume_l2_normalized: if True the raw dot product is used as the score
        (this equals cosine similarity when q_vec and the rows of X have unit
        L2 norm); if False the dot products are divided by the L2 norms
    eps: added to every norm to avoid division by zero
    Returns: list[(score, row_index)] sorted desc
    Raises: ValueError if chunk_size is not positive
    """
    if chunk_size <= 0:
        # A negative step would make range() empty and silently return nothing.
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if topk <= 0:
        return []

    heap = []  # min-heap holding the best (score, row_index) pairs seen so far

    # Loop-invariant: transpose the query once instead of once per chunk.
    q_col = q_vec.T

    # only needed if not normalized
    if not assume_l2_normalized:
        q_norm = np.sqrt(q_vec.multiply(q_vec).sum()) + eps

    N = X.shape[0]
    for start in range(0, N, chunk_size):
        end = min(start + chunk_size, N)
        Xb = X[start:end]

        # (n, 1) sparse product -> flat dense vector of per-row scores.
        sims = np.asarray((Xb @ q_col).toarray()).ravel()

        if not assume_l2_normalized:
            # np.asarray(...).ravel() works whether .sum(axis=1) returns an
            # np.matrix (sparse matrices) or an ndarray (sparse arrays);
            # the previous `.A1` exists only on np.matrix.
            row_sq = np.asarray(Xb.multiply(Xb).sum(axis=1)).ravel()
            Xb_norm = np.sqrt(row_sq) + eps
            sims = sims / (Xb_norm * q_norm)

        k_local = min(topk, sims.size)
        if k_local == 0:
            continue

        # Unordered indices of the k_local largest scores within this chunk.
        idx_local = np.argpartition(-sims, k_local - 1)[:k_local]
        for i in idx_local:
            item = (float(sims[i]), start + int(i))
            if len(heap) < topk:
                heapq.heappush(heap, item)
            else:
                # Heap is full: push the candidate and drop the current minimum.
                heapq.heappushpop(heap, item)

    heap.sort(reverse=True)
    return heap


def retrieve_topk_tfidf(question_text, vectorizer, passages_df, passages_matrix, k=5, chunk_size=50_000):
    """
    Return the k rows of passages_df that score highest against a question.

    question_text:   raw question string, vectorized with `vectorizer.transform`
    vectorizer:      fitted vectorizer; its `norm` attribute decides whether
                     scores need explicit cosine normalisation
    passages_df:     DataFrame whose row positions line up with passages_matrix
    passages_matrix: (N, D) sparse matrix of passage vectors
    k:               number of passages to return
    chunk_size:      rows of passages_matrix scored per step
    Returns: copy of the selected rows, best first, with a "score" column
    """
    query_vector = vectorizer.transform([question_text])

    # With norm == "l2" the plain dot product is used as the similarity score.
    unit_norm_vectors = getattr(vectorizer, "norm", None) == "l2"

    ranked = topk_cosine_sparse_rows(
        q_vec=query_vector,
        X=passages_matrix,
        topk=k,
        chunk_size=chunk_size,
        assume_l2_normalized=unit_norm_vectors,
    )

    similarities = [pair[0] for pair in ranked]
    row_positions = [pair[1] for pair in ranked]

    # .iloc with a list yields a new frame; .assign attaches the scores to it.
    return passages_df.iloc[row_positions].assign(score=similarities)

In [9]:
# Smoke test: retrieve the k best-scoring passages for one hand-picked question.
qid = 12
question_text = questions_df.loc[qid, "text"]

k = 10
results = retrieve_topk_tfidf(
    question_text=question_text,
    vectorizer=vectorizer,
    passages_df=passages_df,
    passages_matrix=passages,
    k=k,
    chunk_size=50_000,
)

print(question_text)
# .head() shows only the first five of the k retrieved rows.
display(results[["id", "score", "text"]].head())

Jak nazywa się pierwiastek o symbolu Bq?


Unnamed: 0,id,score,text
69660,5406-19,0.412167,Opis ilościowy. Jednostką radioaktywności w uk...
3849400,2224431-0,0.406479,.bq – planowana domena internetowa przypisana ...
1395146,445532-3,0.390036,"Jednostki. Jednostką aktywności jest bekerel, ..."
2716764,1352589-29,0.33598,Krzyż na ogonie „P” odnosi się do krzyża w sym...
4953310,3265537-6,0.334461,Występowanie. Jednostki. Koncentracja radonu w...


In [10]:

def ndcg_at_k(pred_ids, gold_ids, k=10):
    """Binary-relevance NDCG@k for a single query.

    Args:
        pred_ids: Predicted ids, best first; only the first ``k`` are scored.
        gold_ids: Iterable of relevant ids (duplicates are ignored).
        k: Rank cutoff. A non-positive cutoff yields 0.0.

    Returns:
        DCG of the predictions divided by the DCG of an ideal ranking, in
        [0, 1]. Returns 0.0 when there are no gold ids.

    Each relevant id is credited once, at its first occurrence, so a ranking
    that repeats a relevant id cannot score above 1.
    """
    relevant = set(gold_ids)
    ideal_hits = min(len(relevant), k)
    if ideal_hits <= 0:
        # No gold items (or k <= 0): the ideal DCG is zero, so define NDCG as 0.
        return 0.0

    dcg = 0.0
    credited = set()
    for rank, pid in enumerate(list(pred_ids)[:k], start=1):
        if pid in relevant and pid not in credited:
            credited.add(pid)
            dcg += 1.0 / math.log2(rank + 1)

    # Ideal ranking puts all (up to k) relevant items at the top.
    idcg = sum(1.0 / math.log2(rank + 1) for rank in range(1, ideal_hits + 1))
    return dcg / idcg

In [11]:
k = 10
# Normalise key dtypes so ids compare equal across the frames.
pairs_df["question_id"] = pairs_df["question_id"].astype(int)
pairs_df["passage_id"] = pairs_df["passage_id"].astype(str)

# passages_df was filtered (REDIRECT/PATRZ rows removed), so gold pairs that
# point at removed passages have to be dropped as well.
passages_df["id"] = passages_df["id"].astype(str)
available = set(passages_df["id"])
pairs_eval = pairs_df[pairs_df["passage_id"].isin(available)].copy()

# question_id -> set of gold passage ids that are still retrievable.
gold = pairs_eval.groupby("question_id")["passage_id"].apply(set).to_dict()

# Tip: slice `qids` (e.g. qids[:20]) for a quick smoke run; the full loop
# performs one retrieval over the whole passage matrix per question.
qids = sorted(gold.keys())

scores = []
total = len(qids)

for i, qid in enumerate(qids, start=1):
    remaining = total - i

    # Single-line progress indicator: "\r" rewinds to the start of the line.
    print(f"\rEvaluating: {i}/{total} | remaining: {remaining}", end="", flush=True)

    question_text = questions_df.loc[int(qid), "text"]

    results = retrieve_topk_tfidf(
        question_text=question_text,
        vectorizer=vectorizer,
        passages_df=passages_df,
        passages_matrix=passages,
        k=k,
        chunk_size=50_000,
    )
    pred_ids = results["id"].astype(str).tolist()

    scores.append(ndcg_at_k(pred_ids, gold[qid], k=k))

print()  # newline after the progress line
# Label is derived from `k` and `split` so it cannot drift from the settings.
if scores:
    print(f"mean NDCG@{k} on {split}:", float(np.mean(scores)))
else:
    # np.mean([]) would emit a RuntimeWarning and print nan.
    print(f"mean NDCG@{k} on {split}: n/a (no questions with available gold passages)")
print("questions evaluated:", len(scores))

Evaluating: 40/5000 | remaining: 4960

KeyboardInterrupt: 