In [2]:
pip install trec-car-tools

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
import trec_car.read_data as trec_car
import csv

In [4]:
input_path = "fold-0-train.pages.cbor-outlines.cbor"

with open(input_path, 'rb') as f:
    for page in trec_car.iter_outlines(f):
        print("Page:", page.page_name)
        for section_path in page.flat_headings_list():
            # Chaque section_path est déjà une liste de SectionHeading
            heading_text = " / ".join([h.heading for h in section_path])
            print("   ↳ Subtopic:", heading_text)

Page: Chocolate
   ↳ Subtopic: Etymology
   ↳ Subtopic: History
   ↳ Subtopic: History / Mesoamerican usage
   ↳ Subtopic: History / European adaptation
   ↳ Subtopic: History / Storage
   ↳ Subtopic: Nutrition and research
   ↳ Subtopic: Nutrition and research / Nutrition
   ↳ Subtopic: Nutrition and research / Research
   ↳ Subtopic: Labeling
   ↳ Subtopic: Industry
   ↳ Subtopic: Industry / Manufacturers
   ↳ Subtopic: Industry / Human trafficking of child labourers
   ↳ Subtopic: Industry / Fair trade
   ↳ Subtopic: Usage and consumption
   ↳ Subtopic: Popular culture
   ↳ Subtopic: Popular culture / Religious and cultural links
   ↳ Subtopic: Popular culture / Books and film
Page: Heavy water
   ↳ Subtopic: Explanation
   ↳ Subtopic: Other heavy forms of water
   ↳ Subtopic: Other heavy forms of water / Semiheavy water
   ↳ Subtopic: Other heavy forms of water / Heavy-oxygen water
   ↳ Subtopic: Other heavy forms of water / Tritiated water
   ↳ Subtopic: Physical properties
   ↳ S

In [5]:
output_path = "trec_car_queries.tsv"

with open(input_path, 'rb') as f, open(output_path, 'w', encoding='utf-8', newline='') as out_file:
    writer = csv.writer(out_file, delimiter='\t')
    writer.writerow(['page_title', 'query_id', 'query_text'])

    query_id_counter = 0

    for page in trec_car.iter_outlines(f):
        page_title = page.page_name

        for section_path in page.flat_headings_list():
            heading_text = " / ".join([h.heading for h in section_path])
            query_text = f"{page_title} / {heading_text}"
            query_id = f"query_{query_id_counter}"
            query_id_counter += 1

            writer.writerow([page_title, query_id, query_text])


In [6]:

import pandas as pd
df = pd.read_csv("trec_car_queries.tsv", sep="\t")
print(df.head())

import pickle
# Conversion des queries
queries = {row['query_id']: row['query_text'] for _, row in df.iterrows()}
pickle.dump(queries, open("queries.pkl", "wb"))

  page_title query_id                                 query_text
0  Chocolate  query_0                      Chocolate / Etymology
1  Chocolate  query_1                        Chocolate / History
2  Chocolate  query_2   Chocolate / History / Mesoamerican usage
3  Chocolate  query_3  Chocolate / History / European adaptation
4  Chocolate  query_4              Chocolate / History / Storage


In [8]:
import pickle
import trec_car.read_data as trec_car

docs = {}
with open("fold-0-train.pages.cbor-paragraphs.cbor", 'rb') as f:
    for para in trec_car.iter_paragraphs(f):
        docs[para.para_id] = para.get_text()

with open("documents.pkl", "wb") as f:
    pickle.dump(docs, f)

In [9]:
relevances = {}

with open("fold-0-train.pages.cbor-hierarchical.qrels", "r") as f:
    for line in f:
        qid, _, docid, rel = line.strip().split()
        rel = int(rel)
        if qid not in relevances:
            relevances[qid] = {}
        relevances[qid][docid] = rel

with open("relevances.pkl", "wb") as f:
    pickle.dump(relevances, f)

In [10]:
import pickle

queries = pickle.load(open("queries.pkl", "rb"))
docs = pickle.load(open("documents.pkl", "rb"))
relevances = pickle.load(open("relevances.pkl", "rb"))

# Prend un query_id arbitraire
some_query = next(iter(relevances.keys()))

# Vérifie qu'il est bien dans queries.pkl
assert some_query in queries, "Query ID not found in queries.pkl"

# Prend un doc_id arbitraire lié à cette requête
some_doc = next(iter(relevances[some_query].keys()))

# Vérifie qu'il est bien dans documents.pkl
assert some_doc in docs, "Doc ID not found in documents.pkl"

AssertionError: Query ID not found in queries.pkl

In [20]:
import pickle

# Charger l'ancien fichier queries.pkl
with open("queries.pkl", "rb") as f:
    old_queries = pickle.load(f)

new_queries = {}

for qid, text in old_queries.items():
    parts = text.split(" / ")
    title = text  # texte complet de la requête
    root = parts[0]  # premier niveau = racine
    headings = tuple(parts[1:]) if len(parts) > 1 else ()
    new_queries[qid] = (title, root, headings)

# Sauvegarder le nouveau fichier uniformisé
with open("queries_uniform.pkl", "wb") as f:
    pickle.dump(new_queries, f)

print(f"✅ Converti {len(new_queries)} requêtes au format tuple")


✅ Converti 477 requêtes au format tuple


In [21]:
with open("queries.pkl", "rb") as f:
    queries = pickle.load(f)
print(queries)

{'query_0': 'Chocolate / Etymology', 'query_1': 'Chocolate / History', 'query_2': 'Chocolate / History / Mesoamerican usage', 'query_3': 'Chocolate / History / European adaptation', 'query_4': 'Chocolate / History / Storage', 'query_5': 'Chocolate / Nutrition and research', 'query_6': 'Chocolate / Nutrition and research / Nutrition', 'query_7': 'Chocolate / Nutrition and research / Research', 'query_8': 'Chocolate / Labeling', 'query_9': 'Chocolate / Industry', 'query_10': 'Chocolate / Industry / Manufacturers', 'query_11': 'Chocolate / Industry / Human trafficking of child labourers', 'query_12': 'Chocolate / Industry / Fair trade', 'query_13': 'Chocolate / Usage and consumption', 'query_14': 'Chocolate / Popular culture', 'query_15': 'Chocolate / Popular culture / Religious and cultural links', 'query_16': 'Chocolate / Popular culture / Books and film', 'query_17': 'Heavy water / Explanation', 'query_18': 'Heavy water / Other heavy forms of water', 'query_19': 'Heavy water / Other he

In [22]:
import pickle

with open("queries.pkl", "rb") as f:
    old_queries = pickle.load(f)

new_queries = {}

for qid, text in old_queries.items():
    parts = text.split(" / ")
    title = text
    root = parts[0]
    headings = tuple(parts[1:]) if len(parts) > 1 else ()
    new_queries[qid] = (title, root, headings)

with open("queries_uniform.pkl", "wb") as f:
    pickle.dump(new_queries, f)

print(f"✅ Converted {len(new_queries)} queries to uniform tuple format")


✅ Converted 477 queries to uniform tuple format


In [4]:
with open("queries.pkl", "rb") as f:
    queries = pickle.load(f)
print(queries)

{'enwiki:Chocolate/Etymology': ('Chocolate / Etymology', 'Chocolate', ('Etymology',)), 'enwiki:Chocolate/History': ('Chocolate / History', 'Chocolate', ('History',)), 'enwiki:Chocolate/History/Mesoamerican%20usage': ('Chocolate / History / Mesoamerican usage', 'Chocolate', ('History', 'Mesoamerican usage')), 'enwiki:Chocolate/History/European%20adaptation': ('Chocolate / History / European adaptation', 'Chocolate', ('History', 'European adaptation')), 'enwiki:Chocolate/History/Storage': ('Chocolate / History / Storage', 'Chocolate', ('History', 'Storage')), 'enwiki:Chocolate/Nutrition%20and%20research': ('Chocolate / Nutrition and research', 'Chocolate', ('Nutrition and research',)), 'enwiki:Chocolate/Nutrition%20and%20research/Nutrition': ('Chocolate / Nutrition and research / Nutrition', 'Chocolate', ('Nutrition and research', 'Nutrition')), 'enwiki:Chocolate/Nutrition%20and%20research/Research': ('Chocolate / Nutrition and research / Research', 'Chocolate', ('Nutrition and research'

In [16]:
import pickle

# Charger l'ancien queries.pkl
with open("queries.pkl", "rb") as f:
    old_queries = pickle.load(f)

new_queries = {}

for qid, text in old_queries.items():
    # Exemple : "Chocolate / History / Mesoamerican usage"
    parts = text.split(" / ")
    title = text  # ou reconstruire proprement si besoin
    root = parts[0]  # la racine
    headings = tuple(parts[1:]) if len(parts) > 1 else ()
    new_queries[qid] = (title, root, headings)

# Sauvegarder le nouveau queries.pkl
with open("queries_uniform.pkl", "wb") as f:
    pickle.dump(new_queries, f)


In [18]:
import pickle

with open("relevances.pkl", "rb") as f:
    queries = pickle.load(f)
print(queries)

{('enwiki:Allergy', '3d0a5713204ca4abd0326869d84fff2b18255165'): 1, ('enwiki:Allergy', 'a32fc79ef3998859b8d3596ff7d26c3b1eb568e3'): 1, ('enwiki:Allergy', 'b8fc3331751378d08a7e7cb25621c32c792f3668'): 1, ('enwiki:Allergy', 'e17ef18adc53a7da41243e767405df124f641bc9'): 1, ('enwiki:Allergy/Cause', 'c7740c13c398b2c6766f4e7aeb740cfcc2d3d366'): 1, ('enwiki:Allergy/Cause/Foods', '06ba0f80019b6cbb9c0ddd1f59bde51fdee4c487'): 1, ('enwiki:Allergy/Cause/Foods', '2d8da4a77d441ed98d3b2eceefa64fbf3babd6e0'): 1, ('enwiki:Allergy/Cause/Foods', '4c335b90fb316293a709ce6c718381f6972257ed'): 1, ('enwiki:Allergy/Cause/Foods', '9325185c1164bc034f4570c21a98c741cb01b7c0'): 1, ('enwiki:Allergy/Cause/Foods', 'e6b7e72d3086d8c76644cd8847bc80ab90f891fc'): 1, ('enwiki:Allergy/Cause/Foods', 'f4f8d0e0001d33103f6bc4c76c585dfa79c869c1'): 1, ('enwiki:Allergy/Cause/Genetics', '0a6ee7687bca470212a654750981c2c4c7c4ea88'): 1, ('enwiki:Allergy/Cause/Genetics', 'a051d116010bcfc9ae4d05d789e5ddbcbe9996f1'): 1, ('enwiki:Allergy/Cau

In [19]:
import pickle

with open("documents.pkl", "rb") as f:
    queries = pickle.load(f)
print(queries)



In [3]:
import pickle

queries = pickle.load(open("queries.pkl", "rb"))
new_queries = {}

for qid, (title, root, headings) in queries.items():
    # Créer la clé compatible avec relevances.pkl : "enwiki:Root/Heading1/Heading2..."
    path = "/".join([root] + list(headings))  # concatène la hiérarchie
    wiki_key = "enwiki:" + path.replace(" ", "%20")  # encode espaces en %20
    new_queries[wiki_key] = (title, root, headings)

pickle.dump(new_queries, open("queries_fixed.pkl", "wb"))

print(f"✅ Généré {len(new_queries)} requêtes avec clés harmonisées.")


✅ Généré 477 requêtes avec clés harmonisées.
