In [2]:
pip install trec-car-tools

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [7]:
import trec_car.read_data as trec_car
import csv

In [5]:
input_path = "train.pages.cbor-outlines.cbor"

with open(input_path, 'rb') as f:
    for page in trec_car.iter_outlines(f):
        print("Page:", page.page_name)
        for section_path in page.flat_headings_list():
            # Chaque section_path est déjà une liste de SectionHeading
            heading_text = " / ".join([h.heading for h in section_path])
            print("   ↳ Subtopic:", heading_text)

Page: Carbohydrate
   ↳ Subtopic: Structure
   ↳ Subtopic: Division
   ↳ Subtopic: Monosaccharides
   ↳ Subtopic: Monosaccharides / Classification of monosaccharides
   ↳ Subtopic: Monosaccharides / Ring-straight chain isomerism
   ↳ Subtopic: Monosaccharides / Use in living organisms
   ↳ Subtopic: Disaccharides
   ↳ Subtopic: Nutrition
   ↳ Subtopic: Nutrition / Classification
   ↳ Subtopic: Metabolism
   ↳ Subtopic: Metabolism / Catabolism
   ↳ Subtopic: Carbohydrate chemistry
Page: Chocolate
   ↳ Subtopic: Etymology
   ↳ Subtopic: History
   ↳ Subtopic: History / Mesoamerican usage
   ↳ Subtopic: History / European adaptation
   ↳ Subtopic: History / Storage
   ↳ Subtopic: Nutrition and research
   ↳ Subtopic: Nutrition and research / Nutrition
   ↳ Subtopic: Nutrition and research / Research
   ↳ Subtopic: Labeling
   ↳ Subtopic: Industry
   ↳ Subtopic: Industry / Manufacturers
   ↳ Subtopic: Industry / Human trafficking of child labourers
   ↳ Subtopic: Industry / Fair trade
   ↳

In [8]:
output_path = "trec_car_queries.tsv"

with open(input_path, 'rb') as f, open(output_path, 'w', encoding='utf-8', newline='') as out_file:
    writer = csv.writer(out_file, delimiter='\t')
    writer.writerow(['page_title', 'query_id', 'query_text'])

    query_id_counter = 0

    for page in trec_car.iter_outlines(f):
        page_title = page.page_name

        for section_path in page.flat_headings_list():
            heading_text = " / ".join([h.heading for h in section_path])
            query_text = f"{page_title} / {heading_text}"
            query_id = f"query_{query_id_counter}"
            query_id_counter += 1

            writer.writerow([page_title, query_id, query_text])


In [9]:

import pandas as pd
df = pd.read_csv("trec_car_queries.tsv", sep="\t")
print(df.head())

import pickle
# Conversion des queries
queries = {row['query_id']: row['query_text'] for _, row in df.iterrows()}
pickle.dump(queries, open("queries.pkl", "wb"))

     page_title query_id                                         query_text
0  Carbohydrate  query_0                           Carbohydrate / Structure
1  Carbohydrate  query_1                            Carbohydrate / Division
2  Carbohydrate  query_2                     Carbohydrate / Monosaccharides
3  Carbohydrate  query_3  Carbohydrate / Monosaccharides / Classificatio...
4  Carbohydrate  query_4  Carbohydrate / Monosaccharides / Ring-straight...


In [10]:
import pickle
import trec_car.read_data as trec_car

docs = {}
with open("train.pages.cbor-paragraphs.cbor", 'rb') as f:
    for para in trec_car.iter_paragraphs(f):
        docs[para.para_id] = para.get_text()

with open("documents.pkl", "wb") as f:
    pickle.dump(docs, f)

In [11]:
relevances = {}

with open("train.pages.cbor-hierarchical.qrels", "r") as f:
    for line in f:
        qid, _, docid, rel = line.strip().split()
        rel = int(rel)
        if qid not in relevances:
            relevances[qid] = {}
        relevances[qid][docid] = rel

with open("relevances.pkl", "wb") as f:
    pickle.dump(relevances, f)

In [12]:
import pickle

queries = pickle.load(open("queries.pkl", "rb"))
docs = pickle.load(open("documents.pkl", "rb"))
relevances = pickle.load(open("relevances.pkl", "rb"))

# Prend un query_id arbitraire
some_query = next(iter(relevances.keys()))

# Vérifie qu'il est bien dans queries.pkl
assert some_query in queries, "Query ID not found in queries.pkl"

# Prend un doc_id arbitraire lié à cette requête
some_doc = next(iter(relevances[some_query].keys()))

# Vérifie qu'il est bien dans documents.pkl
assert some_doc in docs, "Doc ID not found in documents.pkl"


AssertionError: Query ID not found in queries.pkl

In [14]:
# Charger les requêtes
with open("queries.pkl", "rb") as f:
    queries = pickle.load(f)
print(queries)

{'enwiki:Carbohydrate/Structure': ('enwiki:Carbohydrate Structure', 'enwiki:Carbohydrate', ('Structure',)), 'enwiki:Carbohydrate/Division': ('enwiki:Carbohydrate Division', 'enwiki:Carbohydrate', ('Division',)), 'enwiki:Carbohydrate/Monosaccharides': ('enwiki:Carbohydrate Monosaccharides', 'enwiki:Carbohydrate', ('Monosaccharides',)), 'enwiki:Carbohydrate/Monosaccharides/Classification%20of%20monosaccharides': ('enwiki:Carbohydrate Monosaccharides Classification of monosaccharides', 'enwiki:Carbohydrate', ('Monosaccharides', 'Classification of monosaccharides')), 'enwiki:Carbohydrate/Monosaccharides/Ring-straight%20chain%20isomerism': ('enwiki:Carbohydrate Monosaccharides Ring-straight chain isomerism', 'enwiki:Carbohydrate', ('Monosaccharides', 'Ring-straight chain isomerism')), 'enwiki:Carbohydrate/Monosaccharides/Use%20in%20living%20organisms': ('enwiki:Carbohydrate Monosaccharides Use in living organisms', 'enwiki:Carbohydrate', ('Monosaccharides', 'Use in living organisms')), 'enw

In [15]:
with open("queriesethan.pkl", "rb") as f:
    queries = pickle.load(f)
print(queries)



In [18]:
import pickle

nested_relevances = pickle.load(open("relevances.pkl", "rb"))

flat_relevances = {}
for query_id, doc_dict in nested_relevances.items():
    for doc_id, score in doc_dict.items():
        flat_relevances[(query_id, doc_id)] = score

with open("relevances.pkl", "wb") as f:
    pickle.dump(flat_relevances, f)

