In [1]:
import json
import random
import re
from pathlib import Path

from tqdm import tqdm

# Input directory (cleaned text from the ingestion step) and output directory for chunk files.
CLEAN_DIR = Path("data/cleaned_text")
OUT_DIR = Path("data/chunks")
MERGED_FILE = CLEAN_DIR.joinpath("GLOBAL_EV_OUTLOOK_2022_2024_MERGED.txt")

# Create the output directory up front; a no-op when it already exists.
OUT_DIR.mkdir(exist_ok=True, parents=True)

# Stop here with a pointer to the prerequisite notebook if the merged corpus is missing.
assert MERGED_FILE.exists(), "Merged file not found. Run data_ingestion.ipynb first."


In [2]:
# Read the whole merged corpus into memory.
# errors="ignore" silently drops bytes that are not valid UTF-8 instead of raising.
text = MERGED_FILE.read_text(errors="ignore", encoding="utf-8")

# Preview: total character count and the first 200 characters.
(len(text), text[:200])


(1581689,
 '\n\n### SOURCE: GEVO2023_clean\n\nGlobal EV \nOutlook 2023\nCatching up with climate ambitions\n\nThe IEA examines the \nfull spectrum \nof energy issues \nincluding oil, gas and \ncoal supply and \ndemand, renewa')

In [3]:
def split_by_source(merged_text: str):
    """Split the merged corpus into one ``(source, body)`` pair per section.

    Sections are delimited by a blank line followed by ``### SOURCE: <name>``.
    The first line of each section is the source name; everything after it is
    the body. Both are returned stripped of surrounding whitespace.

    Args:
        merged_text: Full text of the merged file.

    Returns:
        A list of ``(source, body)`` tuples in file order. Sections that are
        empty after stripping are skipped. A section consisting only of a name
        yields ``(name, "")``.

    Note:
        Any non-empty text preceding the first marker is treated like a
        section too, with its first line used as the source name.
    """
    docs = []
    for part in re.split(r"\n\n### SOURCE:\s*", merged_text):
        part = part.strip()
        if not part:
            # Typically the empty string before the very first marker.
            continue
        # partition() handles a section without any newline (name only):
        # the whole part becomes the source and the body is empty. Slicing
        # with find() would get -1 there and chop the last character off
        # the name.
        source, _, body = part.partition("\n")
        docs.append((source.strip(), body.strip()))
    return docs

# One (source, body) pair per section of the merged file.
docs = split_by_source(merged_text=text)

# Preview: number of documents and the name of the first one.
(len(docs), docs[0][0])


(4, 'GEVO2023_clean')

In [4]:
def infer_year_from_source(source: str):
    """Return the first ``20xx`` number found in ``source`` as an int.

    Returns ``None`` when the name contains no such four-digit sequence.
    """
    hit = re.search(r"(20\d{2})", source)
    if hit is None:
        return None
    return int(hit.group(1))

# Show which year each source name resolves to.
for name, _body in docs:
    print(name, "→", infer_year_from_source(name))


GEVO2023_clean → 2023
GlobalEVOutlook2024_clean → 2024
GlobalEVOutlook2025_clean → 2025
GlobalElectricVehicleOutlook2022_clean → 2022


In [5]:
# Chunking parameters, counted in whitespace-separated words.
CHUNK_WORDS: int = 850      # maximum words per chunk; the last chunk of a document may be shorter
OVERLAP_WORDS: int = 150    # words repeated at the start of the next chunk to preserve continuity


In [6]:
def normalize_for_chunking(text: str) -> str:
    """Tidy whitespace before chunking.

    Runs of three or more newlines collapse to a single blank line, runs of
    two or more spaces/tabs collapse to one space, and leading/trailing
    whitespace is removed.
    """
    blank_lines_collapsed = re.sub(r"\n{3,}", "\n\n", text)
    spaces_collapsed = re.sub(r"[ \t]{2,}", " ", blank_lines_collapsed)
    return spaces_collapsed.strip()


In [7]:
def chunk_text_by_words(text: str, chunk_words=850, overlap_words=150):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Words are obtained with ``str.split()`` and re-joined with single spaces,
    so the original whitespace (including newlines) is not preserved.

    Args:
        text: Text to split.
        chunk_words: Maximum number of words per chunk. Must be positive.
        overlap_words: Number of trailing words of one chunk repeated at the
            start of the next. Must satisfy ``0 <= overlap_words < chunk_words``.

    Returns:
        A list of chunk strings. Every chunk has ``chunk_words`` words except
        possibly the last; empty or whitespace-only ``text`` yields ``[]``.

    Raises:
        ValueError: If ``chunk_words`` is not positive, or ``overlap_words``
            is negative or not smaller than ``chunk_words``. Without this
            check the window would never advance and the loop would not
            terminate.
    """
    if chunk_words <= 0:
        raise ValueError(f"chunk_words must be positive, got {chunk_words}")
    if not 0 <= overlap_words < chunk_words:
        raise ValueError(
            f"overlap_words must be in [0, chunk_words); got "
            f"overlap_words={overlap_words}, chunk_words={chunk_words}"
        )

    words = text.split()
    n = len(words)
    # Each window starts this many words after the previous one.
    stride = chunk_words - overlap_words

    chunks = []
    for start in range(0, n, stride):
        end = start + chunk_words
        chunks.append(" ".join(words[start:end]))
        if end >= n:
            # This window reached the end of the text; a further window
            # would only repeat the overlap.
            break

    return chunks


In [8]:
# Shape of each record appended to `chunks`:
#   {
#     "chunk_id": "IEA_2024_000123",
#     "source": "GlobalEVOutlook2024_clean",
#     "year": 2024,
#     "domain": "IEA Global EV Outlook",
#     "text": "..."
#   }

chunks = []

for source, body in tqdm(docs, desc="Chunking documents"):
    year = infer_year_from_source(source)
    body = normalize_for_chunking(body)

    doc_chunks = chunk_text_by_words(body, CHUNK_WORDS, OVERLAP_WORDS)

    # The index restarts at 0 for every document, so chunk_id is only unique
    # when every source resolves to a distinct year.
    for i, ch in enumerate(doc_chunks):
        chunks.append({
            "chunk_id": f"IEA_{year}_{i:06d}",
            "source": source,
            "year": year,
            "domain": "IEA Global EV Outlook",
            "text": ch
        })

# Fail loudly instead of writing records whose IDs collide (two sources with
# the same year, or more than one source with no year in its name).
chunk_ids = [c["chunk_id"] for c in chunks]
assert len(chunk_ids) == len(set(chunk_ids)), "Duplicate chunk_id values; sources must map to distinct years."

len(chunks)


Chunking documents: 100%|██████████| 4/4 [00:00<00:00, 77.32it/s]


352

In [9]:
out_path = OUT_DIR.joinpath("iea_ev_outlook_chunks.jsonl")

# JSON Lines output: one chunk record per line.
# ensure_ascii=False writes non-ASCII characters as-is rather than as \u escapes.
with out_path.open("w", encoding="utf-8") as f:
    f.writelines(json.dumps(obj, ensure_ascii=False) + "\n" for obj in chunks)

print("✅ Saved chunks to:", out_path)


✅ Saved chunks to: data/chunks/iea_ev_outlook_chunks.jsonl


In [10]:
# Sanity-check chunk sizes. min()/max() and the average below would fail with
# an unhelpful error on an empty list, so check that first.
assert chunks, "No chunks were produced; nothing to summarise."

lengths = [len(c["text"].split()) for c in chunks]
print("Total chunks:", len(chunks))
print("Min words:", min(lengths))
print("Max words:", max(lengths))
print("Avg words:", sum(lengths)/len(lengths))


# Spot-check one chunk. A dedicated seeded generator keeps the displayed sample
# the same on every Restart & Run All; change SAMPLE_SEED to inspect another.
SAMPLE_SEED = 42
sample = random.Random(SAMPLE_SEED).choice(chunks)
print(sample["chunk_id"], sample["source"])
print("-"*80)
print(sample["text"][:1200])


Total chunks: 352
Min words: 431
Max words: 850
Avg words: 846.9545454545455
IEA_2022_000030 GlobalElectricVehicleOutlook2022_clean
--------------------------------------------------------------------------------
natural gas, liquefied natural gas and biomethane), liquid biofuels, synthetic and paraffinic fuels, and liquefied petroleum gas. Austria’s 2021 Mobility Master Plan outlines targets to end the sale of conventional M/HDVs under 18 tonnes by 2030, and by 2035 for those over 18 tonnes. A total of EUR 46 million (USD 54 million) was available in to support electromobility, including EUR 60 000 (USD 70 980) offered for the purchase of eligible commercial heavy-duty ZEVs and up to EUR 130 000 (USD 153 790) for buses. EUR 46 million (USD 54 million) was available in 2021 to support electromobility, including EUR 60 000 (USD 70 980) for the purchase of eligible commercial heavy-duty ZEVs and up to EUR 130 000 for buses (USD 153 790). Spain activated EUR 400 million (USD 473 million) 