In [1]:
!pip -q install pymupdf tqdm

In [3]:
from pathlib import Path
import fitz  # pymupdf
import re
from tqdm import tqdm

# Pipeline locations: source PDFs -> raw text dumps -> cleaned text.
RAW_DIR = Path("data/raw")
EXTRACT_DIR = Path("data/extracted_text")
CLEAN_DIR = Path("data/cleaned_text")

# Make sure both output folders exist before anything is written to them.
for _out_dir in (EXTRACT_DIR, CLEAN_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Every PDF directly under RAW_DIR, in sorted order; shown as the cell output.
pdf_files = sorted(RAW_DIR.glob("*.pdf"))
pdf_files


[PosixPath('data/raw/GEVO2023.pdf'),
 PosixPath('data/raw/GlobalEVOutlook2024.pdf'),
 PosixPath('data/raw/GlobalEVOutlook2025.pdf'),
 PosixPath('data/raw/GlobalElectricVehicleOutlook2022.pdf')]

In [4]:
def extract_text_from_pdf(pdf_path: Path) -> str:
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_path: Location of the PDF file to open.

    Returns:
        The text of all pages that yielded any text, in page order, joined
        with a blank line between pages. Pages whose extraction result is
        empty are skipped.
    """
    # The context manager closes the document even when a page fails to
    # extract, so the underlying file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        # "text" is the plain-text extraction mode (best baseline extraction).
        pages_text = [page.get_text("text") for page in doc]

    # Drop empty pages so they do not add stray separators to the output.
    return "\n\n".join(text for text in pages_text if text)


# Dump the raw text of each source PDF to "<stem>_raw.txt" inside EXTRACT_DIR.
for pdf_path in tqdm(pdf_files, desc="Extracting PDFs"):
    out_path = EXTRACT_DIR / f"{pdf_path.stem}_raw.txt"
    raw_text = extract_text_from_pdf(pdf_path)
    out_path.write_text(raw_text, encoding="utf-8")

print("✅ Extraction complete. Files saved in:", EXTRACT_DIR)


Extracting PDFs: 100%|██████████| 4/4 [00:01<00:00,  2.51it/s]

✅ Extraction complete. Files saved in: data/extracted_text





In [5]:
def clean_text(text: str) -> str:
    """Tidy up text extracted from a PDF.

    The steps run in this order:
      1. Normalize CRLF / CR line endings to "\\n".
      2. Re-join words hyphenated across a line break.
      3. Strip known boilerplate strings.
      4. Drop lines that contain nothing but digits (lone page numbers).
      5. Collapse runs of spaces/tabs to a single space.
      6. Collapse runs of three or more newlines to a paragraph break.
      7. Trim leading/trailing whitespace.

    Args:
        text: Raw extracted text.

    Returns:
        The cleaned text.
    """
    # Normalize newlines. CRLF has to be handled before bare CR, otherwise
    # every "\r\n" becomes "\n\n" and each line break turns into a paragraph
    # break (which also hides hyphenated line ends from the next step).
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    # Remove hyphenation at line breaks: "electri-\nfication" → "electrification"
    text = re.sub(r"(\w)-\n(\w)", r"\1\2", text)

    # Remove common artifacts
    text = text.replace("© OECD/IEA", "")
    text = text.replace("INTERNATIONAL ENERGY AGENCY", "")

    # Remove lone page numbers (lines with only digits). Matching per line with
    # [ \t] instead of \s keeps the pattern from eating neighbouring blank
    # lines (which merged paragraphs), handles consecutive numeric lines, and
    # also catches a number on the very first or last line.
    text = re.sub(r"^[ \t]*\d+[ \t]*$\n?", "", text, flags=re.MULTILINE)

    # Replace multiple spaces/tabs (done after the removals above, which can
    # leave doubled spaces behind)
    text = re.sub(r"[ \t]{2,}", " ", text)

    # Replace multiple newlines with max two (paragraph spacing). Done last so
    # gaps opened by the removals above are collapsed as well.
    text = re.sub(r"\n{3,}", "\n\n", text)

    # Trim
    return text.strip()


In [6]:
# Clean every raw dump in EXTRACT_DIR and store the result in CLEAN_DIR,
# swapping the "_raw.txt" suffix for "_clean.txt".
raw_txt_files = sorted(EXTRACT_DIR.glob("*_raw.txt"))

for raw_path in tqdm(raw_txt_files, desc="Cleaning extracted text"):
    # Undecodable bytes are dropped rather than aborting the whole run.
    raw_text = raw_path.read_text(encoding="utf-8", errors="ignore")
    cleaned = clean_text(raw_text)

    clean_name = raw_path.name.replace("_raw.txt", "_clean.txt")
    out_path = CLEAN_DIR / clean_name
    out_path.write_text(cleaned, encoding="utf-8")

print("✅ Cleaning complete. Clean files saved in:", CLEAN_DIR)


Cleaning extracted text: 100%|██████████| 4/4 [00:00<00:00, 50.23it/s]

✅ Cleaning complete. Clean files saved in: data/cleaned_text





In [7]:
# Sanity check: list each cleaned file with its size, then preview one of them.
clean_files = sorted(CLEAN_DIR.glob("*_clean.txt"))

for f in clean_files:
    size_mb = f.stat().st_size / 1024 / 1024
    print(f.name, "→", round(size_mb, 2), "MB")

# Preview the first 600 characters of the first file in sorted order.
sample_file = clean_files[0]
sample_preview = sample_file.read_text(encoding="utf-8")[:600]
print(sample_file.name, "-" * 80, sample_preview, sep="\n")


GEVO2023_clean.txt → 0.33 MB
GlobalEVOutlook2024_clean.txt → 0.39 MB
GlobalEVOutlook2025_clean.txt → 0.4 MB
GlobalElectricVehicleOutlook2022_clean.txt → 0.4 MB
GEVO2023_clean.txt
--------------------------------------------------------------------------------
Global EV 
Outlook 2023
Catching up with climate ambitions

The IEA examines the 
full spectrum 
of energy issues 
including oil, gas and 
coal supply and 
demand, renewable 
energy technologies, 
electricity markets, 
energy efficiency, 
access to energy, 
demand side 
management and 
much more. Through 
its work, the IEA 
advocates policies that 
will enhance the 
reliability, affordability 
and sustainability of 
energy in its 
31 member countries, 
11 association countries 
and beyond.
This publication and any 
map included herein are 
without prejudice to the 
status of or sovereignty ove


In [8]:
# Concatenate all cleaned reports into a single corpus file; each report is
# preceded by a "### SOURCE: <stem>" marker so its origin stays traceable.
merged_path = CLEAN_DIR / "GLOBAL_EV_OUTLOOK_2022_2024_MERGED.txt"

merged = []
for f in sorted(CLEAN_DIR.glob("*_clean.txt")):
    header = f"\n\n### SOURCE: {f.stem}\n\n"
    merged.extend((header, f.read_text(encoding="utf-8")))

merged_text = "".join(merged)
merged_path.write_text(merged_text, encoding="utf-8")

print("✅ Merged corpus saved to:", merged_path)


✅ Merged corpus saved to: data/cleaned_text/GLOBAL_EV_OUTLOOK_2022_2024_MERGED.txt
