# 01_data_preprocessing.ipynb

**Data Loading & Preprocessing (TEI <text> extraction)**

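Parses each TEI XML file in the corpus directory and extracts only the text content of its `<text>` element (anything outside `<text>` is skipped). Whitespace is normalized, and the result is saved as one row per file in `raw_texts.csv`.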

In [None]:
from pathlib import Path

def setup_project_paths():
    """Locate the TEI corpus directory and the CSV output directory.

    The project root is the parent of the current working directory when
    that directory is named ``codigo``; otherwise the working directory
    itself is used as the root.

    Returns:
        A ``(tei_dir, output_dir)`` tuple of ``Path`` objects pointing at
        ``<root>/corpus/tei`` and
        ``<root>/resultados/computational-analysis/corpus_summary/csv``.
        Neither directory is created or checked for existence here.
    """
    cwd = Path.cwd()
    if cwd.name == 'codigo':
        root = cwd.parent
    else:
        root = cwd
    corpus_tei = root.joinpath('corpus', 'tei')
    csv_out = root.joinpath(
        'resultados', 'computational-analysis', 'corpus_summary', 'csv'
    )
    return corpus_tei, csv_out

# Resolve the TEI input directory and the CSV output directory once, as module-level constants.
TEI_DIR, OUTPUT_DIR = setup_project_paths()
# Create the output directory (including missing parents); does nothing if it already exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
import pandas as pd
from lxml import etree
import re

# Loop invariants: built once instead of on every file.
ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
whitespace_re = re.compile(r"\s+")

# One record per TEI file, in sorted filename order.
records = []
for xml_file in sorted(TEI_DIR.glob("*.xml")):
    tree = etree.parse(str(xml_file))
    # Select every text node beneath any TEI-namespaced <text> element;
    # content outside <text> is not selected.
    texts = tree.xpath('//tei:text//text()', namespaces=ns)
    # Join the text nodes with a space, then collapse whitespace runs.
    # NOTE(review): the space separator may split a word that is interrupted
    # by inline markup inside it — confirm this is acceptable for the corpus.
    full_text = ' '.join(texts)
    full_text = whitespace_re.sub(' ', full_text).strip()
    records.append({
        'filename': xml_file.name,
        'text': full_text
    })

# Explicit columns keep the CSV header ("filename,text") even when no XML
# files were found; without them an empty corpus yields a column-less frame.
df = pd.DataFrame(records, columns=['filename', 'text'])
# Save to CSV
output_path = OUTPUT_DIR / 'raw_texts.csv'
df.to_csv(output_path, index=False, encoding='utf-8')
print(f"Extracted <text> content from {len(df)} files to {output_path}")
df.head()