# CSV vs Parquet Row Count Validation

This notebook checks that no rows were lost when the raw CSV files were merged and converted to Parquet. It compares row counts only (row contents are not compared) across:
- Total rows across all CSVs in `data/raw/openalex_data/`
- Total rows in `data/processed/openalex_merged.csv`
- Total rows in `data/processed/openalex_merged.parquet`

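CSV rows are counted with a streaming `csv.reader`, so quoted newlines inside fields are handled and no file is loaded into memory at once. The Parquet row count is read from the file's metadata via `pyarrow`, with a DuckDB `COUNT(*)` as a fallback.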



In [1]:
from __future__ import annotations
import os
import csv
from pathlib import Path
from typing import Iterable, List, Tuple

# Project root. Defaults to the original author's checkout; set the
# INVISIBLE_RESEARCH_ROOT environment variable to run the notebook from a
# different location without editing the paths below.
PROJECT_ROOT = Path(os.environ.get('INVISIBLE_RESEARCH_ROOT', '/Users/yann.jy/InvisibleResearch'))

# Directory tree that is walked recursively for the raw CSV files.
RAW_DIR = PROJECT_ROOT / 'data' / 'raw' / 'openalex_data'
# Merged CSV whose row count is compared against the raw total.
MERGED_CSV = PROJECT_ROOT / 'data' / 'processed' / 'openalex_merged.csv'
# Parquet version of the merged data; its row count must match as well.
MERGED_PARQUET = PROJECT_ROOT / 'data' / 'processed' / 'openalex_merged.parquet'


def list_csv_files(root: Path) -> List[Path]:
    """Recursively collect every CSV file below ``root``.

    Args:
        root: Directory to walk.

    Returns:
        Paths of all files whose name ends in ``.csv`` (case-insensitive),
        in sorted order. Empty when ``os.walk`` yields nothing for ``root``.
    """
    return sorted(
        Path(folder) / filename
        for folder, _subdirs, filenames in os.walk(root)
        for filename in filenames
        if filename.lower().endswith('.csv')
    )


def count_csv_rows(file_path: Path) -> int:
    """Count data rows in a CSV (excluding header), robust to quoted newlines.

    The file is streamed through ``csv.reader`` so a record whose quoted field
    spans several physical lines is counted once, and the whole file is never
    held in memory. Undecodable bytes are replaced rather than raising.

    The csv module rejects any field longer than its field-size limit
    (131072 characters by default). That limit is lifted while counting so
    long text fields do not abort the count, and restored afterwards.

    Args:
        file_path: CSV file to count. The first record is treated as the header.

    Returns:
        Number of records after the header; 0 for an empty or header-only file.

    Raises:
        OSError: If the file cannot be opened.
        csv.Error: If the reader cannot parse the file.
    """
    import sys  # local import: only needed to pick the largest accepted limit

    previous_limit = csv.field_size_limit()
    new_limit = sys.maxsize
    while True:
        try:
            csv.field_size_limit(new_limit)
            break
        except OverflowError:
            # The limit is stored in a C long, which can be narrower than
            # sys.maxsize on some platforms; shrink until it is accepted.
            new_limit //= 10

    try:
        # newline='' lets the csv module handle line endings itself, which is
        # required for embedded newlines inside quoted fields.
        with open(file_path, 'r', encoding='utf-8', errors='replace', newline='') as f:
            reader = csv.reader(f, delimiter=',', quotechar='"', doublequote=True, escapechar='\\')
            next(reader, None)  # skip header (no-op on an empty file)
            return sum(1 for _row in reader)
    finally:
        # The limit is module-global state; do not leak the change to callers.
        csv.field_size_limit(previous_limit)


def count_parquet_rows(parquet_path: Path) -> int:
    """Count rows in Parquet using metadata; falls back to DuckDB if needed.

    The row count is first read from the Parquet metadata via pyarrow. If
    that fails for any reason (pyarrow missing, unreadable file, ...), a
    ``COUNT(*)`` is run through DuckDB instead.

    Args:
        parquet_path: Parquet file to inspect.

    Returns:
        Total number of rows in the file.

    Raises:
        RuntimeError: If both the pyarrow and the DuckDB attempt fail. The
            message reports both failures and the DuckDB error is chained as
            the cause.
    """
    try:
        import pyarrow.parquet as pq  # type: ignore
        return pq.ParquetFile(str(parquet_path)).metadata.num_rows
    except Exception as pyarrow_err:
        # Keep the primary failure so it can be reported if the fallback
        # fails too, instead of silently discarding it.
        first_error = pyarrow_err

    try:
        import duckdb  # type: ignore
        # Double any single quote so the path stays a valid SQL string literal.
        sql_path = parquet_path.as_posix().replace("'", "''")
        con = duckdb.connect()
        try:
            return con.execute(f"SELECT COUNT(*) FROM read_parquet('{sql_path}')").fetchone()[0]
        finally:
            con.close()
    except Exception as e2:
        raise RuntimeError(
            f"Failed to count Parquet rows: {e2} (pyarrow attempt failed with: {first_error!r})"
        ) from e2



In [2]:
# Count rows per raw CSV, compare with the merged outputs, print and export a report
raw_files = list_csv_files(RAW_DIR)

# (path relative to RAW_DIR, data-row count) for each raw CSV, in sorted path order
file_counts = [
    (fp.relative_to(RAW_DIR).as_posix(), count_csv_rows(fp))
    for fp in raw_files
]
raw_total = sum(rows for _name, rows in file_counts)

# Row counts of the two merged artifacts
merged_csv_rows = count_csv_rows(MERGED_CSV)
parquet_rows = count_parquet_rows(MERGED_PARQUET)

# Aligned two-column table; the name column is as wide as the longest file name
name_width = max((len(name) for name, _ in file_counts), default=10)
header_cell = 'CSV File'.ljust(name_width)
print(f"{header_cell} | Rows")
print('-' * (name_width + 7))  # 7 == len(' | Rows')
for name, cnt in file_counts:
    print(f"{name:<{name_width}} | {cnt}")

# Write the per-file counts followed by the three totals to a CSV report
from pathlib import Path
import csv as _csv
OUTPUT_CSV = Path('/Users/yann.jy/InvisibleResearch/outputs/reports/openalex_csv_row_counts_by_file.csv')
OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)
total_rows = [
    ('TOTAL_raw_csv_files', raw_total),
    ('TOTAL_merged_csv', merged_csv_rows),
    ('TOTAL_parquet', parquet_rows),
]
with open(OUTPUT_CSV, 'w', encoding='utf-8', newline='') as f:
    w = _csv.writer(f)
    w.writerow(['file', 'rows'])
    w.writerows(file_counts)
    w.writerows(total_rows)
print(f"\nExported per-file counts to: {OUTPUT_CSV}")

# Totals, then pairwise equality checks between the three sources
for label, value in (
    ('raw_csv_total_rows:', raw_total),
    ('merged_csv_rows:', merged_csv_rows),
    ('parquet_rows:', parquet_rows),
    ('raw_vs_merged_equal:', raw_total == merged_csv_rows),
    ('merged_vs_parquet_equal:', merged_csv_rows == parquet_rows),
    ('raw_vs_parquet_equal:', raw_total == parquet_rows),
):
    print(label, value)


CSV File            | Rows
--------------------------
works_1925_1999.csv | 50202
works_2000.csv      | 5935
works_2001.csv      | 6330
works_2002.csv      | 7409
works_2003.csv      | 8175
works_2004.csv      | 9280
works_2005.csv      | 10746
works_2006.csv      | 12064
works_2007.csv      | 13964
works_2008.csv      | 16663
works_2009.csv      | 19446
works_2010.csv      | 21854
works_2011.csv      | 25249
works_2012.csv      | 26399
works_2013.csv      | 27834
works_2014.csv      | 29861
works_2015.csv      | 30846
works_2016.csv      | 31598
works_2017.csv      | 31462
works_2018.csv      | 31229
works_2019.csv      | 32069
works_2020.csv      | 34761
works_2021.csv      | 28122
works_2022.csv      | 22354
works_2023.csv      | 22893
works_2024.csv      | 21750
works_2025.csv      | 12569

Exported per-file counts to: /Users/yann.jy/InvisibleResearch/outputs/reports/openalex_csv_row_counts_by_file.csv
raw_csv_total_rows: 591064
merged_csv_rows: 591064
parquet_rows: 591064
raw_vs_m