# CSV vs Parquet Row Count Validation

This notebook validates that the merged CSV and the merged Parquet file each contain every row from the raw CSV files by comparing:
- Total rows across all CSVs in `data/raw/openalex_data/`
- Total rows in `data/processed/openalex_merged.csv`
- Total rows in `data/processed/openalex_merged.parquet`

It uses robust, streaming-friendly methods suitable for large files.



In [None]:
from __future__ import annotations
import os
import csv
from pathlib import Path
from typing import Iterable, List, Tuple

# NOTE(review): these are absolute, machine-specific paths; adjust them before
# running the notebook on another machine.
# Root directory that is searched recursively for the raw CSV files.
RAW_DIR = Path('/Users/yann.jy/InvisibleResearch/data/raw/openalex_data')
# Merged CSV whose data-row count is compared with the raw total.
MERGED_CSV = Path('/Users/yann.jy/InvisibleResearch/data/processed/openalex_merged.csv')
# Merged Parquet file whose row count is compared with both CSV counts.
MERGED_PARQUET = Path('/Users/yann.jy/InvisibleResearch/data/processed/openalex_merged.parquet')


def list_csv_files(root: Path) -> List[Path]:
    """Return every CSV file found beneath ``root``, sorted by path.

    The tree is walked recursively with ``os.walk``; a file qualifies when its
    name ends in ``.csv``, compared case-insensitively.
    """
    matches = [
        Path(folder) / filename
        for folder, _subdirs, filenames in os.walk(root)
        for filename in filenames
        if filename.lower().endswith('.csv')
    ]
    return sorted(matches)


def count_csv_rows(file_path: Path) -> int:
    """Return the number of CSV records that follow the header line.

    Parsing goes through ``csv.reader``, so a quoted field spanning several
    physical lines still counts as a single record. Undecodable bytes are
    replaced instead of raising, and an empty file yields 0.
    """
    with open(file_path, 'r', encoding='utf-8', errors='replace', newline='') as handle:
        records = csv.reader(
            handle,
            delimiter=',',
            quotechar='"',
            doublequote=True,
            escapechar='\\',
        )
        next(records, None)  # discard the header when there is one
        return sum(1 for _ in records)


def count_parquet_rows(parquet_path: Path) -> int:
    """Count rows in a Parquet file, preferring the file's own metadata.

    The row count is first read from the Parquet footer through ``pyarrow``.
    If that fails for any reason (pyarrow missing, unreadable file, ...), the
    count is obtained with a DuckDB ``COUNT(*)`` query instead.

    Args:
        parquet_path: Location of the Parquet file.

    Returns:
        The number of rows reported for the file.

    Raises:
        RuntimeError: If the DuckDB fallback also fails. The DuckDB error is
            attached as the explicit cause.
    """
    try:
        import pyarrow.parquet as pq  # type: ignore

        return pq.ParquetFile(str(parquet_path)).metadata.num_rows
    except Exception:
        # Deliberately broad: any pyarrow problem triggers the DuckDB fallback.
        try:
            import duckdb  # type: ignore

            # Double single quotes so a path containing "'" cannot break the SQL literal.
            sql_path = parquet_path.as_posix().replace("'", "''")
            con = duckdb.connect()
            try:
                row = con.execute(
                    f"SELECT COUNT(*) FROM read_parquet('{sql_path}')"
                ).fetchone()
            finally:
                # Release the connection even when the query raises.
                con.close()
            return row[0]
        except Exception as e2:
            raise RuntimeError(f"Failed to count Parquet rows: {e2}") from e2



In [None]:
# Compute counts + per-file breakdown and export
from pathlib import Path
import csv as _csv

raw_files = list_csv_files(RAW_DIR)

# One (path relative to RAW_DIR, data-row count) pair per raw CSV file
file_counts = [
    (csv_path.relative_to(RAW_DIR).as_posix(), count_csv_rows(csv_path))
    for csv_path in raw_files
]
raw_total = sum(row_count for _rel, row_count in file_counts)

# Render the aligned table: header, separator, then one line per file
name_width = max((len(name) for name, _ in file_counts), default=10)
table_lines = [f"{'CSV File'.ljust(name_width)} | Rows", '-' * (name_width + 7)]
table_lines.extend(f"{name.ljust(name_width)} | {cnt}" for name, cnt in file_counts)
print('\n'.join(table_lines))

# Write the same breakdown to a CSV report
OUTPUT_CSV = Path('/Users/yann.jy/InvisibleResearch/outputs/reports/openalex_csv_row_counts_by_file.csv')
OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)
with open(OUTPUT_CSV, 'w', encoding='utf-8', newline='') as report:
    report_writer = _csv.writer(report)
    report_writer.writerow(['file', 'rows'])
    report_writer.writerows(file_counts)
print(f"\nExported per-file counts to: {OUTPUT_CSV}")

# Totals for the merged outputs, followed by the three pairwise parity checks
merged_csv_rows = count_csv_rows(MERGED_CSV)
parquet_rows = count_parquet_rows(MERGED_PARQUET)

summary = [
    ('raw_csv_total_rows:', raw_total),
    ('merged_csv_rows:', merged_csv_rows),
    ('parquet_rows:', parquet_rows),
    ('raw_vs_merged_equal:', raw_total == merged_csv_rows),
    ('merged_vs_parquet_equal:', merged_csv_rows == parquet_rows),
    ('raw_vs_parquet_equal:', raw_total == parquet_rows),
]
for label, value in summary:
    print(label, value)


In [None]:
# Per-file CSV row counts breakdown and export
from __future__ import annotations
import csv
from pathlib import Path
from typing import List, Tuple

RAW_DIR = Path('/Users/yann.jy/InvisibleResearch/data/raw/openalex_data')
OUTPUT_CSV = Path('/Users/yann.jy/InvisibleResearch/outputs/reports/openalex_csv_row_counts_by_file.csv')

# Reuse count_csv_rows if already defined in previous cells; otherwise define here
try:
    count_csv_rows  # type: ignore[name-defined]
except NameError:
    def count_csv_rows(file_path: Path) -> int:
        """Count data rows in a CSV (excluding header), robust to quoted newlines."""
        with open(file_path, 'r', encoding='utf-8', errors='replace', newline='') as f:
            reader = csv.reader(f, delimiter=',', quotechar='"', doublequote=True, escapechar='\\')
            next(reader, None)  # skip header
            return sum(1 for _row in reader)

# Collect per-file counts.
# The '.csv' suffix is matched case-insensitively and only regular files are
# kept, so this cell selects files the same way as list_csv_files();
# rglob('*.csv') is case-sensitive on POSIX and would also yield directories
# whose name ends in '.csv'.
raw_csv_paths = sorted(
    p for p in RAW_DIR.rglob('*')
    if p.is_file() and p.name.lower().endswith('.csv')
)
file_counts: List[Tuple[str, int]] = [
    (p.relative_to(RAW_DIR).as_posix(), count_csv_rows(p)) for p in raw_csv_paths
]

# Print as a simple aligned table
name_width = max((len(name) for name, _ in file_counts), default=10)
print(f"{'CSV File'.ljust(name_width)} | Rows")
print('-' * (name_width + 7))
for name, cnt in file_counts:
    print(f"{name.ljust(name_width)} | {cnt}")

# Export to CSV
OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)
with open(OUTPUT_CSV, 'w', encoding='utf-8', newline='') as f:
    w = csv.writer(f)
    w.writerow(['file', 'rows'])
    w.writerows(file_counts)

print(f"\nExported per-file counts to: {OUTPUT_CSV}")


In [None]:
# Per-file CSV row counts breakdown and export (appended)
from pathlib import Path
import csv as _csv

RAW_DIR = Path('/Users/yann.jy/InvisibleResearch/data/raw/openalex_data')
OUTPUT_CSV = Path('/Users/yann.jy/InvisibleResearch/outputs/reports/openalex_csv_row_counts_by_file.csv')

# Pair each raw CSV (path relative to RAW_DIR) with its data-row count,
# relying on the helpers defined earlier in the notebook
file_counts = [
    (csv_path.relative_to(RAW_DIR).as_posix(), count_csv_rows(csv_path))
    for csv_path in list_csv_files(RAW_DIR)
]

# Render the aligned table: header, separator, then one line per file
name_width = max((len(n) for n, _ in file_counts), default=10)
table_lines = [f"{'CSV File'.ljust(name_width)} | Rows", '-' * (name_width + 7)]
table_lines.extend(f"{name.ljust(name_width)} | {cnt}" for name, cnt in file_counts)
print('\n'.join(table_lines))

# Write the report, creating the output directory when it is missing
OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)
with open(OUTPUT_CSV, 'w', encoding='utf-8', newline='') as report:
    report_writer = _csv.writer(report)
    report_writer.writerow(['file', 'rows'])
    for entry in file_counts:
        report_writer.writerow(entry)

print(f"\nExported per-file counts to: {OUTPUT_CSV}")


In [None]:
# Per-file CSV row counts breakdown and export
from __future__ import annotations
import csv
from pathlib import Path
from typing import List, Tuple

RAW_DIR = Path('/Users/yann.jy/InvisibleResearch/data/raw/openalex_data')
OUTPUT_CSV = Path('/Users/yann.jy/InvisibleResearch/outputs/reports/openalex_csv_row_counts_by_file.csv')

# Reuse count_csv_rows if already defined in previous cells; otherwise define here
try:
    count_csv_rows  # type: ignore[name-defined]
except NameError:
    def count_csv_rows(file_path: Path) -> int:
        """Count data rows in a CSV (excluding header), robust to quoted newlines."""
        with open(file_path, 'r', encoding='utf-8', errors='replace', newline='') as f:
            reader = csv.reader(f, delimiter=',', quotechar='"', doublequote=True, escapechar='\\')
            next(reader, None)  # skip header
            return sum(1 for _row in reader)

# Collect per-file counts.
# The '.csv' suffix is matched case-insensitively and only regular files are
# kept, so this cell selects files the same way as list_csv_files();
# rglob('*.csv') is case-sensitive on POSIX and would also yield directories
# whose name ends in '.csv'.
raw_csv_paths = sorted(
    p for p in RAW_DIR.rglob('*')
    if p.is_file() and p.name.lower().endswith('.csv')
)
file_counts: List[Tuple[str, int]] = [
    (p.relative_to(RAW_DIR).as_posix(), count_csv_rows(p)) for p in raw_csv_paths
]

# Print as a simple aligned table
name_width = max((len(name) for name, _ in file_counts), default=10)
print(f"{'CSV File'.ljust(name_width)} | Rows")
print('-' * (name_width + 7))
for name, cnt in file_counts:
    print(f"{name.ljust(name_width)} | {cnt}")

# Export to CSV
OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)
with open(OUTPUT_CSV, 'w', encoding='utf-8', newline='') as f:
    w = csv.writer(f)
    w.writerow(['file', 'rows'])
    w.writerows(file_counts)

print(f"\nExported per-file counts to: {OUTPUT_CSV}")


In [None]:
# Optional: year-range check for 1925-1999 on both CSV and Parquet
from collections import Counter

def csv_year_distribution(file_path: Path, year_col: str = 'publication_year') -> Tuple[int, Counter]:
    """Count the data rows of a CSV and tally the years found in one column.

    Args:
        file_path: CSV file to scan; read as UTF-8 with undecodable bytes
            replaced.
        year_col: Header name of the year column. Header names are stripped
            of surrounding whitespace before matching.

    Returns:
        ``(total, counter)``: ``total`` is the number of records after the
        header and ``counter`` maps each year within 1900..2100 to the number
        of rows carrying it. Rows whose year cell is missing, not a plain
        decimal integer, or outside that range count toward ``total`` only.
        An empty file yields ``(0, Counter())``; when ``year_col`` is not in
        the header the counter stays empty.
    """
    total = 0
    counter: Counter = Counter()
    with open(file_path, 'r', encoding='utf-8', errors='replace', newline='') as f:
        reader = csv.reader(f, delimiter=',', quotechar='"', doublequote=True, escapechar='\\')
        header = next(reader, None)
        if header is None:
            # Empty file: nothing to count (a bare next() would raise StopIteration).
            return total, counter
        name_to_idx = {name.strip(): i for i, name in enumerate(header)}
        yi = name_to_idx.get(year_col)
        for row in reader:
            total += 1
            if yi is None or yi >= len(row):
                continue  # column absent, or this row is too short to hold it
            v = row[yi].strip()
            # isdecimal() rather than isdigit(): isdigit() also accepts characters
            # such as '²' that int() rejects with ValueError.
            # NOTE(review): float-formatted years such as '1985.0' are not
            # counted; confirm the merged CSV never writes them.
            if v.isdecimal():
                y = int(v)
                if 1900 <= y <= 2100:
                    counter[y] += 1
    return total, counter

# Sum the per-year counts for 1925-1999 found in the merged CSV.
raw1925_total, raw1925_counter = csv_year_distribution(MERGED_CSV)
print('Merged CSV 1925-1999 sum:', sum(c for y, c in raw1925_counter.items() if 1925 <= y <= 1999))

# Same range on the Parquet side via DuckDB. The check is optional, so problems
# are reported instead of raised, but a missing DuckDB install is reported
# separately from a failing query.
try:
    import duckdb
except ImportError as e:
    print('DuckDB not available for year-range check:', e)
else:
    con = None
    try:
        con = duckdb.connect()
        pq_yr = con.execute(
            f"""
        WITH t AS (
          SELECT try_cast(publication_year AS INTEGER) AS py
          FROM read_parquet('{MERGED_PARQUET.as_posix()}')
        )
        SELECT py, COUNT(*) AS c FROM t WHERE py BETWEEN 1925 AND 1999 GROUP BY py ORDER BY py
        """
        ).fetchall()
        print('Parquet 1925-1999 sum:', sum(c for _, c in pq_yr))
    except Exception as e:
        # DuckDB is installed but connecting, reading or querying failed.
        print('Parquet year-range check failed:', e)
    finally:
        # Release the connection whether or not the query succeeded.
        if con is not None:
            con.close()
