# EraEx: Combined CSV Ingestion (Local)

This notebook reads **BOTH** CSV files and converts them to Parquet:
1. `dataset.csv` (existing data)
2. `ndjson_converted.csv` (from notebook 01)

In [1]:
%pip install -r ../requirements.txt

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.3 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Local setup - no Google Drive mount needed
from pathlib import Path
import polars as pl
from datetime import date
import os

# Current directory is 'notebooks', so project root is parent
PROJECT_DIR = Path.cwd().parent

RAW_DIR = PROJECT_DIR / 'data' / 'raw'
PROCESSED_DIR = PROJECT_DIR / 'data' / 'processed'

PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

YEAR_RANGE = range(2012, 2019)

CSV_FILES = [
    RAW_DIR / 'dataset.csv',
    RAW_DIR / 'ndjson_converted.csv'
]

print('CSV files to process:')
for f in CSV_FILES:
    if f.exists():
        print(f'  ✓ {f.name} ({f.stat().st_size / 1e9:.2f} GB)')
    else:
        print(f'  ✗ {f.name} (NOT FOUND)')

CSV files to process:
  ✓ dataset.csv (21.99 GB)
  ✓ ndjson_converted.csv (5.49 GB)


In [3]:
COLUMN_MAPPING = {
    'id': 'track_id',
    'soundcloud_id': 'track_id',
    'user': 'artist',
    'username': 'artist',
    'tag_list': 'tags',
    'plays': 'playback_count',
    'url': 'permalink_url',
    'date': 'created_at',
}

In [4]:
def ingest_csv(csv_path: Path, batch_counter: int = 0):
    if not csv_path.exists():
        print(f'Skipping {csv_path.name} (not found)')
        return batch_counter, {}
    
    print(f'\nProcessing: {csv_path.name}')
    
    stats = {year: 0 for year in YEAR_RANGE}
    
    reader = pl.read_csv_batched(
        csv_path,
        batch_size=500000,
        ignore_errors=True,
        truncate_ragged_lines=True,
        infer_schema_length=10000
    )
    
    while True:
        batches = reader.next_batches(1)
        if not batches:
            break
        
        df = batches[0]
        
        rename_map = {}
        for old_col in df.columns:
            old_lower = old_col.lower().strip()
            if old_lower in COLUMN_MAPPING:
                rename_map[old_col] = COLUMN_MAPPING[old_lower]
        if rename_map:
            df = df.rename(rename_map)
        
        if 'track_id' not in df.columns:
            if 'id' in df.columns:
                df = df.rename({'id': 'track_id'})
        
        if 'track_id' in df.columns:
            df = df.with_columns([pl.col('track_id').cast(pl.Utf8)])
        
        if 'year' not in df.columns and 'created_at' in df.columns:
            df = df.with_columns([
                pl.col('created_at').cast(pl.Utf8).str.slice(0, 4).cast(pl.Int32, strict=False).alias('year')
            ])
        
        df = df.with_columns([pl.lit(date.today()).alias('ingest_date')])
        
        for year in YEAR_RANGE:
            year_df = df.filter(pl.col('year') == year)
            if year_df.height == 0:
                continue
            
            stats[year] += year_df.height
            
            year_dir = PROCESSED_DIR / f'year={year}'
            year_dir.mkdir(parents=True, exist_ok=True)
            
            batch_counter += 1
            out_path = year_dir / f'batch_{batch_counter:05d}.parquet'
            
            year_df = year_df.with_columns([pl.col('ingest_date').cast(pl.Date)])
            year_df.write_parquet(out_path)
        
        total_so_far = sum(stats.values())
        if total_so_far % 500000 == 0:
            print(f'  Processed: {total_so_far:,} rows')
    
    return batch_counter, stats

In [5]:
batch_counter = 0
all_stats = {}

for csv_file in CSV_FILES:
    batch_counter, stats = ingest_csv(csv_file, batch_counter)
    all_stats[csv_file.name] = stats
    print(f"  Total: {sum(stats.values()):,} rows")


Processing: dataset.csv
  Total: 40,828,069 rows

Processing: ndjson_converted.csv
  Processed: 500,000 rows
  Processed: 1,000,000 rows
  Processed: 1,500,000 rows
  Processed: 2,000,000 rows
  Processed: 2,500,000 rows
  Processed: 3,000,000 rows
  Processed: 3,500,000 rows
  Processed: 4,000,000 rows
  Processed: 4,500,000 rows
  Processed: 5,000,000 rows
  Processed: 5,500,000 rows
  Processed: 6,000,000 rows
  Processed: 6,500,000 rows
  Processed: 7,000,000 rows
  Processed: 7,500,000 rows
  Processed: 8,000,000 rows
  Processed: 8,500,000 rows
  Processed: 9,000,000 rows
  Processed: 9,500,000 rows
  Processed: 10,000,000 rows
  Processed: 10,500,000 rows
  Processed: 11,000,000 rows
  Processed: 11,500,000 rows
  Processed: 12,000,000 rows
  Processed: 12,500,000 rows
  Processed: 13,000,000 rows
  Processed: 13,500,000 rows
  Processed: 14,000,000 rows
  Processed: 14,500,000 rows
  Processed: 15,000,000 rows
  Processed: 15,500,000 rows
  Processed: 16,000,000 rows
  Process

In [6]:
print('\n' + '=' * 50)
print('INGEST COMPLETE')
print('=' * 50)

grand_total = 0
for year_dir in sorted(PROCESSED_DIR.glob('year=*')):
    parquet_files = list(year_dir.glob('*.parquet'))
    total_rows = sum(pl.scan_parquet(f).select(pl.count()).collect().item() for f in parquet_files)
    grand_total += total_rows
    print(f'{year_dir.name}: {total_rows:,} rows in {len(parquet_files)} files')

print(f'\nGrand Total: {grand_total:,} rows')


INGEST COMPLETE


(Deprecated in version 0.20.5)
  total_rows = sum(pl.scan_parquet(f).select(pl.count()).collect().item() for f in parquet_files)


year=2012: 18,246,291 rows in 172 files
year=2013: 16,636,024 rows in 174 files
year=2014: 6,546,780 rows in 139 files
year=2015: 6,014,630 rows in 139 files
year=2016: 5,227,711 rows in 139 files
year=2017: 4,216,928 rows in 139 files
year=2018: 4,657,187 rows in 139 files

Grand Total: 61,545,551 rows
