# EraEx: NDJSON → CSV Conversion (Local)

This notebook reads `.ndjson.zst` files and outputs a **CSV file** for combining with your existing dataset.

**Output**: `data/raw/ndjson_converted.csv`

In [1]:
%pip install -r ../requirements.txt

Collecting zstandard (from -r ../requirements.txt (line 16))
  Downloading zstandard-0.25.0-cp311-cp311-win_amd64.whl.metadata (3.3 kB)
Downloading zstandard-0.25.0-cp311-cp311-win_amd64.whl (506 kB)
Installing collected packages: zstandard
Successfully installed zstandard-0.25.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.3 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Local setup - no Google Drive mount needed
from pathlib import Path
import os

# Current directory is 'notebooks', so project root is parent
# NOTE: this relies on the kernel's working directory; launching the notebook
# from a different directory resolves PROJECT_DIR to a different location.
PROJECT_DIR = Path.cwd().parent

# Directory scanned for NDJSON inputs and used for the converted CSV output.
RAW_DIR = PROJECT_DIR / 'data' / 'raw'
# Create the directory (and any missing parents) so later cells can glob/write into it.
RAW_DIR.mkdir(parents=True, exist_ok=True)

# Years to keep when filtering records: 2012 through 2018 (range end is exclusive).
YEAR_RANGE = range(2012, 2019)

# Echo the resolved paths so a wrong working directory is visible immediately.
print(f'Project Dir: {PROJECT_DIR}')
print(f'Raw Data Dir: {RAW_DIR}')

Project Dir: c:\Users\Yabuku\Downloads\EraEx
Raw Data Dir: c:\Users\Yabuku\Downloads\EraEx\data\raw


In [3]:
# Collect both zstd-compressed and plain NDJSON inputs from the raw data directory.
# Path.glob yields entries in arbitrary filesystem order, so sort by file name to make
# the processing order (and therefore the row order of the output CSV) reproducible.
ndjson_files = sorted(
    [*RAW_DIR.glob('*.ndjson.zst'), *RAW_DIR.glob('*.ndjson')],
    key=lambda path: path.name,
)
print(f'Found {len(ndjson_files)} NDJSON file(s):')
for f in ndjson_files:
    # Size is reported in decimal gigabytes (1e9 bytes).
    print(f'  - {f.name} ({f.stat().st_size / 1e9:.2f} GB)')

Found 1 NDJSON file(s):
  - 1-100m.ndjson.zst (10.71 GB)


In [4]:
import json
import zstandard as zstd
import polars as pl
from datetime import date
from typing import Iterator, Dict, Any, Optional

# Maps source JSON fields to output column names.
# Keys are dot-separated paths into each parsed JSON object (e.g. 'user.username'
# reads obj['user']['username']); values are the column names used in the output record.
NDJSON_FIELD_MAPPING = {
    'id': 'track_id',
    'title': 'title',
    'user.username': 'artist',
    'genre': 'genre',
    'tag_list': 'tags',
    'description': 'description',
    'playback_count': 'playback_count',
    'permalink_url': 'permalink_url',
    'created_at': 'created_at',
}

def extract_nested_field(obj: dict, path: str) -> Any:
    """Look up a dot-separated ``path`` inside a nested dictionary.

    Args:
        obj: The (possibly nested) dictionary to read from.
        path: Keys joined by ``.``, e.g. ``'user.username'``.

    Returns:
        The value found at ``path``, or ``None`` when any step of the path is
        missing or lands on something that is not a dictionary.
    """
    node = obj
    for key in path.split('.'):
        # A non-dict node (including None) cannot be descended into any further.
        if not isinstance(node, dict):
            return None
        node = node.get(key)
    return node

def parse_ndjson_line(line: str) -> Optional[Dict[str, Any]]:
    """Parse one NDJSON line into a flat record keyed by output column name.

    Args:
        line: A single line of text expected to contain one JSON object.

    Returns:
        A dict with one entry per value in ``NDJSON_FIELD_MAPPING`` plus a
        ``year`` entry, or ``None`` when the line is not valid JSON or does not
        decode to a JSON object.

        * ``year`` is the integer formed from the first four characters of
          ``created_at``, or ``None`` when that cannot be derived.
        * ``track_id`` is converted to ``str``; a missing or empty ID is ``None``.
    """
    try:
        obj = json.loads(line)
    except json.JSONDecodeError:
        return None

    # Valid JSON that is not an object (a bare number, string, list, ...) has no
    # fields to extract; reject it instead of emitting a record of all-None values.
    if not isinstance(obj, dict):
        return None

    mapped = {
        target_field: extract_nested_field(obj, source_field)
        for source_field, target_field in NDJSON_FIELD_MAPPING.items()
    }

    # Derive the year from the leading 'YYYY' of the timestamp. Missing, empty or
    # non-string timestamps fall through to None via the caught exceptions.
    created_at = mapped.get('created_at')
    if created_at:
        try:
            mapped['year'] = int(created_at[:4])
        except (ValueError, TypeError):
            mapped['year'] = None
    else:
        mapped['year'] = None

    # Normalise the ID to a string. Compare against None/'' explicitly rather than
    # relying on truthiness so that a legitimate numeric ID of 0 is not discarded.
    raw_id = mapped.get('track_id')
    mapped['track_id'] = None if raw_id is None or raw_id == '' else str(raw_id)

    return mapped

In [5]:
def stream_ndjson_zst(file_path: Path, chunk_size: int = 10000) -> Iterator[list]:
    """Stream an NDJSON file as lists of parsed records, filtered by ``YEAR_RANGE``.

    The file is read incrementally in 1 MiB blocks, so it never has to fit in
    memory. Files whose name ends in ``.zst`` are decompressed with zstandard;
    any other file is read as plain, uncompressed NDJSON.

    Args:
        file_path: Path to a ``.ndjson.zst`` or plain ``.ndjson`` file.
        chunk_size: Maximum number of records per yielded list.

    Yields:
        Lists of records (as returned by ``parse_ndjson_line``) whose ``year``
        is in ``YEAR_RANGE``. Every list except possibly the last holds exactly
        ``chunk_size`` records.
    """

    def _record_or_none(raw_line: bytes) -> Optional[dict]:
        """Decode and parse one raw line; return it only if its year is wanted."""
        if not raw_line.strip():
            return None
        # Undecodable bytes are dropped rather than aborting a multi-GB conversion.
        parsed = parse_ndjson_line(raw_line.decode('utf-8', errors='ignore'))
        if parsed and parsed.get('year') in YEAR_RANGE:
            return parsed
        return None

    is_compressed = Path(file_path).suffix.lower() == '.zst'

    with open(file_path, 'rb') as fh:
        if is_compressed:
            # read_across_frames=True keeps reading past the end of the first zstd
            # frame; with the default (False) a multi-frame archive would be
            # silently truncated after its first frame.
            reader = zstd.ZstdDecompressor().stream_reader(fh, read_across_frames=True)
        else:
            # Plain .ndjson input: read the raw file directly.
            reader = fh

        with reader:
            buffer = b''
            chunk = []

            while True:
                data = reader.read(1024 * 1024)
                if not data:
                    break

                buffer += data
                # Everything before the final newline is complete; the remainder
                # is a partial line carried over into the next read.
                *complete_lines, buffer = buffer.split(b'\n')

                for raw_line in complete_lines:
                    record = _record_or_none(raw_line)
                    if record is None:
                        continue

                    chunk.append(record)
                    if len(chunk) >= chunk_size:
                        yield chunk
                        chunk = []

            # The file may not end with a newline; handle the trailing line too.
            tail_record = _record_or_none(buffer)
            if tail_record is not None:
                chunk.append(tail_record)

            if chunk:
                yield chunk

In [6]:
# Convert every discovered NDJSON file into a single CSV, one chunk at a time.
output_csv = RAW_DIR / 'ndjson_converted.csv'
PROGRESS_EVERY = 500000  # print a progress line roughly every this many rows
first_chunk = True       # True until the first chunk (with header) has been written
total_rows = 0
next_report = PROGRESS_EVERY

if not ndjson_files:
    print('No NDJSON files found! Please place .ndjson.zst files in data/raw/')
else:
    for ndjson_file in ndjson_files:
        print(f'Processing: {ndjson_file.name}')

        for chunk in stream_ndjson_zst(ndjson_file, chunk_size=50000):
            df = pl.DataFrame(chunk)

            if first_chunk:
                # First write creates/overwrites the file and emits the header row.
                df.write_csv(output_csv)
                first_chunk = False
            else:
                # Later chunks are appended without repeating the header.
                with open(output_csv, 'ab') as f:
                    df.write_csv(f, include_header=False)

            total_rows += len(chunk)
            # Report on crossing a threshold rather than on an exact multiple: a
            # partial chunk (e.g. the last one of a file) shifts the running total
            # off the 500,000 grid, which would otherwise silence all later output.
            if total_rows >= next_report:
                print(f'  Processed: {total_rows:,} rows')
                next_report = (total_rows // PROGRESS_EVERY + 1) * PROGRESS_EVERY

    print(f'\nDone! Total rows: {total_rows:,}')
    if first_chunk:
        # Nothing was written in this run; any existing CSV at this path is left
        # over from an earlier run and must not be reported as fresh output.
        print(f'No matching rows found; {output_csv} was not written.')
    else:
        print(f'Output: {output_csv}')
        print(f'Size: {output_csv.stat().st_size / 1e9:.2f} GB')

Processing: 1-100m.ndjson.zst
  Processed: 500,000 rows
  Processed: 1,000,000 rows
  Processed: 1,500,000 rows
  Processed: 2,000,000 rows
  Processed: 2,500,000 rows
  Processed: 3,000,000 rows
  Processed: 3,500,000 rows
  Processed: 4,000,000 rows
  Processed: 4,500,000 rows
  Processed: 5,000,000 rows
  Processed: 5,500,000 rows
  Processed: 6,000,000 rows
  Processed: 6,500,000 rows
  Processed: 7,000,000 rows
  Processed: 7,500,000 rows
  Processed: 8,000,000 rows
  Processed: 8,500,000 rows
  Processed: 9,000,000 rows
  Processed: 9,500,000 rows
  Processed: 10,000,000 rows
  Processed: 10,500,000 rows
  Processed: 11,000,000 rows
  Processed: 11,500,000 rows
  Processed: 12,000,000 rows
  Processed: 12,500,000 rows
  Processed: 13,000,000 rows
  Processed: 13,500,000 rows
  Processed: 14,000,000 rows
  Processed: 14,500,000 rows
  Processed: 15,000,000 rows
  Processed: 15,500,000 rows
  Processed: 16,000,000 rows
  Processed: 16,500,000 rows
  Processed: 17,000,000 rows
  Pro