# EraEx: NDJSON → CSV Conversion (Colab)

This notebook reads `.ndjson.zst` files and outputs a **CSV file** for combining with your existing dataset.

**Output**: `data/raw/ndjson_converted.csv`

In [None]:
%pip install -r requirements.txt

In [None]:
# Mount Google Drive at /content/drive so later cells can read and write
# files stored there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from pathlib import Path
import json
import zstandard as zstd
import polars as pl
from datetime import date
from typing import Iterator, Dict, Any, Optional

# Project root on the mounted Drive; the raw-data folder sits beneath it.
PROJECT_DIR = Path('/content/drive/MyDrive/EraEx')
RAW_DIR = PROJECT_DIR.joinpath('data', 'raw')

# Years accepted by the conversion: 2012 through 2018 (the stop value is exclusive).
YEAR_RANGE = range(2012, 2019)

# Create the raw-data folder, including any missing parents, if it is absent.
RAW_DIR.mkdir(parents=True, exist_ok=True)

print(f'Raw data: {RAW_DIR}')

In [None]:
# Gather compressed (.ndjson.zst) matches first, then plain (.ndjson) matches,
# and list each one with its on-disk size in gigabytes.
ndjson_files = [*RAW_DIR.glob('*.ndjson.zst'), *RAW_DIR.glob('*.ndjson')]
print(f'Found {len(ndjson_files)} NDJSON file(s):')
for path in ndjson_files:
    size_bytes = path.stat().st_size
    print(f'  - {path.name} ({size_bytes / 1e9:.2f} GB)')

In [None]:
# Maps a field path in each source JSON record (key) to the name used for it
# in the parsed row (value). A dot in the key denotes nesting: 'user.username'
# is resolved as obj['user']['username'] by extract_nested_field.
# parse_ndjson_line iterates this dict in insertion order, so the order here
# also determines the key order of every parsed row.
NDJSON_FIELD_MAPPING = {
    'id': 'track_id',
    'title': 'title',
    'user.username': 'artist',
    'genre': 'genre',
    'tag_list': 'tags',
    'description': 'description',
    'playback_count': 'playback_count',
    'permalink_url': 'permalink_url',
    'created_at': 'created_at',
}

def extract_nested_field(obj: dict, path: str) -> Any:
    """Look up a dot-separated *path* inside nested dictionaries.

    Args:
        obj: The outermost mapping to search.
        path: Keys joined by ``.``, e.g. ``'user.username'``.

    Returns:
        The value found at the end of the path, or ``None`` when a key is
        missing or an intermediate value is not a dict.
    """
    node = obj
    for key in path.split('.'):
        # Anything that is not a dict (including None) cannot be descended into.
        if not isinstance(node, dict):
            return None
        node = node.get(key)
    return node

def parse_ndjson_line(line: str) -> Optional[Dict[str, Any]]:
    """Parse one NDJSON line into a flat row keyed by output column name.

    Args:
        line: A single line of NDJSON text.

    Returns:
        ``None`` if the line is not valid JSON. Otherwise a dict holding every
        target name from ``NDJSON_FIELD_MAPPING`` (``None`` where the source
        field is absent) plus ``year``: the integer formed by the first four
        characters of ``created_at``, or ``None`` when ``created_at`` is
        missing, not a string, or does not start with a number. ``track_id``
        is converted to ``str``; a falsy id becomes ``None``.
    """
    try:
        obj = json.loads(line)
    except json.JSONDecodeError:
        return None

    mapped = {
        target_field: extract_nested_field(obj, source_field)
        for source_field, target_field in NDJSON_FIELD_MAPPING.items()
    }

    # Slice only real strings. Slicing an arbitrary JSON value is unsafe: on
    # Python 3.12+ slicing a dict raises KeyError (slices are hashable there),
    # which would escape a ValueError/TypeError handler and abort the run.
    created_at = mapped.get('created_at')
    year = None
    if isinstance(created_at, str):
        try:
            year = int(created_at[:4])
        except ValueError:
            year = None
    mapped['year'] = year

    raw_id = mapped.get('track_id')
    mapped['track_id'] = str(raw_id) if raw_id else None

    return mapped

In [None]:
def _iter_raw_blocks(stream, block_size: int = 1024 * 1024) -> Iterator[bytes]:
    """Yield successive byte blocks read from *stream* until it returns no data."""
    while True:
        data = stream.read(block_size)
        if not data:
            return
        yield data


def _split_ndjson_lines(blocks) -> Iterator[bytes]:
    """Reassemble byte blocks into non-blank newline-delimited lines."""
    pending = b''
    for data in blocks:
        pending += data
        # The last piece may be an incomplete line; carry it into the next block.
        *complete, pending = pending.split(b'\n')
        for line in complete:
            if line.strip():
                yield line
    # A final line without a trailing newline is still a record.
    if pending.strip():
        yield pending


def stream_ndjson_zst(file_path: Path, chunk_size: int = 10000) -> Iterator[list]:
    """Stream an NDJSON file and yield lists of parsed, year-filtered rows.

    Files whose name ends in ``.zst`` are decompressed on the fly with
    zstandard; any other file is read as plain, uncompressed NDJSON. (The
    notebook discovers both ``*.ndjson.zst`` and ``*.ndjson`` files and passes
    them all here, so plain input must not be fed to the decompressor.)

    Args:
        file_path: Path of the ``.ndjson.zst`` or ``.ndjson`` file to read.
        chunk_size: Number of rows after which a chunk is yielded.

    Yields:
        Lists of row dicts produced by ``parse_ndjson_line`` whose ``year`` is
        in ``YEAR_RANGE``. Every list except possibly the last holds
        ``chunk_size`` rows; lines that fail to parse are skipped.
    """
    chunk = []
    with open(file_path, 'rb') as fh:
        if Path(file_path).suffix.lower() == '.zst':
            source = zstd.ZstdDecompressor().stream_reader(fh)
        else:
            source = fh

        # Both sources are context managers; re-entering the already-open
        # plain file handle is harmless and keeps a single code path.
        with source as reader:
            for raw_line in _split_ndjson_lines(_iter_raw_blocks(reader)):
                # Undecodable bytes are dropped rather than failing the line.
                parsed = parse_ndjson_line(raw_line.decode('utf-8', errors='ignore'))
                if parsed and parsed.get('year') in YEAR_RANGE:
                    chunk.append(parsed)
                    if len(chunk) >= chunk_size:
                        yield chunk
                        chunk = []

    if chunk:
        yield chunk

In [None]:
# Convert every discovered NDJSON file into one CSV, appending chunk by chunk
# so the whole dataset never has to be held in memory.
output_csv = RAW_DIR / 'ndjson_converted.csv'
first_chunk = True
total_rows = 0

# Report progress each time the running total crosses a multiple of this step.
# (Testing `total_rows % step == 0` stops firing once a partial final chunk
# from any file leaves the total misaligned.)
progress_step = 500000
next_report = progress_step

for ndjson_file in ndjson_files:
    print(f'Processing: {ndjson_file.name}')

    for chunk in stream_ndjson_zst(ndjson_file, chunk_size=50000):
        # Infer dtypes from the whole chunk: with the default look-ahead a
        # column that is null in the first rows and populated later can fail
        # to build.
        df = pl.DataFrame(chunk, infer_schema_length=None)

        if first_chunk:
            # The first write creates/overwrites the file and emits the header.
            df.write_csv(output_csv)
            first_chunk = False
        else:
            with open(output_csv, 'ab') as f:
                df.write_csv(f, include_header=False)

        total_rows += len(chunk)
        if total_rows >= next_report:
            print(f'  Processed: {total_rows:,} rows')
            next_report = (total_rows // progress_step + 1) * progress_step

print(f'\nDone! Total rows: {total_rows:,}')
if first_chunk:
    # Nothing was written in this run, so there is no fresh output to stat.
    print('No rows matched; nothing was written.')
else:
    print(f'Output: {output_csv}')
    print(f'Size: {output_csv.stat().st_size / 1e9:.2f} GB')