# 02 — Cleaning & Rules

## Zweck
Dieses Notebook bereinigt den exportierten **verbundenen Subgraphen** (bis zu **300k Tracks**), erzwingt einen **stabilen Datenvertrag (Data Contract)** und erzeugt einen **ML-fertigen Clean-Layer**.

## Input
- `../data/interim/converted_sqlite/*.csv`

## Output
- `../data/processed/clean_csv/*.csv`
- `../data/processed/parquet/*.parquet`
- `../data/reports/02_cleaning_and_rules/cleaning_report.json`

## Erwartete Tabellen (aus dem Exporter)
| Tabelle | Beschreibung | Schlüssel / Beziehung |
|---|---|---|
| `tracks` | Track-Stammdaten | PK: `track_id` |
| `audio_features` | Audio-Features pro Track | PK: `id` |
| `albums` | Album-Stammdaten | PK: `id` |
| `artists` | Artist-Stammdaten | PK: `id` |
| `genres` | Genre-Stammdaten | PK: `id` |
| `r_albums_tracks` | Zuordnung Album ↔ Track | (`album_id`, `track_id`) |
| `r_track_artist` | Zuordnung Track ↔ Artist | (`track_id`, `artist_id`) |
| `r_artist_genre` | Zuordnung Artist ↔ Genre | (`genre_id`, `artist_id`) |
| `r_albums_artists` | Zuordnung Album ↔ Artist | (`album_id`, `artist_id`) |

## Ergebnis
Am Ende steht eine **konsistente, validierte und reproduzierbare** Datenbasis:
- bereinigte CSVs (für schnelle Inspektion),
- Parquet (für effizientes Training/Batching),
- ein JSON-Report mit Regeln, Checks und Statistiken.


## Imports

In [3]:
from __future__ import annotations

import json
import math
import re
import time
import platform
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd

# Reproducibility: seed NumPy's global random generator with a fixed value.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Display limits used when rendering frames in the notebook.
for _option, _value in (
    ("display.max_columns", 250),
    ("display.max_rows", 40),
    ("display.width", 160),
):
    pd.set_option(_option, _value)

# Turn on pandas copy-on-write mode.
pd.options.mode.copy_on_write = True

## Config and Paths

In [4]:
@dataclass(frozen=True)
class PipelinePaths:
    """Immutable set of filesystem locations used by this notebook.

    All defaults are relative paths below ``../data``.
    """

    # Directory holding the input CSV files produced by the SQLite export.
    raw_dir: Path = Path("../data/interim/converted_sqlite")
    # Directory that receives the cleaned CSV files.
    clean_dir: Path = Path("../data/processed/clean_csv")
    # Directory that receives the Parquet files.
    parquet_dir: Path = Path("../data/processed/parquet")
    # NOTE(review): despite the name this is a directory, not a file; presumably
    # the report (cleaning_report.json) is written inside it -- confirm at the
    # point where the report is saved.
    report_path: Path = Path("../data/reports/02_cleaning_and_rules")

# Single shared path configuration for the cells below.
PATHS = PipelinePaths()

# Create every configured directory (including missing parents); directories
# that already exist are left alone. This covers the input directory
# ``raw_dir`` as well as the three output locations.
for p in (
    PATHS.clean_dir,
    PATHS.report_path,
    PATHS.raw_dir,
    PATHS.parquet_dir,
):
    p.mkdir(parents=True, exist_ok=True)

@dataclass(frozen=True)
class CleaningPolicy:
    """Immutable collection of thresholds and switches for the cleaning rules.

    The values are plain configuration; they are serialised into the run
    metadata via ``dataclasses.asdict``.
    """

    # Switch for removing rows of relation tables that do not resolve to an
    # existing entity (presumably applied to the ``r_*`` tables -- confirm at
    # the point of use).
    drop_orphan_bridge_rows: bool = True

    # Popularity handling. Spotify reports popularity on a 0-100 scale, so the
    # clip window must span that range; an upper bound of 0 would collapse
    # every value to 0.
    clip_popularity: bool = True
    popularity_min: int = 0
    popularity_max: int = 100

    # Quantiles intended as upper caps for duration and tempo outliers.
    duration_cap_quantile: float = 0.999
    tempo_cap_quantile: float = 0.999

    # Audio feature columns that, judging by the name, are expected to lie in
    # the closed interval [0, 1].
    audio_01_cols: Tuple[str, ...] = (
        "acousticness", "danceability", "energy", "instrumentalness",
        "liveness", "speechiness", "valence",
    )

    # Accepted (low, high) bounds for loudness and for the musical key, and the
    # pair of accepted values for ``mode``.
    loudness_range: Tuple[float, float] = (-60.0, 5.0)
    key_range: Tuple[int, int] = (0, 11)
    mode_values: Tuple[int, int] = (0, 1)


# Single shared policy instance for the cells below.
POLICY = CleaningPolicy()

# Provenance record for this run: timestamp, interpreter/platform/pandas
# versions, the random seed, and the path and policy configuration as plain
# dictionaries (paths are stringified).
RUN_META = dict(
    run_ts_unix=int(time.time()),
    python=platform.python_version(),
    platform=platform.platform(),
    pandas=pd.__version__,
    random_seed=RANDOM_SEED,
    paths={field_name: str(location) for field_name, location in asdict(PATHS).items()},
    policy=asdict(POLICY),
)

## Helper Utilities

In [None]:
def snake_case(s:str) -> str:
    """Return ``s`` lower-cased with non-word runs turned into single underscores.

    Surrounding whitespace is removed first, every run of non-word characters
    becomes one underscore, repeated underscores are collapsed, and leading or
    trailing underscores are dropped. Existing capitalisation boundaries are
    not split (``"trackId"`` becomes ``"trackid"``).
    """
    underscored = re.sub(r"[^\w]+", "_", s.strip())
    collapsed = re.sub(r"__+", "_", underscored)
    return collapsed.strip("_").lower()

def norm_str(s: pd.Series) -> pd.Series:
    """Return ``s`` as a pandas string Series with tidy whitespace.

    Each run of whitespace is replaced by one space, values are trimmed at
    both ends, and values that end up empty are replaced by ``pd.NA``.
    """
    as_text = s.astype("string")
    single_spaced = as_text.str.replace(r"\s+", " ", regex=True)
    trimmed = single_spaced.str.strip()
    return trimmed.replace("", pd.NA)
def to_int(s: pd.Series) -> pd.Series:
    """Convert ``s`` to the nullable ``Int64`` dtype.

    Values that cannot be read as numbers become missing.
    """
    numeric = pd.to_numeric(s, errors="coerce")
    return numeric.astype("Int64")

def to_float(s: pd.Series) -> pd.Series:
    """Convert ``s`` to ``float64``; values that are not numeric become NaN."""
    numeric = pd.to_numeric(s, errors="coerce")
    return numeric.astype("float64")

def to_bool(s: pd.Series) -> pd.Series:
    """Parse common textual truth values into the nullable ``boolean`` dtype.

    Matching ignores case and surrounding whitespace. Anything that is not
    one of the recognised tokens stays ``pd.NA``.
    """
    truthy = ["1", "true", "t", "yes", "y"]
    falsy = ["0", "false", "f", "no", "n"]

    tokens = s.astype("string").str.lower().str.strip()

    # Start from an all-missing result and fill in only the recognised tokens.
    parsed = pd.Series(pd.NA, index=s.index, dtype="boolean")
    parsed.loc[tokens.isin(truthy)] = True
    parsed.loc[tokens.isin(falsy)] = False
    return parsed

def memory_mb(df: pd.DataFrame) -> float:
    """Return the deep memory usage of ``df`` divided by 1024**2."""
    total_bytes = df.memory_usage(deep=True).sum()
    bytes_per_unit = 1024 ** 2
    return float(total_bytes) / bytes_per_unit

def keep_most_complete_row(df: pd.DataFrame, key_cols: List[str]) -> pd.DataFrame:
    """Resolve duplicates on ``key_cols`` by keeping the most complete row.

    Completeness is the number of non-null cells in a row. Among rows that
    share a key and are equally complete, the one that appears first in ``df``
    is kept, so the result is deterministic.

    Args:
        df: Table to de-duplicate. It is not modified.
        key_cols: Columns that identify a duplicate group.

    Returns:
        A new frame with one row per key, ordered by descending completeness
        (ties in input order) and carrying a fresh 0..n-1 index.
    """
    completeness = df.notna().sum(axis=1).to_numpy()

    # Rank positionally with a stable sort: ties keep their input order, and
    # duplicate index labels or a real column called "_nonnulls" cannot
    # interfere because no helper column or label lookup is involved.
    order = np.argsort(-completeness, kind="stable")
    ranked = df.iloc[order]

    deduped = ranked.drop_duplicates(subset=key_cols, keep="first")
    return deduped.reset_index(drop=True)

def clip_series(s: pd.Series, lo: float, hi: float) -> pd.Series:
    """Return ``s`` with values below ``lo`` raised to ``lo`` and above ``hi`` lowered to ``hi``."""
    bounded = s.clip(lo, hi)
    return bounded

def parse_date_any(s: pd.Series) -> pd.Series:
    """Parse date strings of mixed precision into datetimes.

    Handles ``YYYY``, ``YYYY-MM`` and ``YYYY-MM-DD`` (Spotify style) within
    the same column. Values that cannot be parsed become ``NaT``.

    A single generic ``pd.to_datetime`` call is not enough on recent pandas
    versions: the format is inferred from the first value and every value of
    a different precision is coerced to ``NaT``. Values left unparsed by the
    generic pass are therefore retried with each explicit format.

    Args:
        s: Series of date-like values.

    Returns:
        Series of datetimes with the same index as ``s``.
    """
    original_index = s.index
    # Work on a 0..n-1 index so label-based writes stay unambiguous even if
    # the caller's index contains duplicates.
    values = s.reset_index(drop=True)
    parsed = pd.to_datetime(values, errors="coerce")

    unparsed = parsed.isna() & values.notna()
    # Only retry when there is something to rescue and the generic result is a
    # plain (timezone-naive) datetime column that can take the rescued values.
    if unparsed.any() and pd.api.types.is_datetime64_dtype(parsed):
        leftovers = values[unparsed].astype(str).str.strip()
        for fmt in ("%Y-%m-%d", "%Y-%m", "%Y"):
            if leftovers.empty:
                break
            rescued = pd.to_datetime(leftovers, format=fmt, errors="coerce").dropna()
            if rescued.empty:
                continue
            parsed.loc[rescued.index] = rescued
            leftovers = leftovers.drop(rescued.index)

    parsed.index = original_index
    return parsed

def assert_gate(condition: bool, msg: str):
    """Raise ``AssertionError`` carrying ``msg`` unless ``condition`` is truthy.

    The exception is raised explicitly instead of using an ``assert``
    statement.
    """
    if condition:
        return
    raise AssertionError(f"QUALITY GATE FAILED: {msg}")


@dataclass
class TableProfile:
    """Summary statistics describing one table (DataFrame)."""

    # Label identifying the profiled table.
    name: str
    # Number of rows and columns.
    rows: int
    cols: int
    # Memory footprint of the table; profile_table fills this with the
    # rounded result of memory_mb().
    memory_mb: float
    # Count of missing values, keyed by column name.
    missing_by_col: Dict[str, int]
    # Number of rows that duplicate an earlier row across all columns;
    # None when not computed.
    duplicate_rows_full: Optional[int] = None
    # Number of rows that duplicate an earlier row on the key columns;
    # None when not computed.
    duplicate_rows_on_keys: Optional[int] = None

def profile_table(df: pd.DataFrame, name: str, key_cols: Optional[List[str]] = None) -> TableProfile:
    """Build a ``TableProfile`` describing ``df``.

    Args:
        df: Table to profile.
        name: Label stored in the profile.
        key_cols: Optional key columns. When given (and non-empty) duplicates
            are counted on these columns only; otherwise duplicates are
            counted across all columns.

    Returns:
        The profile with exactly one of the two duplicate counters filled in;
        the other stays ``None``.
    """
    null_counts = {col: int(df[col].isna().sum()) for col in df.columns}

    dup_full: Optional[int] = None
    dup_on_keys: Optional[int] = None
    if key_cols:
        dup_on_keys = int(df.duplicated(subset=key_cols).sum())
    else:
        dup_full = int(df.duplicated().sum())

    return TableProfile(
        name=name,
        rows=int(len(df)),
        cols=int(df.shape[1]),
        memory_mb=round(memory_mb(df), 2),
        missing_by_col=null_counts,
        duplicate_rows_full=dup_full,
        duplicate_rows_on_keys=dup_on_keys,
    )