# 04 — Targets, Feature und Matrizen erstellen

## Ziel
Dieses Notebook erstellt die Zielvariablen (Targets) und die Feature-Matrizen für die Modellierung. Es speichert die vorbereiteten Daten und Modell-Pipelines für die spätere Verwendung.

Am Ende erzeugen wir:
- Zielvariablen (Targets) für die Modellierung
- Feature-Matrizen für die Modellierung


## Imports und Setup


In [66]:
import numpy as np
import pandas as pd
from typing import Dict



from utils.features.multihot import top_k_list_counts,genres_to_multihot,ensure_list_fast
from utils.features.time_features import pick_release_cols_fast


from utils.config.settings import (
HIT_PERCENTILE,
MOOD_TAGS,
TOP_K_GENRES,
ALLOW_TEXT_FEATURES
)

import importlib
import utils.core.paths as paths

importlib.reload(paths)

SAMPLE_NAME = paths.load_sample_name()
PATHS = paths.make_paths(SAMPLE_NAME)
paths.ensure_dirs(PATHS)



## Laden von Daten

In [67]:
track_df = pd.read_parquet(PATHS.modeling_dir / "track_dataset.parquet")
album_df = pd.read_parquet(PATHS.modeling_dir / "album_dataset.parquet")
artist_df = pd.read_parquet(PATHS.modeling_dir / "artist_dataset.parquet")


dur_col = "duration" if "duration" in track_df.columns else ("duration_ms" if "duration_ms" in track_df.columns else None)


## Target-Variablen
Wir definieren mehrere Zielvariablen (Targets) für verschiedene Modellierungsaufgaben:

- **(A) Success Percentile within Cohort**
  *Ranking / Top-K Target*
  → relative Erfolgsposition eines Tracks innerhalb seines Release-Kohortenjahres

- **(B) Success Residual within Cohort**
  *Overperformance-Target*
  → misst, ob ein Track **besser oder schlechter als erwartet** performt (relativ zur Kohorte)

- **(C) Hit-Vorhersage**
  *Binäres Klassifikationsziel*
  → basiert auf **jahresrelativen Popularitäts-Schwellenwerten** (robust gegen Zeit- und Katalog-Bias)

- **(D) Mood-Tags**
  *Multi-Label-Target (schwache Supervision)*
  → abgeleitet aus Quantilen ausgewählter Audio-Features
  (z. B. Energy, Valence, Danceability)

- **(E) Artist Trajectory Target**
  *Zeitverschobenes Wachstums-Target*
  → zukünftige Popularitätsentwicklung eines Künstlers
  (z. B. Popularitäts-Δ oder Breakout-Indikator über einen festen Horizont)


In [68]:
# ---------------- PREPROCESSING (VECTORIZED) ----------------
# Fix release columns
for src, tgt in [("album_release_year", "release_year"), ("album_release_month", "release_month")]:
    if tgt not in track_df.columns and src in track_df.columns:
        track_df[tgt] = track_df[src]

YCOL_C, MCOL_C = pick_release_cols_fast(track_df)

# Cohort timestamp (vectorized)
track_df["cohort_ym"] = (
        pd.to_numeric(track_df[YCOL_C], errors="coerce").astype("Int64") * 100 +
        pd.to_numeric(track_df[MCOL_C], errors="coerce").astype("Int64")
).astype("Int64")

pop = pd.to_numeric(track_df["popularity"], errors="coerce").astype("float64")

# ------------------------------------------------------------
# (A) + (B): Cohort-based Track Targets - FIXED SINGLE PASS
# ------------------------------------------------------------
# FIX: Separate agg calls (pandas compatibility)
cohort_ranks = track_df.groupby("cohort_ym", sort=False)["popularity"].rank(pct=True) * 100
cohort_means = track_df.groupby("cohort_ym", sort=False)["popularity"].transform("mean")

track_df["success_pct_in_cohort"] = cohort_ranks.astype("float64")
track_df["success_residual_in_cohort"] = (pop - cohort_means).astype("float64")

y_success_pct = track_df["success_pct_in_cohort"].astype("float64")
y_success_residual = track_df["success_residual_in_cohort"].astype("float64")


# ------------------------------------------------------------
# (C) Hit Label (robust, year-relative) - OPTIMIZED
# ------------------------------------------------------------
def build_hit_labels_fast(df: pd.DataFrame, hit_percentile: float = 0.90,
                          desired_rate: float = 0.10, min_tracks_per_year: int = 200) -> pd.Series:
    pop_ = pd.to_numeric(df["popularity"], errors="coerce").astype("float64")
    year_ = pd.to_numeric(df.get("release_year", np.nan), errors="coerce").round().astype("Int64")

    # Global fallback threshold (non-zero only)
    nz_pop = pop_[(pop_ > 0) & pop_.notna()]
    global_thr = float(nz_pop.quantile(hit_percentile)) if len(nz_pop) > 0 else 0.0

    # Year-specific thresholds (vectorized)
    year_pop = pd.DataFrame({"year": year_, "pop": pop_}).dropna(subset=["year", "pop"])
    if len(year_pop) > 0:
        year_counts = year_pop["year"].value_counts()
        good_years = year_counts[year_counts >= min_tracks_per_year].index
        good_pop = year_pop[year_pop["year"].isin(good_years)]

        if len(good_pop) > 0:
            year_thr = good_pop.groupby("year")["pop"].quantile(hit_percentile)
            thr_map = year_.map(year_thr).fillna(global_thr)
        else:
            thr_map = pd.Series(global_thr, index=df.index)
    else:
        thr_map = pd.Series(global_thr, index=df.index)

    y = (pop_ >= thr_map).fillna(False).astype("int8")

    # Ensure minimum hit rate
    if y.mean() < desired_rate:
        n = len(y)
        k = max(1, int(desired_rate * n))
        top_idx = pop_.fillna(-np.inf).nlargest(k).index
        y = pd.Series(0, index=y.index, dtype="int8")
        y.loc[top_idx] = 1

    return y


y_hit = build_hit_labels_fast(track_df, hit_percentile=float(HIT_PERCENTILE))

# ------------------------------------------------------------
# (E) Mood Tags - OPTIMIZED
# ------------------------------------------------------------
mood_thresholds: Dict[str, float] = {}
for name, col, q, direction in MOOD_TAGS:
    if col in track_df.columns:
        vals = pd.to_numeric(track_df[col], errors="coerce").dropna()
        mood_thresholds[name] = float(vals.quantile(q)) if len(vals) > 0 else np.nan


def build_mood_labels_fast(df: pd.DataFrame, thresholds: Dict[str, float]) -> pd.DataFrame:
    out = pd.DataFrame(index=df.index, dtype="int8")
    for name, thr in thresholds.items():
        if np.isnan(thr):
            out[name] = 0
            continue
        col = next((c for n, c, _, _ in MOOD_TAGS if n == name), None)
        if col not in df.columns:
            out[name] = 0
            continue
        x = pd.to_numeric(df[col], errors="coerce")
        direction = next((d for n, c, _, d in MOOD_TAGS if n == name), "gt")
        if direction == "gt":
            out[name] = (x >= thr).fillna(False).astype("int8")
        else:
            out[name] = (x <= thr).fillna(False).astype("int8")
    return out


Y_mood = build_mood_labels_fast(track_df, mood_thresholds)

# -----------------------------
# (F) Artist Trajectory Targets (OPTIMIZED)
# -----------------------------
PAST_M = 6
FUTURE_M = 6
MIN_PAST_TRACKS = 5
BREAKOUT_Q = 0.90

# Create release_month_ts if missing
if "release_month_ts" not in track_df.columns:
    date_cols = ["album_release_date_parsed", "release_date_parsed", "release_date"]
    used_date = next((c for c in date_cols if c in track_df.columns), None)
    if used_date:
        track_df["release_month_ts"] = pd.to_datetime(track_df[used_date], errors="coerce").dt.to_period(
            "M").dt.to_timestamp()
    else:
        track_df["release_month_ts"] = pd.to_datetime(
            track_df["release_year"].astype(str) + "-" + track_df["release_month"].astype(str) + "-01",
            errors="coerce"
        )

# Audio columns (limited for speed)
AUDIO_COLS = [c for c in ["danceability", "energy", "valence", "tempo"] if c in track_df.columns]

# Track ID
TRACK_ID_COL = "track_id" if "track_id" in track_df.columns else "id"
assert TRACK_ID_COL in track_df.columns

track_df["_hit"] = y_hit.astype("int8")

# ---------------- BASE TRACKS ----------------
base_cols = [TRACK_ID_COL, "release_month_ts", "popularity", "_hit"] + AUDIO_COLS[:2]
base_tracks = track_df[base_cols].copy()
base_tracks.columns = ["track_id", "release_month_ts", "popularity", "hit"] + AUDIO_COLS[:2]
base_tracks["track_id"] = base_tracks["track_id"].astype(str)
base_tracks["popularity"] = pd.to_numeric(base_tracks["popularity"], errors="coerce")
base_tracks["hit"] = base_tracks["hit"].astype("int8")

for c in AUDIO_COLS[:2]:
    if c in base_tracks.columns:
        base_tracks[c] = pd.to_numeric(base_tracks[c], errors="coerce").fillna(0)

base_tracks = base_tracks.dropna(subset=["track_id", "release_month_ts", "popularity"]).drop_duplicates("track_id")

# ---------------- TRACK-ARTIST MAPPING ----------------
if "artist_id" in track_df.columns:
    ta = track_df[[TRACK_ID_COL, "artist_id"]][[TRACK_ID_COL, "artist_id"]].copy()
    ta.columns = ["track_id", "artist_id"]
elif "artist_ids" in track_df.columns:
    ta = track_df[[TRACK_ID_COL, "artist_ids"]].copy()
    ta.columns = ["track_id", "artist_ids"]
    ta["track_id"] = ta["track_id"].astype(str)
    ta["artist_ids"] = ta["artist_ids"].apply(ensure_list_fast)
    ta = ta.explode("artist_ids").rename(columns={"artist_ids": "artist_id"})
else:
    raise ValueError("Need 'artist_id' or 'artist_ids'")

ta["track_id"] = ta["track_id"].astype(str)
ta["artist_id"] = ta["artist_id"].astype(str)

# Weights
n_artists = ta.groupby("track_id")["artist_id"].size()
ta["w"] = ta["track_id"].map(1.0 / n_artists)

# Merge
traj_tracks = ta.merge(base_tracks, on="track_id", how="inner")
traj_tracks["w_pop"] = traj_tracks["w"] * traj_tracks["popularity"]
traj_tracks["w_hit"] = traj_tracks["w"] * traj_tracks["hit"]
traj_tracks["artist_id"] = traj_tracks["artist_id"].astype("category")

# ---------------- MONTHLY AGG ----------------
artist_month = traj_tracks.groupby(["artist_id", "release_month_ts"], sort=False, observed=True).agg({
    "track_id": "nunique",
    "w": "sum",
    "w_pop": "sum",
    "popularity": "max",
    "w_hit": "sum"
}).reset_index()

artist_month.columns = ["artist_id", "release_month_ts", "n_unique_tracks_month", "n_tracks_month",
                        "pop_sum_month", "pop_max_month", "hit_sum_month"]
artist_month["pop_mean_month"] = artist_month["pop_sum_month"] / artist_month["n_tracks_month"]

artist_month = artist_month.sort_values(["artist_id", "release_month_ts"]).reset_index(drop=True)

# ---------------- ROLLING FEATURES ----------------
g = artist_month.groupby("artist_id", sort=False, observed=True)

artist_month["past_tracks"] = g["n_tracks_month"].transform(lambda s: s.rolling(PAST_M, min_periods=1).sum())
artist_month["past_unique_tracks"] = g["n_unique_tracks_month"].transform(
    lambda s: s.rolling(PAST_M, min_periods=1).sum())
artist_month["past_pop_mean"] = g["pop_mean_month"].transform(lambda s: s.rolling(PAST_M, min_periods=1).mean())
artist_month["past_hit_sum"] = g["hit_sum_month"].transform(lambda s: s.rolling(PAST_M, min_periods=1).sum())

# Future (shift method)
artist_month["future_pop_mean"] = g["pop_mean_month"].transform(
    lambda s: s.shift(-FUTURE_M).rolling(FUTURE_M, min_periods=1).mean())
artist_month["future_tracks"] = g["n_tracks_month"].transform(
    lambda s: s.shift(-FUTURE_M).rolling(FUTURE_M, min_periods=1).sum())

# ---------------- TARGETS ----------------
artist_panel = artist_month[
    (artist_month["past_unique_tracks"] >= MIN_PAST_TRACKS) &
    artist_month["future_tracks"].notna() & (artist_month["future_tracks"] > 0)
    ].copy().reset_index(drop=True)

artist_panel["y_growth"] = artist_panel["future_pop_mean"] - artist_panel["past_pop_mean"]
artist_panel["year"] = artist_panel["release_month_ts"].dt.year
artist_panel["y_breakout"] = artist_panel.groupby("year")["y_growth"].transform(
    lambda x: (x >= x.quantile(BREAKOUT_Q)).astype("int8")
)

y_artist_growth = artist_panel["y_growth"].astype("float64")
y_artist_breakout = artist_panel["y_breakout"].astype("int8")

# ---------------- DEBUG ----------------
print(" TRACK TARGETS:")
print(f"  y_success_pct: {y_success_pct.shape} | miss: {y_success_pct.isna().mean():.1%}")
print(f"  y_success_residual: {y_success_residual.shape} | miss: {y_success_residual.isna().mean():.1%}")
print(f"  y_hit rate: {y_hit.mean():.1%}")
print(f"  Y_mood: {Y_mood.shape}")

print("\n ARTIST PANEL:")
print(f"  artist_panel: {artist_panel.shape}")
print(f"  y_growth: {y_artist_growth.mean():.2f} ± {y_artist_growth.std():.2f}")
print(f"  y_breakout rate: {y_artist_breakout.mean():.1%}")
print(" READY!")


 TRACK TARGETS:
  y_success_pct: (300000,) | miss: 1.8%
  y_success_residual: (300000,) | miss: 1.8%
  y_hit rate: 20.6%
  Y_mood: (300000, 7)

 ARTIST PANEL:
  artist_panel: (969, 17)
  y_growth: 0.23 ± 3.14
  y_breakout rate: 19.5%
 READY!


## Genre Multi-Hot (Top-K) für Track / Album / Artist

Genres liegen als **Listen** vor
(z. B. `track_genres = [genre_id1, genre_id2, ...]`).

Da die meisten ML-Modelle **feste numerische Vektoren** benötigen, gehen wir wie folgt vor:

1. **Top-K Genres bestimmen**
   – die K häufigsten Genres aus `track_df`

2. **Multi-Hot Encoding erzeugen**
   – für jedes Top-K-Genre eine 0/1-Spalte
   – 1 = Genre vorhanden, 0 = Genre nicht vorhanden

### Warum Top-K?

- Der komplette Genre-Raum ist sehr groß
- Top-K hält die Feature-Dimension **überschaubar**
- Vermeidet extrem sparse Feature-Matrizen
- Seltene Genres werden implizit als **„Other“** behandelt
  (alle 0 in den Top-K-Spalten)



In [69]:
top_genres = top_k_list_counts(track_df["track_genres"],
                               top_k=TOP_K_GENRES) if "track_genres" in track_df.columns else []

track_genre_mh = (
    genres_to_multihot(track_df, "track_genres", top_genres, prefix="track_")
    if top_genres else pd.DataFrame(index=track_df.index)
)
album_genre_mh = (
    genres_to_multihot(album_df, "album_genres", top_genres, prefix="album_")
    if (top_genres and "album_genres" in album_df.columns) else pd.DataFrame(index=album_df.index)
)
artist_genre_mh = (
    genres_to_multihot(artist_df, "artist_genres", top_genres, prefix="artist_")
    if (top_genres and "artist_genres" in artist_df.columns) else pd.DataFrame(index=artist_df.index)
)

print("Genre multi-hot shapes:", track_genre_mh.shape, album_genre_mh.shape, artist_genre_mh.shape)


Genre multi-hot shapes: (300000, 0) (195938, 0) (187440, 0)


## Feature Selection & Leakage Guards (Track + Artist Panel)

In diesem Schritt entscheiden wir **welche Spalten als Model-Inputs erlaubt sind**.
Wir trennen bewusst:

- **`track_df`**: reichhaltige, denormalisierte Tabelle (enthält auch IDs/URLs/Debug-Spalten)
- **`X_*`**: saubere Feature-Matrizen für ML (nur erlaubte numerische / kategorische Features + Genre-Vektoren)
- **`artist_panel`**: separate Artist-Month Panel-Tabelle für das Trajectory-Modell (eigene Features + Targets)

### Prinzipien

1. **No-ID / No-URL policy**
   - Spalten wie `track_id`, `album_id`, `artist_id`, `audio_feature_id`, URLs/URIs dürfen **niemals** in `X` landen.
   - Sonst drohen Memorization und „fake“ Performance.

2. **Task-spezifische Leakage-Regeln**
   - Für Track-Targets gilt:
     - **Nie**: `popularity` in Features, sobald Popularity selbst (direkt/indirekt) ein Target definiert
       (A/B/C basieren alle auf Popularity).
   - Für Artist-Trajectory gilt:
     - **Keine Zukunftsinfos**: Features nur aus Vergangenheit (Rolling Window), Targets aus Zukunft.

3. **Reproduzierbares Feature-Schema**
   - Wir definieren Feature-Gruppen (numeric/categorical/genres) und erzeugen daraus konsistente `X_*` Matrizen,
     die später auch im Inference-Notebook wiederverwendet werden können.


#### Track Feature Selection

In [70]:
# ------------------------------------------------------------
# Global guardrails: never allow these as model inputs
# ------------------------------------------------------------
NEVER_FEATURE_COLS = {
    # IDs
    "track_id", "album_id", "artist_id", "audio_feature_id",
    "id",  # sometimes merged tables have generic id col
    # URLs / URIs
    "analysis_url", "preview_url", "href", "uri", "spotify_url",
    # raw list columns (use multi-hot instead)
    "artist_ids", "track_genres", "album_genres",
    # names / free text (optional: allow engineered text only)
    "name",
    # dates in raw form (use derived time features instead)
     "release_date",
    # targets / target sources (avoid leakage by default)
    "popularity",
    "success_pct_in_cohort",
    "success_residual_in_cohort",
}

# ------------------------------------------------------------
# Choose duration column safely (optional)
# ------------------------------------------------------------
duration_feature = None
if "dur_col" in globals() and isinstance(dur_col, str) and dur_col.strip() and (dur_col in track_df.columns):
    duration_feature = dur_col
elif "duration_ms" in track_df.columns:
    duration_feature = "duration_ms"

# ------------------------------------------------------------
# Audio feature columns (policy-driven)
# ------------------------------------------------------------
# POLICY_AUDIO should contain main audio features you want (e.g. acousticness, energy, danceability...)
# fallback if it doesn't exist

POLICY_AUDIO = [
    "acousticness", "danceability", "energy", "instrumentalness", "liveness",
    "speechiness", "valence", "loudness", "tempo"
]

if "POLICY_AUDIO" in globals():
    track_audio_main = [c for c in POLICY_AUDIO if c in track_df.columns]
else:
    track_audio_main = [c for c in [
        "acousticness", "danceability", "energy", "instrumentalness",
        "liveness", "loudness", "speechiness", "tempo", "valence"
    ] if c in track_df.columns]

track_audio_extra = [c for c in ["key", "mode", "time_signature"] if c in track_df.columns]

# ------------------------------------------------------------
# Base numeric features (SAFE for your current track models)
# NOTE:
# - we exclude popularity + any derived targets by NEVER_FEATURE_COLS
# - we exclude artist popularity/followers proxies by default (they can make models trivial)
# ------------------------------------------------------------
TRACK_NUMERIC = [
                    # basic track structure
                    "disc_number", "track_number",
                    *([duration_feature] if duration_feature else []),
                    "log_duration",
                    "has_preview",
                    "has_audio_features",
                    # release time
                    "release_year", "release_month", "release_decade",
                    # collaboration / metadata
                    "n_artists",
                ] + track_audio_main + track_audio_extra

# Optional engineered text (safe)
if "ALLOW_TEXT_FEATURES" in globals() and ALLOW_TEXT_FEATURES:
    TRACK_NUMERIC += [c for c in ["name_len", "name_words"] if c in track_df.columns]

# ------------------------------------------------------------
# Optional reach proxies (OFF by default: makes evaluation more realistic)
# ------------------------------------------------------------
REACH_PROXY_COLS = [
    "artist_popularity_mean", "artist_popularity_max",
    "artist_followers_mean", "artist_followers_max",
    "log_artist_followers_mean", "log_artist_followers_max",
    "album_popularity",  # proxy leak for popularity-related models
]

ALLOW_REACH_PROXIES = bool(globals().get("ALLOW_LEAKY_FEATURES", False))
if ALLOW_REACH_PROXIES:
    TRACK_NUMERIC += [c for c in REACH_PROXY_COLS if c in track_df.columns]

# ------------------------------------------------------------
# Categoricals
# ------------------------------------------------------------
TRACK_CATEGORICAL = [c for c in ["album_type"] if c in track_df.columns]

# ------------------------------------------------------------
# Apply final guards
# ------------------------------------------------------------
TRACK_NUMERIC = [
    c for c in TRACK_NUMERIC
    if c in track_df.columns and c not in NEVER_FEATURE_COLS
]
TRACK_CATEGORICAL = [
    c for c in TRACK_CATEGORICAL
    if c in track_df.columns and c not in NEVER_FEATURE_COLS
]

# Build base matrix
X_track_base = track_df[TRACK_NUMERIC + TRACK_CATEGORICAL].copy()

# Append genre multi-hot if provided
if "track_genre_mh" in globals() and isinstance(track_genre_mh, pd.DataFrame):
    X_track = pd.concat(
        [X_track_base.reset_index(drop=True), track_genre_mh.reset_index(drop=True)],
        axis=1
    )
else:
    X_track = X_track_base

print("X_track shape:", X_track.shape)
print("Numeric cols:", len(TRACK_NUMERIC), "| Categorical cols:", len(TRACK_CATEGORICAL))
print("Has genre multi-hot:",
      "track_genre_mh" in globals() and isinstance(globals().get("track_genre_mh"), pd.DataFrame))
print("Reach proxies enabled:", ALLOW_REACH_PROXIES)


X_track shape: (300000, 25)
Numeric cols: 24 | Categorical cols: 1
Has genre multi-hot: True
Reach proxies enabled: False


## Speicher von Datasets

In [71]:
# ------------------------------------------------------------
# Export: Datasets für Notebook 05 (Training)
# ------------------------------------------------------------
# Ziel:
# - Alle X/y Artefakte einmalig und konsistent speichern
# - Keine Duplikate / keine widersprüchlichen Dateinamen
# - Robust gegenüber Ausführungsreihenfolge (optional vorhandene Objekte)
# ------------------------------------------------------------

out_dir = PATHS.input_targets_path
out_dir.mkdir(parents=True, exist_ok=True)

# -------------------------
# Track-level: Basismatrix (optional, aber oft praktisch)
# -------------------------
X_track.to_parquet(out_dir / "X_track_full.parquet", index=False)

# -------------------------
# (A) Success Percentile within Cohort
# -------------------------

pd.Series(y_success_pct).to_frame().to_parquet(
    out_dir / "y_success_pct.parquet", index=False
)

# -------------------------
# (B) Success Residual within Cohort
# -------------------------

pd.Series(y_success_residual).to_frame().to_parquet(
    out_dir / "y_success_residual.parquet", index=False
)

# -------------------------
# (C) Hit Prediction
# -------------------------
pd.Series(y_hit).to_frame().to_parquet(
    out_dir / "y_hit.parquet", index=False
)


# -------------------------
# (E) Mood Multi-Label
# -------------------------
Y_mood.to_parquet(out_dir / "Y_mood.parquet", index=False)

# -------------------------
# (F) Artist Trajectory / Panel
# -------------------------

artist_panel.to_parquet(out_dir / "artist_panel.parquet", index=False)


track_genre_mh.to_parquet(out_dir / "track_genre_multihot.parquet", index=False)
artist_genre_mh.to_parquet(out_dir / "artist_genre_multihot.parquet", index=False)
album_genre_mh.to_parquet(out_dir / "album_genre_multihot.parquet", index=False)

track_df.to_parquet(out_dir / "track_dataset_full.parquet", index=False)
artist_df.to_parquet(out_dir / "artist_dataset_full.parquet", index=False)
album_df.to_parquet(out_dir / "album_dataset_full.parquet", index=False)

print("Datasets gespeichert in:", out_dir.resolve())
print("Files:")
for p in sorted(out_dir.glob("*.parquet")):
    print(" -", p.name)



Datasets gespeichert in: C:\GitHub\uni-project-metrics-and-data\data\baseline_models_datasets\slice_001
Files:
 - album_dataset_full.parquet
 - album_genre_multihot.parquet
 - artist_dataset_full.parquet
 - artist_genre_multihot.parquet
 - artist_panel.parquet
 - track_dataset_full.parquet
 - track_genre_multihot.parquet
 - X_track_full.parquet
 - y_hit.parquet
 - Y_mood.parquet
 - y_success_pct.parquet
 - y_success_residual.parquet
