# 00b — Setup & Data Preparation (FMA Small)

Prepara il dataset FMA Small (8 generi). Usa stessi parametri audio di GTZAN (sr=22050, n_mels=128, durata segmenti ~3s).


In [1]:
# Paths & placeholders
import os, sys, json, pickle, numpy as np, subprocess, shutil, re, zipfile
from pathlib import Path
PROJECT_ROOT = Path(os.getcwd()).resolve().parents[1]
FMA_ROOT = PROJECT_ROOT/'data'/'fma_small'
PROCESSED = PROJECT_ROOT/'data'/'processed_fma'
KAGGLE_DIR = PROJECT_ROOT/'kaggle'
PROCESSED.mkdir(parents=True, exist_ok=True)
print('FMA_ROOT:', FMA_ROOT)
print('PROCESSED:', PROCESSED)

# User options
ONLY_SMALL = True   # True => scarica solo la parte "small" (consigliato per ridurre dimensioni)
PREFER_KAGGLEHUB = False  # Se True, prova KaggleHub prima (potrebbe scaricare molto di più)

# Official mirror URLs (FMA project)
OFFICIAL_SMALL_URL = 'https://os.unil.cloud.switch.ch/fma/fma_small.zip'

# Helpers

def ensure_package(pkg_spec: str):
    try:
        __import__(pkg_spec.split('[')[0])
        return True
    except Exception:
        try:
            print(f'Installing {pkg_spec} ...')
            subprocess.run([sys.executable, '-m', 'pip', 'install', pkg_spec], check=True)
            return True
        except Exception as e:
            print(f'Failed to install {pkg_spec}:', e)
            return False


def setup_kaggle_api_creds():
    kaggle_json = KAGGLE_DIR/'kaggle.json'
    if kaggle_json.exists():
        kaggle_home = Path.home()/'.kaggle'
        kaggle_home.mkdir(exist_ok=True)
        dest = kaggle_home/'kaggle.json'
        if not dest.exists():
            shutil.copy2(kaggle_json, dest)
            dest.chmod(0o600)
        return True
    print('kaggle.json not found in', KAGGLE_DIR)
    return False


def kaggle_api_download_exact_file(dataset_slug: str, filename: str, dl_dir: Path) -> Path | None:
    """Download a specific file using the Kaggle API (Python) to avoid CLI dependency."""
    try:
        if not ensure_package('kaggle'):
            print('Cannot install kaggle package; aborting Kaggle API download.')
            return None
        from kaggle.api.kaggle_api_extended import KaggleApi
        api = KaggleApi()
        api.authenticate()  # relies on ~/.kaggle/kaggle.json
        print(f'Downloading {filename} from {dataset_slug} via Kaggle API ...')
        print(f'Dataset URL: https://www.kaggle.com/datasets/{dataset_slug}')
        api.dataset_download_file(dataset_slug, filename, path=str(dl_dir), force=True)
        return dl_dir/filename
    except Exception as e:
        print(f'Kaggle API download failed for {filename}:', e)
        return None


def http_download(url: str, dest_path: Path) -> bool:
    """Stream download a file over HTTP into dest_path using requests (installed on demand)."""
    if not ensure_package('requests'):
        print('Cannot install requests; aborting HTTP download.')
        return False
    import requests
    try:
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            total = int(r.headers.get('content-length', 0))
            chunk = 1024 * 1024
            written = 0
            with open(dest_path, 'wb') as f:
                for data in r.iter_content(chunk_size=chunk):
                    if data:
                        f.write(data)
                        written += len(data)
                        if total:
                            pct = 100.0 * written / total
                            print(f"Downloading... {written/1e6:.1f}MB/{total/1e6:.1f}MB ({pct:.1f}%)", end='\r')
        print('\nHTTP download complete:', dest_path)
        return True
    except Exception as e:
        print('HTTP download failed:', e)
        return False


def unzip_to_dir(zip_path: Path, out_dir: Path) -> bool:
    try:
        with zipfile.ZipFile(zip_path, 'r') as zf:
            zf.extractall(out_dir)
        print('Unzipped:', zip_path, '->', out_dir)
        return True
    except Exception as e:
        print('Unzip failed:', e)
        return False



FMA_ROOT: /home/alepot55/Desktop/projects/naml_project/data/fma_small
PROCESSED: /home/alepot55/Desktop/projects/naml_project/data/processed_fma


In [2]:
# Download logic (idempotent: riusa cache/zips se presenti e non riscarica)
dl_dir = PROJECT_ROOT/'data'/'fma_download'
dl_dir.mkdir(parents=True, exist_ok=True)

# Consider dataset present only if there are mp3 files available
mp3_present = FMA_ROOT.exists() and any(FMA_ROOT.rglob('*.mp3'))
if not mp3_present:
    used_download = False

    # Reuse previously extracted/cached content in dl_dir if available
    cand_dirs = list(dl_dir.rglob('fma_small'))
    small_dir = cand_dirs[0] if cand_dirs else None
    if small_dir and any(small_dir.rglob('*.mp3')):
        FMA_ROOT.mkdir(parents=True, exist_ok=True)
        for item in small_dir.iterdir():
            target = FMA_ROOT/item.name
            if item.is_dir():
                shutil.copytree(item, target, dirs_exist_ok=True)
            else:
                shutil.copy2(item, target)
        print('FMA small prepared from cache at', small_dir, '->', FMA_ROOT)
        used_download = True

    # Preferred: only download the "small" zip using Kaggle API (size-friendly)
    if not used_download and ONLY_SMALL:
        zip_path = dl_dir/'fma_small.zip'
        if setup_kaggle_api_creds():
            if zip_path.exists():
                print('Found existing zip (Kaggle):', zip_path, '— skip re-download.')
            else:
                # Known archive name in the official dataset
                kz = kaggle_api_download_exact_file('mdeff/fma', 'fma_small.zip', dl_dir)
                if kz and kz.exists():
                    print('Downloaded via Kaggle API:', kz)
                    zip_path = kz
            # Unzip if needed and prepare FMA_ROOT
            if zip_path.exists():
                if not small_dir or not small_dir.exists():
                    if unzip_to_dir(zip_path, dl_dir):
                        cand_dirs = list(dl_dir.rglob('fma_small'))
                        small_dir = cand_dirs[0] if cand_dirs else None
                if small_dir and small_dir.exists():
                    FMA_ROOT.mkdir(parents=True, exist_ok=True)
                    for item in small_dir.iterdir():
                        target = FMA_ROOT/item.name
                        if item.is_dir():
                            shutil.copytree(item, target, dirs_exist_ok=True)
                        else:
                            shutil.copy2(item, target)
                    print('FMA small ready at', FMA_ROOT)
                    used_download = True
                else:
                    print('Unzip complete but fma_small folder not found. Please check archive contents.')
        else:
            print('Kaggle credentials not configured; skipping Kaggle API path.')

    # Official HTTP mirror fallback (no Kaggle account required)
    if not used_download and ONLY_SMALL:
        print('Trying official FMA mirror (HTTP):', OFFICIAL_SMALL_URL)
        local_zip = dl_dir/'fma_small.zip'
        if local_zip.exists():
            print('Found existing zip (HTTP):', local_zip, '— skip re-download.')
        else:
            if not http_download(OFFICIAL_SMALL_URL, local_zip):
                print('HTTP download failed; cannot proceed with mirror path.')
        # Unzip if needed and prepare FMA_ROOT
        if local_zip.exists():
            if not small_dir or not small_dir.exists():
                if unzip_to_dir(local_zip, dl_dir):
                    cand_dirs = list(dl_dir.rglob('fma_small'))
                    small_dir = cand_dirs[0] if cand_dirs else None
            if small_dir and small_dir.exists():
                FMA_ROOT.mkdir(parents=True, exist_ok=True)
                for item in small_dir.iterdir():
                    target = FMA_ROOT/item.name
                    if item.is_dir():
                        shutil.copytree(item, target, dirs_exist_ok=True)
                    else:
                        shutil.copy2(item, target)
                print('FMA small ready at', FMA_ROOT)
                used_download = True
            else:
                print('Unzip complete but fma_small folder not found (HTTP mirror).')

    # Optional KaggleHub path (may download larger content)
    if not used_download and PREFER_KAGGLEHUB:
        if ensure_package('kagglehub[pandas-datasets]'):
            try:
                import kagglehub
                print('Attempting download via KaggleHub (imsparsh/fma-free-music-archive-small-medium) ...')
                base_path = kagglehub.dataset_download('imsparsh/fma-free-music-archive-small-medium')
                base_path = Path(base_path)
                print('KaggleHub local cache:', base_path)
                # Look for fma_small folder or zip inside downloaded dataset
                cand_dirs = list(base_path.rglob('fma_small'))
                small_dir = cand_dirs[0] if cand_dirs else None
                if not small_dir:
                    zips = list(base_path.rglob('*fma_small*.zip'))
                    if zips:
                        dz = zips[0]
                        out_dir = dz.parent
                        print('Unzipping', dz)
                        unzip_to_dir(dz, out_dir)
                        cand_dirs = list(out_dir.rglob('fma_small'))
                        small_dir = cand_dirs[0] if cand_dirs else None
                if small_dir and small_dir.exists():
                    FMA_ROOT.mkdir(parents=True, exist_ok=True)
                    for item in small_dir.iterdir():
                        target = FMA_ROOT/item.name
                        if item.is_dir():
                            shutil.copytree(item, target, dirs_exist_ok=True)
                        else:
                            shutil.copy2(item, target)
                    print('FMA small prepared at', FMA_ROOT)
                    used_download = True
                else:
                    print('KaggleHub: fma_small not found inside dataset cache.')
            except Exception as e:
                print('KaggleHub download attempt failed:', e)

    # Final fallback: skip download and require manual placement (keeps project offline-safe)
    if not used_download:
        print('Automatic download not completed. Please place fma_small manually in', FMA_ROOT)
else:
    print('FMA small already present at', FMA_ROOT, '— skipping download.')

FMA small already present at /home/alepot55/Desktop/projects/naml_project/data/fma_small — skipping download.


> Nota: per mantenere il progetto offline-safe, questo notebook non scarica automaticamente FMA. Posiziona `fma_small` in `data/fma_small`.


In [3]:
# Processing — use FMA metadata (tracks.csv) to assign genre_top labels and avoid stratify errors
import os
import re
import json
import pickle
from pathlib import Path
import numpy as np
import pandas as pd
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tqdm import tqdm
import warnings
from contextlib import contextmanager, redirect_stderr, redirect_stdout

# Try to prefer ffmpeg backend for audioread if available (reduces mpg123 noise)
os.environ.setdefault("AUDIOREAD_PLUGIN", "ffmpeg")
os.environ.setdefault("AUDIOREAD_BACKEND", "ffmpeg")

@contextmanager
def quiet_audioio():
    """Suppress decoder stdout/stderr and warnings (mpg123/ffmpeg chatter)."""
    with open(os.devnull, 'w') as devnull:
        with redirect_stderr(devnull), redirect_stdout(devnull):
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                yield

# Ensure dataset presence
if not FMA_ROOT.exists():
    raise FileNotFoundError(
        'Dataset FMA Small non trovato. Esegui prima la prima cella per scaricare via Kaggle API o mirror, oppure posiziona fma_small in data/fma_small.'
    )

# Ensure metadata (tracks.csv) is available; try to locate or download from official mirror (idempotent)
METADATA_ROOT = PROJECT_ROOT/'data'/'fma_metadata'
TRACKS_CSV = METADATA_ROOT/'tracks.csv'
OFFICIAL_METADATA_URL = 'https://os.unil.cloud.switch.ch/fma/fma_metadata.zip'
dl_dir = PROJECT_ROOT/'data'/'fma_download'
dl_dir.mkdir(parents=True, exist_ok=True)

if not TRACKS_CSV.exists():
    # Try to find tracks.csv under any previously downloaded folder
    cand = list(dl_dir.rglob('tracks.csv')) if dl_dir.exists() else []
    if cand:
        METADATA_ROOT = cand[0].parent
        TRACKS_CSV = cand[0]
    else:
        # Attempt to reuse existing meta zip or download if missing, then unzip
        print('tracks.csv non trovato. Cerco un fma_metadata.zip locale o scarico dal mirror ufficiale...')
        meta_zip = dl_dir/'fma_metadata.zip'
        if meta_zip.exists():
            print('Trovato fma_metadata.zip in cache:', meta_zip, '— skip re-download.')
        else:
            if 'http_download' in globals():
                if not http_download(OFFICIAL_METADATA_URL, meta_zip):
                    print('Download metadati (HTTP) fallito.')
            else:
                print('Helper http_download non disponibile: esegui la prima cella del notebook.')
        # Unzip only if tracks.csv still not present
        if not any(dl_dir.rglob('tracks.csv')):
            if 'unzip_to_dir' in globals() and meta_zip.exists():
                if unzip_to_dir(meta_zip, dl_dir):
                    pass
            else:
                print('Helper unzip_to_dir non disponibile o zip mancante; impossibile estrarre metadati.')
        cand = list(dl_dir.rglob('tracks.csv'))
        if cand:
            METADATA_ROOT = cand[0].parent
            TRACKS_CSV = cand[0]

if not TRACKS_CSV.exists():
    raise FileNotFoundError(
        'tracks.csv non disponibile. Scarica fma_metadata.zip manualmente dal sito ufficiale o esegui la prima cella per abilitare il download.'
    )

print('Usando metadata:', TRACKS_CSV)

# Load metadata; tracks.csv uses MultiIndex columns
tracks = pd.read_csv(TRACKS_CSV, header=[0, 1], index_col=0, low_memory=False)
# genre_top is at ('track', 'genre_top'), index is track_id
if ('track', 'genre_top') not in tracks.columns:
    raise RuntimeError('Colonna (track, genre_top) non trovata in tracks.csv: versione metadati inattesa.')

id_to_genre = tracks[('track', 'genre_top')]

# Gather mp3 files and map to genre_top via track id (filename stem)
mp3_files = sorted([p for p in FMA_ROOT.rglob('*.mp3')])
if len(mp3_files) == 0:
    raise RuntimeError('Nessun file .mp3 trovato in fma_small.')

files, labels = [], []
for p in mp3_files:
    try:
        tid = int(p.stem)
    except Exception:
        # Unexpected filename, skip
        continue
    lab = id_to_genre.get(tid)
    if pd.isna(lab):
        continue
    files.append(str(p))
    labels.append(str(lab))

if len(files) == 0:
    raise RuntimeError('Nessun file etichettato trovato: controlla che tracks.csv corrisponda a fma_small.')

# Filter out rare classes with < 2 samples to satisfy stratify
from collections import Counter
cnt = Counter(labels)
kept_classes = {c for c, n in cnt.items() if n >= 2}
if len(kept_classes) < len(cnt):
    dropped = sorted([c for c, n in cnt.items() if n < 2])
    print(f'Avviso: escludo {len(dropped)} classi con <2 campioni (per stratify):', dropped[:10], '...')

files, labels = zip(*[(f, y) for f, y in zip(files, labels) if y in kept_classes])
files, labels = list(files), list(labels)

unique_classes = sorted(set(labels))
print('Classi (genre_top) rilevate:', unique_classes)
print('Numero campioni totali:', len(labels))

# Determine whether stratify is possible
can_stratify = True
if len(unique_classes) < 2:
    can_stratify = False
else:
    min_count = min(Counter(labels).values())
    if min_count < 2:
        can_stratify = False

strat_arg = labels if can_stratify else None
if not can_stratify:
    print('Stratify disattivato (classi insufficienti o troppo sbilanciate). Userò uno split standard.')

train_val_files, test_files, train_val_labels, test_labels = train_test_split(
    files, labels, test_size=0.2, random_state=42, stratify=strat_arg
)

strat_arg_tv = train_val_labels if can_stratify else None
train_files, val_files, train_labels, val_labels = train_test_split(
    train_val_files, train_val_labels, test_size=0.25, random_state=42, stratify=strat_arg_tv
)

print(f'Train/Val/Test sizes: {len(train_files)}/{len(val_files)}/{len(test_files)}')

# Feature extraction (mirror of GTZAN settings)
def extract(files, labels, sr=22050, n_mels=128, hop_length=512, seg=2.97, n_segments=10):
    X, y = [], []
    seg_len = int(sr * seg)
    errors = 0
    for fp, lab in tqdm(list(zip(files, labels))):
        try:
            with quiet_audioio():
                ysig, _ = librosa.load(fp, sr=sr, mono=True, res_type='kaiser_fast')
            for s in range(n_segments):
                st, en = s * seg_len, s * seg_len + seg_len
                if en <= len(ysig):
                    mel = librosa.feature.melspectrogram(y=ysig[st:en], sr=sr, n_mels=n_mels, hop_length=hop_length)
                    X.append(librosa.power_to_db(mel, ref=np.max))
                    y.append(lab)
        except Exception:
            errors += 1
            continue
    if errors:
        print(f'Avviso: saltati {errors} file per errori di decodifica.')
    return X, y

Xtr_list, ytr_txt = extract(train_files, train_labels)
Xva_list, yva_txt = extract(val_files, val_labels)
Xte_list, yte_txt = extract(test_files, test_labels)

# Unify time dimension to T=128 (pad/crop)
def unify(lst, T=128):
    out = []
    for s in lst:
        if s.shape[1] > T:
            out.append(s[:, :T])
        else:
            out.append(np.pad(s, ((0, 0), (0, T - s.shape[1])), 'constant'))
    return np.array(out)

Xtr, Xva, Xte = unify(Xtr_list), unify(Xva_list), unify(Xte_list)

# Scale features (fit on train only)
scaler = StandardScaler()

def fit_transform_3d(X, fit=False):
    sh = X.shape
    Z = X.reshape(sh[0], -1)
    Z = scaler.fit_transform(Z) if fit else scaler.transform(Z)
    return Z.reshape(sh)

Xtr = fit_transform_3d(Xtr, fit=True)
Xva = fit_transform_3d(Xva)
Xte = fit_transform_3d(Xte)

# Add channel dimension
Xtr = Xtr[..., None]
Xva = Xva[..., None]
Xte = Xte[..., None]

# Encode labels
le = LabelEncoder().fit(unique_classes)
ytr = le.transform(ytr_txt)
yva = le.transform(yva_txt)
yte = le.transform(yte_txt)

# Persist arrays and transformers
np.save(PROCESSED/'X_train.npy', Xtr)
np.save(PROCESSED/'y_train.npy', ytr)
np.save(PROCESSED/'X_val.npy', Xva)
np.save(PROCESSED/'y_val.npy', yva)
np.save(PROCESSED/'X_test.npy', Xte)
np.save(PROCESSED/'y_test.npy', yte)
with open(PROCESSED/'label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)
with open(PROCESSED/'scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

print('Saved processed FMA arrays:', Xtr.shape, Xva.shape, Xte.shape)
print('Class mapping:', dict(zip(le.classes_.tolist(), range(len(le.classes_)))))

Usando metadata: /home/alepot55/Desktop/projects/naml_project/data/fma_download/fma_metadata/tracks.csv
Classi (genre_top) rilevate: ['Electronic', 'Experimental', 'Folk', 'Hip-Hop', 'Instrumental', 'International', 'Pop', 'Rock']
Numero campioni totali: 8000
Train/Val/Test sizes: 4800/1600/1600
Classi (genre_top) rilevate: ['Electronic', 'Experimental', 'Folk', 'Hip-Hop', 'Instrumental', 'International', 'Pop', 'Rock']
Numero campioni totali: 8000
Train/Val/Test sizes: 4800/1600/1600


  7%|▋         | 323/4800 [00:19<05:24, 13.79it/s][src/libmpg123/layer3.c:INT123_do_layer3():1878] error: part2_3_length (3328) too large for available bit count (3240)
[src/libmpg123/layer3.c:INT123_do_layer3():1878] error: part2_3_length (3328) too large for available bit count (3240)
  9%|▉         | 423/4800 [00:24<03:10, 22.95it/s][src/libmpg123/layer3.c:INT123_do_layer3():1948] error: dequantization failed!
[src/libmpg123/layer3.c:INT123_do_layer3():1948] error: dequantization failed!
 37%|███▋      | 1797/4800 [01:32<02:21, 21.23it/s][src/libmpg123/layer3.c:INT123_do_layer3():1908] error: dequantization failed!
[src/libmpg123/layer3.c:INT123_do_layer3():1908] error: dequantization failed!
 76%|███████▌  | 3640/4800 [03:12<00:47, 24.36it/s][src/libmpg123/layer3.c:INT123_do_layer3():1878] error: part2_3_length (3360) too large for available bit count (3240)
[src/libmpg123/layer3.c:INT123_do_layer3():1878] error: part2_3_length (3360) too large for available bit count (3240)
 84%|█

Avviso: saltati 4794 file per errori di decodifica.


 70%|███████   | 1123/1600 [00:59<00:20, 23.55it/s][src/libmpg123/layer3.c:INT123_do_layer3():1908] error: dequantization failed!
 70%|███████   | 1126/1600 [01:00<00:19, 24.70it/s][src/libmpg123/layer3.c:INT123_do_layer3():1908] error: dequantization failed!
 83%|████████▎ | 1329/1600 [01:08<00:11, 23.74it/s][src/libmpg123/layer3.c:INT123_do_layer3():1908] error: dequantization failed!
Note: Illegal Audio-MPEG-Header 0x00000000 at offset 63168.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1389] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).
[src/libmpg123/layer3.c:INT123_do_layer3():1908] error: dequantization failed!
Note: Illegal Audio-MPEG-Header 0x00000000 at offset 63168.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1389] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync li

Avviso: saltati 1597 file per errori di decodifica.


 15%|█▍        | 233/1600 [00:12<00:57, 23.79it/s]Note: Illegal Audio-MPEG-Header 0x00000000 at offset 33361.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1389] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).
Note: Illegal Audio-MPEG-Header 0x00000000 at offset 33361.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1389] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).
100%|██████████| 1600/1600 [01:19<00:00, 20.06it/s]

Avviso: saltati 1595 file per errori di decodifica.
Saved processed FMA arrays: (60, 128, 128, 1) (30, 128, 128, 1) (50, 128, 128, 1)
Class mapping: {'Electronic': 0, 'Experimental': 1, 'Folk': 2, 'Hip-Hop': 3, 'Instrumental': 4, 'International': 5, 'Pop': 6, 'Rock': 7}





In [4]:
# Opzionale: anteprima metadati locale (tracks.csv) ed esempio KaggleHub disabilitato per evitare 404/deprecations
import pandas as pd
from pathlib import Path
import os
import sys

# Resolve PROJECT_ROOT if missing (in case cell 1 wasn't run)
try:
    PROJECT_ROOT
except NameError:
    from pathlib import Path
    PROJECT_ROOT = Path(os.getcwd()).resolve().parents[1]

# Try to use TRACKS_CSV from previous cell; otherwise, fall back to default paths/search
try:
    TRACKS_CSV
except NameError:
    METADATA_ROOT = PROJECT_ROOT/'data'/'fma_metadata'
    TRACKS_CSV = METADATA_ROOT/'tracks.csv'
    if not TRACKS_CSV.exists():
        dl_dir = PROJECT_ROOT/'data'/'fma_download'
        cand = list(dl_dir.rglob('tracks.csv')) if dl_dir.exists() else []
        if cand:
            TRACKS_CSV = cand[0]

print('tracks.csv path candidate:', TRACKS_CSV)
if Path(TRACKS_CSV).exists():
    df_tracks = pd.read_csv(TRACKS_CSV, header=[0,1], index_col=0, low_memory=False)
    print('tracks.csv loaded. Shape:', df_tracks.shape)
    display(df_tracks.head())
else:
    print('tracks.csv non trovato. Esegui le prime celle per scaricare i metadati, oppure posiziona fma_metadata/ in data/.')

# Facoltativo: esempio KaggleHub (disabilitato di default)
USE_KAGGLEHUB = False  # imposta a True solo se sai esattamente cosa stai caricando
DATASET_ID = "mdeff/fma"   # dataset ufficiale; i CSV sono in fma_metadata.zip, non direttamente accessibili via Pandas adapter
FILE_PATH = "tracks.csv"   # nome del file da leggere (se disponibile via adapter)

if USE_KAGGLEHUB:
    try:
        import kagglehub
        # Nota: kagglehub.load_dataset (Pandas adapter) è deprecato e può dare 404 se il file non è esposto dal dataset
        # Percorso consigliato: scaricare l'archivio con kagglehub.dataset_download e poi leggere localmente
        base_path = kagglehub.dataset_download(DATASET_ID)
        base_path = Path(base_path)
        print('KaggleHub cache:', base_path)
        # Se presente, prova a trovare tracks.csv o estrarre fma_metadata.zip
        cand = list(base_path.rglob('tracks.csv'))
        if not cand:
            zips = list(base_path.rglob('fma_metadata*.zip'))
            if zips:
                import zipfile
                with zipfile.ZipFile(zips[0], 'r') as zf:
                    zf.extractall(zips[0].parent)
                cand = list(base_path.rglob('tracks.csv'))
        if cand:
            df_kh = pd.read_csv(cand[0], header=[0,1], index_col=0, low_memory=False)
            print('KaggleHub tracks.csv loaded. Shape:', df_kh.shape)
            display(df_kh.head())
        else:
            print('KaggleHub: tracks.csv non trovato nel dataset scaricato.')
    except Exception as e:
        print('KaggleHub preview failed:', e)

tracks.csv path candidate: /home/alepot55/Desktop/projects/naml_project/data/fma_download/fma_metadata/tracks.csv
tracks.csv loaded. Shape: (106574, 52)


Unnamed: 0_level_0,album,album,album,album,album,album,album,album,album,album,...,track,track,track,track,track,track,track,track,track,track
Unnamed: 0_level_1,comments,date_created,date_released,engineer,favorites,id,information,listens,producer,tags,...,information,interest,language_code,license,listens,lyricist,number,publisher,tags,title
track_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,[],...,,4656,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1293,,3,,[],Food
3,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,[],...,,1470,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,514,,4,,[],Electric Ave
5,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,[],...,,1933,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1151,,6,,[],This World
10,0,2008-11-26 01:45:08,2008-02-06 00:00:00,,4,6,,47632,,[],...,,54881,en,Attribution-NonCommercial-NoDerivatives (aka M...,50135,,1,,[],Freeway
20,0,2008-11-26 01:45:05,2009-01-06 00:00:00,,2,4,"<p> ""spiritual songs"" from Nicky Cook</p>",2710,,[],...,,978,en,Attribution-NonCommercial-NoDerivatives (aka M...,361,,3,,[],Spiritual Level
