# Music Genre Discovery

In [1]:
# ===============================================================
#  PREPROCESSING NOTEBOOK FOR:
#     - GTZAN
#     - FMA SMALL  (SEPARATE)
#     - FMA MEDIUM (SEPARATE)
#     - INSTRUMENTAL (UNLABELLED)
#     - INDIAN MUSIC (NEW)
#     - LUDWIG MUSIC DATASET
# ===============================================================

# -------------------------------
# 0) FIX LIBROSA / NUMPY COMPAT
# -------------------------------
import numpy as np

# Compatibility shim (see the section header: librosa / NumPy compat).
# When this NumPy build no longer exposes the `np.complex` / `np.float`
# aliases, point them back at the Python builtins so code that still
# references them keeps working. Aliases that already exist are untouched.
for _alias, _builtin in (("complex", complex), ("float", float)):
    if not hasattr(np, _alias):
        setattr(np, _alias, _builtin)
del _alias, _builtin

In [2]:
# -------------------------------
# 1) IMPORTS
# -------------------------------
import os
import glob
import warnings
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import librosa

warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# -------------------------------
# 2) PATHS
# -------------------------------
# Project root containing `raw-data/` (inputs) and `data/` (outputs).
# Set the MUSIC_GENRE_PROJECT_ROOT environment variable to run this notebook
# on another machine; the default below resolves to the original local paths.
project_root = os.environ.get(
    "MUSIC_GENRE_PROJECT_ROOT",
    "/home/anirudh-sharma/Desktop/music-genere-presentation",
)

# Datasets that are currently disabled keep their path as a comment so the
# matching (also commented-out) processing calls can be re-enabled together.
# gtzan_root = "/home/anirudh-sharma/Desktop/music-genere-presentation/raw-data/gtzan/genres"

# fma_small_root = "/kaggle/input/fma-free-music-archive-small-medium/fma_small/fma_small"
fma_medium_root = os.path.join(project_root, "raw-data", "fma_medium")
fma_meta_csv = os.path.join(project_root, "raw-data", "fma_metadata", "tracks.csv")

# instrumental_root = "/kaggle/input/instrumental-music/Beats"

# indian_root = "/kaggle/input/indian-music-dataset/Indian"

# ludwig_root = "/kaggle/input/ludwig-music-dataset-moods-and-subger/mp3/mp3"

# OUTPUT PATH (created on first run; feature CSVs are written here)
out_dir = os.path.join(project_root, "data")
os.makedirs(out_dir, exist_ok=True)

# Upper bound on files processed per dataset; None processes every file.
MAX_FILES = None

In [4]:
# -------------------------------
# 3) FEATURE EXTRACTION FUNCTION
# -------------------------------
def extract_features(path, sr=22050, n_mfcc=20, duration=None):
    """Summarise one audio file as a flat dict of scalar features.

    The file is loaded with librosa, trimmed with ``librosa.effects.trim``,
    and reduced to per-track statistics.

    Parameters
    ----------
    path : str
        Audio file to load; echoed back under ``"file_path"``.
    sr : int, default 22050
        Sample rate passed to ``librosa.load``.
    n_mfcc : int, default 20
        Number of MFCC coefficients to compute.
    duration : float or None
        Passed to ``librosa.load``; ``None`` loads the whole file.

    Returns
    -------
    dict
        ``file_path``, ``duration`` (of the trimmed signal), ``sr``,
        ``spec_centroid_mean``, ``spec_rolloff_mean``, ``zcr_mean``,
        ``rms_mean``, ``tempo`` (NaN when beat tracking fails), plus
        ``mfcc{i}_mean`` / ``mfcc{i}_std`` and ``chroma{i}_mean`` /
        ``chroma{i}_std`` with ``i`` starting at 1.

    Raises
    ------
    ValueError
        If no samples remain after loading and trimming.
    Exception
        Anything raised by librosa while loading or analysing the file
        (beat-tracking errors excepted, which yield ``tempo = NaN``).
    """
    y, sr = librosa.load(path, sr=sr, duration=duration)
    y, _ = librosa.effects.trim(y)
    if np.size(y) == 0:
        # Fail with a clear message instead of an obscure error further down.
        raise ValueError(f"no audio samples left after loading/trimming: {path}")
    duration_s = librosa.get_duration(y=y, sr=sr)

    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    mfcc_mean = mfcc.mean(axis=1)
    mfcc_std = mfcc.std(axis=1)

    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    chroma_mean = chroma.mean(axis=1)
    chroma_std = chroma.std(axis=1)

    spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr).mean()
    spec_roll = librosa.feature.spectral_rolloff(y=y, sr=sr).mean()
    zcr = librosa.feature.zero_crossing_rate(y).mean()
    rms = librosa.feature.rms(y=y).mean()

    # Tempo is best-effort. Catch only Exception so that Ctrl-C
    # (KeyboardInterrupt) still stops a long extraction run.
    try:
        tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    except Exception:
        tempo = np.nan

    # The tempo may come back as a scalar or as an array (e.g. 1-element);
    # normalise to a plain float without relying on implicit array->float.
    tempo_values = np.atleast_1d(np.asarray(tempo, dtype=float)).ravel()
    tempo_bpm = float(tempo_values[0]) if tempo_values.size else float("nan")

    feats = {
        "file_path": path,
        "duration": duration_s,
        "sr": sr,
        "spec_centroid_mean": float(spec_cent),
        "spec_rolloff_mean": float(spec_roll),
        "zcr_mean": float(zcr),
        "rms_mean": float(rms),
        "tempo": tempo_bpm,
    }

    for i, (m, s) in enumerate(zip(mfcc_mean, mfcc_std), 1):
        feats[f"mfcc{i}_mean"] = float(m)
        feats[f"mfcc{i}_std"] = float(s)

    for i, (m, s) in enumerate(zip(chroma_mean, chroma_std), 1):
        feats[f"chroma{i}_mean"] = float(m)
        feats[f"chroma{i}_std"] = float(s)

    return feats


In [5]:
# -------------------------------
# 4) GTZAN PROCESSOR
# -------------------------------
def process_gtzan(root, max_files=None):
    """Build a feature table for GTZAN clips stored as ``root/<genre>/<clip>.wav``.

    Args:
        root: Directory whose immediate sub-folders are genre names.
        max_files: Keep only the first ``max_files`` paths (after sorting).
            A falsy value (``None`` or ``0``) keeps every file.

    Returns:
        pandas.DataFrame with one row per successfully processed clip: the
        ``extract_features`` output plus ``dataset``, ``label`` (the parent
        folder name) and ``subset``. Clips that raise are reported on stdout
        and skipped.
    """
    wav_paths = sorted(glob.glob(root + "/*/*.wav"))
    wav_paths = wav_paths[:max_files] if max_files else wav_paths

    records = []
    for wav_path in tqdm(wav_paths, desc="GTZAN"):
        genre = Path(wav_path).parent.name
        try:
            record = extract_features(wav_path)
            record.update(dataset="gtzan", label=genre, subset="gtzan")
            records.append(record)
        except Exception as err:
            print("Error:", wav_path, err)
    return pd.DataFrame(records)

In [6]:
# -------------------------------
# 5) FMA SMALL PROCESSOR
# -------------------------------
def process_fma_small(root, meta_csv=None, max_files=None):
    """Build a feature table for the FMA small subset.

    Every ``*.mp3`` under ``root`` (searched recursively) is processed. When
    the file stem is numeric it is treated as a track id and looked up in the
    metadata to obtain the genre label.

    Args:
        root: Directory containing the mp3 files.
        meta_csv: Optional path to the FMA ``tracks.csv``. It is read with two
            header rows so that columns form ``(group, field)`` pairs, and the
            ``('track', 'genre_top')`` column is used for labels. Assumes track
            ids are in the first column and match the numeric file names.
        max_files: Keep only the first ``max_files`` paths (after sorting).
            A falsy value (``None`` or ``0``) keeps every file.

    Returns:
        pandas.DataFrame with one row per successfully processed track: the
        ``extract_features`` output plus ``dataset`` ("fma"), ``label`` and
        ``subset`` ("fma_small"). The label is "unknown" when metadata is
        unavailable, the track id is absent, or its genre is missing.
        Tracks that raise are reported on stdout and skipped.
    """
    files = sorted(glob.glob(root + "/**/*.mp3", recursive=True))
    if max_files:
        files = files[:max_files]

    # track id -> top-level genre, built once; stays empty without metadata.
    genre_by_track = {}
    if meta_csv and os.path.exists(meta_csv):
        try:
            # header=[0, 1] is required: with a single header row the
            # ('track', 'genre_top') column does not exist and every lookup
            # silently failed, labelling all tracks "unknown".
            meta = pd.read_csv(meta_csv, index_col=0, header=[0, 1], low_memory=False)
            genre_by_track = meta[("track", "genre_top")].dropna().to_dict()
        except Exception as e:
            # Labels are best-effort, but make the failure visible.
            print("Error: could not load genre metadata:", meta_csv, e)
    elif meta_csv:
        print("Warning: metadata file not found:", meta_csv)

    rows = []
    for f in tqdm(files, desc="FMA_SMALL"):
        label = "unknown"
        fname = Path(f).stem
        if fname.isdigit():
            label = genre_by_track.get(int(fname), "unknown")

        try:
            feat = extract_features(f)
            feat["dataset"] = "fma"
            feat["label"] = label
            feat["subset"] = "fma_small"
            rows.append(feat)
        except Exception as e:
            print("Error:", f, e)

    return pd.DataFrame(rows)


In [7]:
# -------------------------------
# 6) FMA MEDIUM PROCESSOR
# -------------------------------
def process_fma_medium(root, meta_csv=None, max_files=None):
    """Build a feature table for the FMA medium subset.

    Every ``*.mp3`` under ``root`` (searched recursively) is processed. When
    the file stem is numeric it is treated as a track id and looked up in the
    metadata to obtain the genre label.

    Args:
        root: Directory containing the mp3 files.
        meta_csv: Optional path to the FMA ``tracks.csv``. It is read with two
            header rows so that columns form ``(group, field)`` pairs, and the
            ``('track', 'genre_top')`` column is used for labels. Assumes track
            ids are in the first column and match the numeric file names.
        max_files: Keep only the first ``max_files`` paths (after sorting).
            A falsy value (``None`` or ``0``) keeps every file.

    Returns:
        pandas.DataFrame with one row per successfully processed track: the
        ``extract_features`` output plus ``dataset`` ("fma"), ``label`` and
        ``subset`` ("fma_medium"). The label is "unknown" when metadata is
        unavailable, the track id is absent, or its genre is missing.
        Tracks that raise are reported on stdout and skipped.
    """
    files = sorted(glob.glob(root + "/**/*.mp3", recursive=True))
    if max_files:
        files = files[:max_files]

    # track id -> top-level genre, built once; stays empty without metadata.
    genre_by_track = {}
    if meta_csv and os.path.exists(meta_csv):
        try:
            # header=[0, 1] is required: with a single header row the
            # ('track', 'genre_top') column does not exist and every lookup
            # silently failed, labelling all tracks "unknown".
            meta = pd.read_csv(meta_csv, index_col=0, header=[0, 1], low_memory=False)
            genre_by_track = meta[("track", "genre_top")].dropna().to_dict()
        except Exception as e:
            # Labels are best-effort, but make the failure visible.
            print("Error: could not load genre metadata:", meta_csv, e)
    elif meta_csv:
        print("Warning: metadata file not found:", meta_csv)

    rows = []
    for f in tqdm(files, desc="FMA_MEDIUM"):
        label = "unknown"
        fname = Path(f).stem
        if fname.isdigit():
            label = genre_by_track.get(int(fname), "unknown")

        try:
            feat = extract_features(f)
            feat["dataset"] = "fma"
            feat["label"] = label
            feat["subset"] = "fma_medium"
            rows.append(feat)
        except Exception as e:
            print("Error:", f, e)

    return pd.DataFrame(rows)

In [8]:
# -------------------------------
# 7) INSTRUMENTAL UNLABELLED
# -------------------------------
def process_instrumental(root, max_files=None):
    """Build a feature table for the unlabelled instrumental collection.

    Searches ``root`` recursively for mp3 / wav / flac / m4a files. Every row
    gets the constant value "instrumental" for ``dataset``, ``label`` and
    ``subset``.

    Args:
        root: Directory to search.
        max_files: Keep only the first ``max_files`` paths (after sorting).
            A falsy value (``None`` or ``0``) keeps every file.

    Returns:
        pandas.DataFrame with one row per successfully processed file; files
        that raise are reported on stdout and skipped.
    """
    patterns = ("*.mp3", "*.wav", "*.flac", "*.m4a")
    audio_paths = sorted(
        match
        for pattern in patterns
        for match in glob.glob(root + "/**/" + pattern, recursive=True)
    )
    audio_paths = audio_paths[:max_files] if max_files else audio_paths

    records = []
    for audio_path in tqdm(audio_paths, desc="INSTRUMENTAL"):
        try:
            record = extract_features(audio_path)
            record.update(
                dataset="instrumental",
                label="instrumental",
                subset="instrumental",
            )
            records.append(record)
        except Exception as err:
            print("Error:", audio_path, err)

    return pd.DataFrame(records)


In [9]:
# -------------------------------
# 8) INDIAN MUSIC PROCESSOR
# -------------------------------
def process_indian(root, max_files=None):
    """
    Build a feature table for the Indian music dataset.

    Labels come from the name of the folder that directly contains each file.
    Expected structure: root/genre_name/*.mp3
    Genres: bollypop, carnatic, ghazal, semiclassical, sufi

    mp3 / wav / flac / m4a files are collected recursively and sorted; a
    truthy ``max_files`` keeps only the first ``max_files`` of them. Files
    that raise during extraction are reported on stdout and skipped.
    Returns a pandas.DataFrame with one row per processed file.
    """
    found = []
    for pattern in ("*.mp3", "*.wav", "*.flac", "*.m4a"):
        found.extend(glob.glob(root + "/**/" + pattern, recursive=True))
    found.sort()

    if max_files:
        found = found[:max_files]

    records = []
    for audio_path in tqdm(found, desc="INDIAN"):
        # The immediate parent directory is the genre label.
        genre = Path(audio_path).parent.name
        try:
            record = extract_features(audio_path)
            record["dataset"] = "indian"
            record["label"] = genre
            record["subset"] = "indian"
            records.append(record)
        except Exception as err:
            print("Error:", audio_path, err)

    return pd.DataFrame(records)

In [10]:
# -------------------------------
# 9) LUDWIG MUSIC DATASET PROCESSOR
# -------------------------------
def process_ludwig(root, max_files=None):
    """
    Build a feature table for the Ludwig music dataset.

    Labels come from the name of the folder that directly contains each file.
    Expected structure: root/genre_name/*.mp3
    Genres: blues, classical, electronic, funk_soul, hip hop, jazz, latin, pop, reggae, rock

    mp3 / wav / flac / m4a files are collected recursively and sorted; a
    truthy ``max_files`` keeps only the first ``max_files`` of them. Files
    that raise during extraction are reported on stdout and skipped.
    Returns a pandas.DataFrame with one row per processed file.
    """
    extensions = ("*.mp3", "*.wav", "*.flac", "*.m4a")
    matches_per_ext = [
        glob.glob(root + "/**/" + ext, recursive=True) for ext in extensions
    ]
    track_paths = sorted(p for matches in matches_per_ext for p in matches)
    if max_files:
        track_paths = track_paths[:max_files]

    table_rows = []
    for track_path in tqdm(track_paths, desc="LUDWIG"):
        # The immediate parent directory is the genre label.
        folder_label = Path(track_path).parent.name
        try:
            features = extract_features(track_path)
            features["dataset"] = "ludwig"
            features["label"] = folder_label
            features["subset"] = "ludwig"
            table_rows.append(features)
        except Exception as err:
            print("Error:", track_path, err)

    return pd.DataFrame(table_rows)

In [None]:
# -------------------------------
# 9.1) CHECK DATASET AVAILABILITY
# -------------------------------
import os

print("=== CHECKING DATASET AVAILABILITY ===\n")

# Display name -> dataset root. Only datasets whose path variable is defined
# in the paths cell are active; the rest stay commented out alongside it.
datasets = {
    # "GTZAN": gtzan_root,
    # "FMA Small": fma_small_root,
    "FMA Medium": fma_medium_root,
    # "Instrumental": instrumental_root,
    # "Indian": indian_root,
    # "Ludwig": ludwig_root
}

# Audio file patterns counted for every dataset (hoisted out of the loop).
exts = ["*.mp3", "*.wav", "*.flac", "*.m4a"]

for name, path in datasets.items():
    if os.path.exists(path):
        # Count files
        file_count = sum(
            len(glob.glob(path + "/**/" + ext, recursive=True)) for ext in exts
        )

        # List subdirectories (genres). Only the directory listing is guarded,
        # and only against filesystem errors, so unrelated bugs and Ctrl-C are
        # no longer swallowed.
        try:
            subdirs = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
        except OSError:
            print(f"✓ {name}: FOUND but cannot list contents")
            print(f"  Path: {path}")
            print(f"  Files: {file_count}")
        else:
            print(f"✓ {name}: FOUND")
            print(f"  Path: {path}")
            print(f"  Files: {file_count}")
            print(f"  Genres/Folders: {len(subdirs)} - {subdirs[:10]}")  # Show first 10
    else:
        print(f"✗ {name}: NOT FOUND")
        print(f"  Expected path: {path}")
    print()

=== CHECKING DATASET AVAILABILITY ===

✓ GTZAN: FOUND
  Path: /home/anirudh-sharma/Desktop/music-genere-presentation/raw-data/gtzan/genres
  Files: 1000
  Genres/Folders: 10 - ['hiphop', 'blues', 'disco', 'country', 'metal', 'jazz', 'classical', 'rock', 'pop', 'reggae']



In [None]:
# -------------------------------
# 10) RUN ALL DATASETS
# -------------------------------
# Disabled datasets: uncomment a block together with its path in the paths cell.
# df_gtzan = process_gtzan(gtzan_root, MAX_FILES)
# df_gtzan.to_csv(out_dir + "/gtzan_features.csv", index=False)
# print(f"Saved GTZAN features: {len(df_gtzan)} tracks")

# df_fma_small = process_fma_small(fma_small_root, fma_meta_csv, MAX_FILES)
# df_fma_small.to_csv(out_dir + "/fma_small_features.csv", index=False)
# print(f"Saved FMA Small features: {len(df_fma_small)} tracks")

# FMA medium: extract features, then persist them as a CSV in out_dir.
df_fma_medium = process_fma_medium(
    root=fma_medium_root,
    meta_csv=fma_meta_csv,
    max_files=MAX_FILES,
)
fma_medium_csv = out_dir + "/fma_medium_features.csv"
df_fma_medium.to_csv(fma_medium_csv, index=False)
n_fma_medium = len(df_fma_medium)
print(f"Saved FMA Medium features: {n_fma_medium} tracks")

# df_inst = process_instrumental(instrumental_root, MAX_FILES)
# df_inst.to_csv(out_dir + "/instrumental_features.csv", index=False)
# print(f"Saved Instrumental features: {len(df_inst)} tracks")

# df_indian = process_indian(indian_root, MAX_FILES)
# df_indian.to_csv(out_dir + "/indian_features.csv", index=False)
# print(f"Saved Indian features: {len(df_indian)} tracks")

# df_ludwig = process_ludwig(ludwig_root, MAX_FILES)
# df_ludwig.to_csv(out_dir + "/ludwig_features.csv", index=False)
# print(f"Saved Ludwig features: {len(df_ludwig)} tracks")

GTZAN:   9%|▉         | 94/1000 [00:25<03:09,  4.79it/s]

In [None]:
# -------------------------------
# 11) PRINT SUMMARY
# -------------------------------
print("\n=== SAVED FILES ===")
# List every CSV currently present in the output directory, alphabetically.
csv_names = sorted(name for name in os.listdir(out_dir) if name.endswith('.csv'))
for csv_name in csv_names:
    print(f"- {csv_name}")


=== SAVED FILES ===
- fma_medium_features.csv
- fma_small_features.csv
- gtzan_features.csv
- indian_features.csv
- instrumental_features.csv
