# Segment audio and extract MFCCs
This notebook reads the split CSVs, segments each track, and saves MFCC features per segment.


In [3]:
from pathlib import Path
import numpy as np
import pandas as pd
import librosa
from sklearn.preprocessing import LabelEncoder

# Input directory holding the split CSVs and output directory for the saved
# arrays, both relative to the notebook's working directory.
SPLIT_DIR = Path('..') / 'data' / 'splits'
OUT_DIR = Path('..') / 'data' / 'processed'
# Create the output directory up front so the later np.save calls cannot fail
# on a missing folder; a pre-existing directory is fine.
OUT_DIR.mkdir(parents=True, exist_ok=True)

SR = 22050          # sample rate every track is loaded at (passed to librosa.load)
SEG_SECONDS = 5.0   # length of one segment, in seconds
HOP_SECONDS = 2.5   # offset between consecutive segment starts (half a segment, so 50% overlap)
N_MFCC = 20         # number of MFCC coefficients per frame
N_FFT = 2048        # FFT window size passed to librosa.feature.mfcc
HOP_LENGTH = 512    # hop between MFCC frames, in samples

# Fixed number of MFCC frames kept per segment; mfcc_segment pads or truncates
# to this width so every feature matrix has the same shape.
MAX_FRAMES = librosa.time_to_frames(SEG_SECONDS, sr=SR, hop_length=HOP_LENGTH)

def segment_audio(y, sr, seg_seconds, hop_seconds):
    """Cut a waveform into fixed-length, possibly overlapping windows.

    Args:
        y: 1-D sequence of audio samples.
        sr: Sample rate of ``y`` in samples per second.
        seg_seconds: Length of each window, in seconds.
        hop_seconds: Offset between the starts of consecutive windows, in
            seconds.

    Returns:
        A list of windows, each ``int(seg_seconds * sr)`` samples long. When
        ``y`` is shorter than one window, the list holds a single copy of
        ``y`` zero-padded at the end to the window length. Otherwise only
        complete windows are returned; trailing samples that do not fill a
        whole window are dropped.
    """
    window = int(seg_seconds * sr)
    stride = int(hop_seconds * sr)
    total = len(y)

    # Too short for even one window: pad the tail instead of discarding the clip.
    if total < window:
        return [np.pad(y, (0, window - total))]

    # Last admissible start index is the one whose window ends exactly at the
    # final sample.
    last_start = total - window
    return [y[begin:begin + window] for begin in range(0, last_start + 1, stride)]

def mfcc_segment(y, sr):
    """Compute a fixed-width MFCC matrix for one audio segment.

    The MFCCs are computed with the module-level ``N_MFCC``, ``N_FFT`` and
    ``HOP_LENGTH`` settings, then the frame axis (axis 1) is forced to exactly
    ``MAX_FRAMES`` columns: shorter results are zero-padded on the right,
    longer ones are truncated.

    Args:
        y: Audio samples of the segment.
        sr: Sample rate of ``y``.

    Returns:
        The MFCC array cast to ``float32`` with ``MAX_FRAMES`` columns.
    """
    features = librosa.feature.mfcc(
        y=y,
        sr=sr,
        n_mfcc=N_MFCC,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
    )
    missing = MAX_FRAMES - features.shape[1]
    if missing > 0:
        # Too few frames: append zero columns so all segments share one width.
        features = np.pad(features, ((0, 0), (0, missing)), mode='constant')
    else:
        # Enough (or too many) frames: keep only the first MAX_FRAMES.
        features = features[:, :MAX_FRAMES]
    return features.astype(np.float32)

def build_features(split_csv, encoder=None):
    """Build per-segment MFCC features and encoded labels for one split CSV.

    Every row of the CSV must provide ``path``, ``label`` and ``track_id``
    columns. Each readable track is loaded at ``SR``, cut into segments by
    ``segment_audio`` and converted with ``mfcc_segment``; the track's label
    and id are repeated once per segment so all outputs stay aligned.

    Args:
        split_csv: Path to the split CSV file.
        encoder: An already-fitted label encoder to reuse (e.g. the one
            returned for the training split). When ``None``, a new
            ``LabelEncoder`` is fitted on this split's labels.

    Returns:
        A tuple ``(X, y_enc, track_ids, encoder)`` where ``X`` is the stacked
        per-segment feature array, ``y_enc`` the encoded label per segment,
        ``track_ids`` a list with the source track id per segment, and
        ``encoder`` the label encoder that produced ``y_enc``.

    Raises:
        ValueError: If no segment could be extracted (empty CSV or every file
            unreadable), or if ``encoder`` meets a label it was not fitted on.
    """
    df = pd.read_csv(split_csv)
    features, labels, track_ids = [], [], []
    for _, row in df.iterrows():
        try:
            audio, _sr = librosa.load(row['path'], sr=SR)
        except Exception as exc:
            # Deliberate best effort: one corrupt file must not abort the whole
            # split. Include the exception type because some decoder errors
            # carry an empty message, which made the old log line useless.
            print(
                f"Skipping unreadable file: {row['path']} "
                f"({type(exc).__name__}: {exc})"
            )
            continue
        segments = segment_audio(audio, SR, SEG_SECONDS, HOP_SECONDS)
        features.extend(mfcc_segment(seg, SR) for seg in segments)
        # Repeat the track-level metadata once per segment.
        labels.extend([row['label']] * len(segments))
        track_ids.extend([row['track_id']] * len(segments))

    if not features:
        # np.stack([]) would raise an opaque "need at least one array to
        # stack"; fail with a message that names the offending split instead.
        raise ValueError(
            f"No features extracted from {split_csv}: "
            "the split is empty or every file was unreadable"
        )

    X = np.stack(features)
    y = np.array(labels)
    if encoder is None:
        encoder = LabelEncoder()
        y_enc = encoder.fit_transform(y)
    else:
        y_enc = encoder.transform(y)
    return X, y_enc, track_ids, encoder

# Fit the label encoder on the training split only, then reuse it for the
# validation and test splits so all three share the same class indices.
X_train, y_train, train_ids, encoder = build_features(SPLIT_DIR / 'gtzan_train.csv')
X_val, y_val, val_ids, _ = build_features(SPLIT_DIR / 'gtzan_val.csv', encoder=encoder)
X_test, y_test, test_ids, _ = build_features(SPLIT_DIR / 'gtzan_test.csv', encoder=encoder)

# (file name, array) pairs written to OUT_DIR, in order. Track ids are stored
# as object arrays; the class names let later steps decode the integer labels.
outputs = [
    ('X_train.npy', X_train),
    ('y_train.npy', y_train),
    ('X_val.npy', X_val),
    ('y_val.npy', y_val),
    ('X_test.npy', X_test),
    ('y_test.npy', y_test),
    ('train_ids.npy', np.array(train_ids, dtype=object)),
    ('val_ids.npy', np.array(val_ids, dtype=object)),
    ('test_ids.npy', np.array(test_ids, dtype=object)),
    ('classes.npy', encoder.classes_),
]
for filename, array in outputs:
    np.save(OUT_DIR / filename, array)

print('Saved features to', OUT_DIR)
print('Train shape:', X_train.shape, 'Val shape:', X_val.shape, 'Test shape:', X_test.shape)


  y_audio, _ = librosa.load(row['path'], sr=SR)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Skipping unreadable file: C:\sem 07\sem07 frontend development\e20-co542-classical-music-classification\code\GTZN\Data\genres_original\jazz\jazz.00054.wav ()
Saved features to ..\data\processed
Train shape: (7021, 20, 215) Val shape: (1759, 20, 215) Test shape: (2200, 20, 215)
