In [3]:
import pandas as pd
import os
import numpy as np
import librosa
from tqdm import tqdm

# Directory containing the training audio clips
train_audio_path = '../audio_data/audios_train/'

# Training metadata table; the loop below reads its 'filename' and 'label' columns
train_meta = pd.read_csv('../train.csv')

# Define feature extraction function
def extract_features(file_path, n_mfcc=13):
    """Return the time-averaged MFCC vector for one audio file.

    Args:
        file_path: Path to an audio file that ``librosa.load`` can decode.
        n_mfcc: Number of MFCC coefficients to compute. Defaults to 13,
            the value this script has always used.

    Returns:
        A 1-D NumPy array holding the mean of each MFCC coefficient taken
        over all frames of the clip.

    Raises:
        Whatever ``librosa.load`` / ``librosa.feature.mfcc`` raise for
        unreadable or invalid audio; nothing is caught here.
    """
    # sr=None asks librosa to keep the file's own sampling rate
    signal, sr = librosa.load(file_path, sr=None)
    mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)
    # Transpose so frames lie along axis 0, then average them away
    return np.mean(mfcc.T, axis=0)

# Per-file MFCC vectors and their labels, kept in matching order
features = []
labels = []

# Missing files and files that failed during feature extraction are counted
# separately so the final summary accounts for every skipped row.
missing_files = 0
errored_files = 0
processed_files = 0

# Loop over each row in the CSV
for _, row in tqdm(train_meta.iterrows(), total=len(train_meta)):
    filename = row['filename']
    label = row['label']
    file_path = os.path.join(train_audio_path, filename)

    if not os.path.exists(file_path):
        print(f"Missing file: {file_path}")
        missing_files += 1
        continue

    # Only the extraction call is guarded: one bad file should be reported
    # and skipped, not abort the whole run.
    try:
        mfccs = extract_features(file_path)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        errored_files += 1
        continue

    features.append(mfccs)
    labels.append(label)
    processed_files += 1

print(f"\n✅ Done! Processed {processed_files} files.")
# Previously only missing files were counted here, so extraction failures
# were reported as zero even though the message mentions them.
print(f"❌ Missing or errored files: {missing_files + errored_files}")

# Save only if features were extracted
if features:
    X = np.array(features)
    y = np.array(labels)

    print("X shape:", X.shape)
    print("y shape:", y.shape)

    # np.save does not create parent directories; make sure the target
    # exists so a long extraction run is not lost at the final step.
    os.makedirs('../models', exist_ok=True)
    np.save('../models/X.npy', X)
    np.save('../models/y.npy', y)
else:
    print("No features extracted. Please check your audio files and CSV.")


100%|█| 444/444 [00:46<00:00,  9.61it


✅ Done! Processed 444 files.
❌ Missing or errored files: 0
X shape: (444, 13)
y shape: (444,)



