In [1]:
!pip install opendatasets librosa soundfile

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl.metadata (9.2 kB)
Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


## Import Library

In [2]:
import opendatasets as od
od.download("https://www.kaggle.com/datasets/franciscoaliss/music-instrument-stems")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: bondantm
Your Kaggle Key: ··········
Dataset URL: https://www.kaggle.com/datasets/franciscoaliss/music-instrument-stems
Downloading music-instrument-stems.zip to ./music-instrument-stems


100%|██████████| 912M/912M [00:09<00:00, 100MB/s] 





In [3]:
import os
import librosa
import soundfile as sf

import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import classification_report
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model

In [4]:
acoustic_guitar_dir = '/content/music-instrument-stems/mega_augmented_ds/mega_augmented_ds/Acoustic Guitar'
piano_dir = '/content/music-instrument-stems/mega_augmented_ds/mega_augmented_ds/Piano'

audio_data = []
labels = []

def load_audio_data(directory, label):
    for filename in os.listdir(directory):
        if filename.endswith('.wav'):
            filepath = os.path.join(directory, filename)
            try:
                # Load audio file
                data, sample_rate = librosa.load(filepath)
                audio_data.append(data)
                labels.append(label)
            except Exception as e:
                print(f"Error loading {filepath}: {e}")

load_audio_data(acoustic_guitar_dir, 'Acoustic Guitar')
load_audio_data(piano_dir, 'Piano')

# You can now process the audio_data and labels further as needed
print(f"Loaded {len(audio_data)} audio files.")
print(f"Labels: {np.unique(labels)}")

Loaded 2128 audio files.
Labels: ['Acoustic Guitar' 'Piano']


## Feature extraction

### Alasan:
Mengambil fitur yang relevan dari data audio yang telah dimuat, khususnya Mel-Frequency Cepstral Coefficients (MFCCs).


In [5]:
features = []
for data in audio_data:
    # Extract MFCC features
    mfccs = librosa.feature.mfcc(y=data, sr=22050) # Using default sample rate of librosa
    # Calculate the mean of MFCCs across the time axis
    mfccs_processed = np.mean(mfccs.T, axis=0)
    features.append(mfccs_processed)

# Convert the features list to a NumPy array
features = np.array(features)

print(f"Extracted features shape: {features.shape}")

Extracted features shape: (2128, 20)


## Under sampling

### Alasan:
Memeriksa dan menyeimbangkan distribusi jumlah sampel di antara kelas-kelas target (yaitu, kelas "Gitar" dan kelas "Piano") menggunakan teknik undersampling.


In [6]:
unique_labels, counts = np.unique(labels, return_counts=True)
print("Label distribution before undersampling:")
for label, count in zip(unique_labels, counts):
    print(f"{label}: {count}")

# Check for imbalance (e.g., if the ratio between the largest and smallest class is significant)
if counts.max() / counts.min() > 1.5: # Using a threshold of 1.5 as an example for imbalance
    print("\nDataset is imbalanced, applying undersampling.")
    from imblearn.under_sampling import RandomUnderSampler
    rus = RandomUnderSampler(random_state=42)
    features_resampled, labels_resampled = rus.fit_resample(features, labels)
    features = features_resampled
    labels = labels_resampled
    print("Label distribution after undersampling:")
    unique_labels, counts = np.unique(labels, return_counts=True)
    for label, count in zip(unique_labels, counts):
        print(f"{label}: {count}")
else:
    print("\nDataset is balanced or the imbalance is not significant enough for undersampling.")


Label distribution before undersampling:
Acoustic Guitar: 836
Piano: 1292

Dataset is imbalanced, applying undersampling.
Label distribution after undersampling:
Acoustic Guitar: 836
Piano: 836


## Data splitting

### Alasan:
Pembagian data memastikan model belajar dari satu set data dan diuji keandalannya pada set data lain, sementara konversi label memastikan data berada dalam format numerik yang efisien untuk komputasi model.


In [8]:
y_labels_array = np.array(labels)
X_train, X_test, y_train, y_test = train_test_split(features, y_labels_array, test_size=0.2, random_state=42)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (1337, 20)
Shape of X_test: (335, 20)
Shape of y_train: (1337,)
Shape of y_test: (335,)


## Label Encoding

### Alasan:
mengubah label kategori yang berupa teks atau string (misalnya, 'Gitar Akustik', 'Piano') menjadi format numerik (misalnya, 0 dan 1).


In [9]:
le = LabelEncoder()

y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

print("Unique values in y_train_encoded:", np.unique(y_train_encoded))
print("Unique values in y_test_encoded:", np.unique(y_test_encoded))
print("Mapping of labels:", list(le.classes_))


Unique values in y_train_encoded: [0 1]
Unique values in y_test_encoded: [0 1]
Mapping of labels: [np.str_('Acoustic Guitar'), np.str_('Piano')]


## Prepare features for MobileNetV2

### Alasan:
Mengubah fitur MFCC yang telah diekstrak (data audio) menjadi format yang dapat diproses oleh arsitektur MobileNetV2, yang pada dasarnya adalah model klasifikasi gambar.

In [10]:
TARGET_HEIGHT = 96
TARGET_WIDTH = 96
TARGET_CHANNELS = 3 # MobileNetV2 expects 3 channels (RGB)

def prepare_features_for_mobilenet(features, target_h, target_w, target_c):
    # Reshape from (num_samples, 20) to (num_samples, 20, 1, 1)
    # This treats each sample's 20 features as a 20x1 "image" with 1 channel
    reshaped_features = features[:, :, np.newaxis, np.newaxis] # Shape (num_samples, 20, 1, 1)

    current_height = reshaped_features.shape[1] # 20
    current_width = reshaped_features.shape[2] # 1

    pad_h = target_h - current_height
    pad_w = target_w - current_width

    if pad_h < 0 or pad_w < 0:
        # This should not happen with TARGET_HEIGHT=96, TARGET_WIDTH=96 and current dimensions (20,1)
        raise ValueError("Target dimensions must be greater than or equal to current dimensions.")

    # Pad height and width with zeros to reach target_h and target_w
    # The padding is applied symmetrically (0 before, pad_h/pad_w after) to fill the space
    padded_features = np.pad(reshaped_features,
                             ((0, 0), (0, pad_h), (0, pad_w), (0, 0)),
                             mode='constant',
                             constant_values=0)
    # Shape after padding: (num_samples, target_h, target_w, 1)

    # Replicate the single channel to 3 channels (RGB) by repeating along the last axis
    final_features = np.repeat(padded_features, target_c, axis=-1)
    # Shape after replication: (num_samples, target_h, target_w, target_c)

    return final_features

X_train_processed = prepare_features_for_mobilenet(X_train, TARGET_HEIGHT, TARGET_WIDTH, TARGET_CHANNELS)
X_test_processed = prepare_features_for_mobilenet(X_test, TARGET_HEIGHT, TARGET_WIDTH, TARGET_CHANNELS)

print(f"Shape of X_train_processed: {X_train_processed.shape}")
print(f"Shape of X_test_processed: {X_test_processed.shape}")

Shape of X_train_processed: (1337, 96, 96, 3)
Shape of X_test_processed: (335, 96, 96, 3)


## Build MobileNetV2-based Classifier

### Alasan:
Membangun model klasifikasi dengan menggunakan arsitektur MobileNetV2 yang sudah dilatih (pre-trained) sebagai kerangka dasar, dan kemudian memodifikasi lapisan akhirnya agar sesuai dengan tugas klasifikasi biner Anda (Gitar vs. Piano).

In [11]:
# Load the pre-trained MobileNetV2 model without the top classification layer
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(TARGET_HEIGHT, TARGET_WIDTH, TARGET_CHANNELS))

# Add custom classification layers on top of MobileNetV2
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x) # Additional dense layer for more capacity (optional)
predictions = Dense(1, activation='sigmoid')(x) # Output layer for binary classification

# Create the full model
model = Model(inputs=base_model.input, outputs=predictions)

# Freeze the layers of the pre-trained MobileNetV2 base model
for layer in base_model.layers:
    layer.trainable = False

model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_96_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


## Compile and Train the Model

### Alasan:
Tahap implementasi inti, di mana model MobileNetV2 yang sudah dimodifikasi disiapkan (dikompilasi) dan diajarkan (dilatih) menggunakan data fitur MFCC yang sudah disiapkan.

In [12]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train_processed, y_train_encoded, epochs=10, batch_size=32, validation_data=(X_test_processed, y_test_encoded))

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test_processed, y_test_encoded)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

Epoch 1/10
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 448ms/step - accuracy: 0.5430 - loss: 1.0977 - val_accuracy: 0.6716 - val_loss: 0.5636
Epoch 2/10
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 327ms/step - accuracy: 0.7550 - loss: 0.5009 - val_accuracy: 0.8209 - val_loss: 0.4568
Epoch 3/10
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 298ms/step - accuracy: 0.8047 - loss: 0.4524 - val_accuracy: 0.7552 - val_loss: 0.4967
Epoch 4/10
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 289ms/step - accuracy: 0.8456 - loss: 0.3940 - val_accuracy: 0.8299 - val_loss: 0.4114
Epoch 5/10
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 288ms/step - accuracy: 0.8477 - loss: 0.3697 - val_accuracy: 0.8299 - val_loss: 0.4052
Epoch 6/10
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 281ms/step - accuracy: 0.8353 - loss: 0.3789 - val_accuracy: 0.8597 - val_loss: 0.3733
Epoch 7/10
[1m42/42[