In [1]:
import os
import numpy as np
import librosa
import tensorflow as tf
import tensorflow_hub as hub
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers, models




In [2]:
DATASET_ROOT = 'D:\X-ITE Pain'
CLASS_NAMES = ['low_pain', 'medium_pain']
AUDIO_SUBDIR = "audio"
AUDIO_EXT = ".wav"
SAMPLE_RATE = 16000

In [3]:
yamnet_model = hub.load("https://tfhub.dev/google/yamnet/1")













In [4]:
def extract_embedding(audio_file):
    wav, sr = librosa.load(audio_file, sr=SAMPLE_RATE)
    if wav.ndim > 1:
        wav = librosa.to_mono(wav)
    waveform = wav.astype(np.float32)
    _, embeddings, _ = yamnet_model(waveform)
    return tf.reduce_mean(embeddings, axis=0).numpy()

def collect_files_and_labels(dataset_root):
    files, labels = [], []
    for cls in CLASS_NAMES:
        cls_audio_dir = os.path.join(dataset_root, cls, AUDIO_SUBDIR)
        if not os.path.isdir(cls_audio_dir):
            continue
        # Recursively search for .wav files in all subject subfolders
        for root, dirs, filenames in os.walk(cls_audio_dir):
            for fname in filenames:
                if fname.endswith(AUDIO_EXT):
                    files.append(os.path.join(root, fname))
                    labels.append(cls)
    return files, labels

In [5]:
files, labels = collect_files_and_labels(DATASET_ROOT)
print(f"Total audio samples found: {len(files)}")
for cls in CLASS_NAMES:
    print(f"{cls}: {labels.count(cls)} samples")

Total audio samples found: 2638
low_pain: 1318 samples
medium_pain: 1320 samples


In [6]:
embeddings = []
for f in tqdm(files, desc="Extracting embeddings"):
    emb = extract_embedding(f)
    embeddings.append(emb)
X = np.stack(embeddings)
y = np.array(labels)

Extracting embeddings: 100%|██████████| 2638/2638 [01:28<00:00, 29.71it/s]


In [7]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

print("Training samples:", X_train.shape[0])
print("Test samples:", X_test.shape[0])
print("Label mapping:", dict(zip(le.classes_, le.transform(le.classes_))))

Training samples: 2110
Test samples: 528
Label mapping: {np.str_('low_pain'): np.int64(0), np.str_('medium_pain'): np.int64(1)}


In [14]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Set max_iter to 10 (epochs)
mlp = MLPClassifier(hidden_layer_sizes=(256,32), max_iter=350, random_state=42)
mlp.fit(X_train, y_train)

# Evaluate on test set
y_pred = mlp.predict(X_test)
print("Classification Report:\n")
print(classification_report(y_test, y_pred, target_names=le.classes_))
print("Confusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))

Classification Report:

              precision    recall  f1-score   support

    low_pain       0.50      0.39      0.43       264
 medium_pain       0.50      0.61      0.55       264

    accuracy                           0.50       528
   macro avg       0.50      0.50      0.49       528
weighted avg       0.50      0.50      0.49       528

Confusion Matrix:

[[102 162]
 [104 160]]
