In [60]:
"""
Step 1: Create labels.csv for low_pain and medium_pain classes

This script scans your audio dataset, assuming the following folder structure:
    D:\X-ITE Pain\low_pain\audio\{subject}\{file_name.wav}
    D:\X-ITE Pain\medium_pain\audio\{subject}\{file_name.wav}

It creates a CSV file with columns:
    pain_level, subject, file_name, audio_path

Only "low_pain" and "medium_pain" classes are included.

Instructions:
- Update BASE_DIR if your data is elsewhere.
- Run this script. It creates labels.csv in BASE_DIR.
"""

import os
import pandas as pd

BASE_DIR = r'D:\X-ITE Pain'  # Change this if your data is elsewhere
PAIN_CLASSES = {"low_pain", "medium_pain"}  # Only use these classes

rows = []
for pain_level in os.listdir(BASE_DIR):
    if pain_level not in PAIN_CLASSES:
        continue
    pain_path = os.path.join(BASE_DIR, pain_level, 'audio')
    if not os.path.isdir(pain_path):
        continue
    for subject in os.listdir(pain_path):
        subject_path = os.path.join(pain_path, subject)
        if not os.path.isdir(subject_path):
            continue
        for f in os.listdir(subject_path):
            if f.lower().endswith('.wav'):
                rows.append({
                    'pain_level': pain_level,
                    'subject': subject,
                    'file_name': f,
                    'audio_path': os.path.join(subject_path, f)
                })

df = pd.DataFrame(rows)
labels_path = os.path.join(BASE_DIR, 'labels.csv')
df.to_csv(labels_path, index=False)
print(f"Created {labels_path} with {len(df)} rows and columns: {df.columns.tolist()}")

Created D:\X-ITE Pain\labels.csv with 3117 rows and columns: ['pain_level', 'subject', 'file_name', 'audio_path']


In [61]:
"""
Step 2: Extract advanced audio features for low_pain and medium_pain samples

- Loads labels.csv from Step 1.
- For each audio file, extracts:
    - MFCCs (mean, std, min, max, 13 coefficients)
    - Delta MFCCs (mean, std, 13 coefficients)
    - Chroma STFT (mean, std, 12 coefficients)
    - Mel Spectrogram (mean, std, min, max, 128 bands)
    - Spectral Contrast (mean, std, 7 bands)
    - Tonnetz (mean, std, 6 dimensions)
    - Spectral Centroid (mean, std)
    - Spectral Bandwidth (mean, std)
    - Spectral Rolloff (mean, std)
    - Zero Crossing Rate (mean, std)
    - RMS energy (mean, std)
- Concatenates all features into one row per audio file.
- Saves the features as advanced_features.csv in your BASE_DIR.

Instructions:
- Install: librosa, numpy, pandas
- Update BASE_DIR if your data is elsewhere.
- Run after Step 1.
"""

import pandas as pd
import numpy as np
import librosa

BASE_DIR = r'D:\X-ITE Pain'  # Change if necessary
labels_path = f"{BASE_DIR}\\labels.csv"
FEATURES_PATH = f"{BASE_DIR}\\advanced_features.csv"
SR = 44100
N_MFCC = 13
N_MELS = 128

def extract_features(audio_path):
    try:
        y, _ = librosa.load(audio_path, sr=SR)
        features = []

        # MFCCs
        mfcc = librosa.feature.mfcc(y=y, sr=SR, n_mfcc=N_MFCC)
        features.extend(np.mean(mfcc, axis=1))
        features.extend(np.std(mfcc, axis=1))
        features.extend(np.min(mfcc, axis=1))
        features.extend(np.max(mfcc, axis=1))

        # Delta MFCCs
        mfcc_delta = librosa.feature.delta(mfcc)
        features.extend(np.mean(mfcc_delta, axis=1))
        features.extend(np.std(mfcc_delta, axis=1))

        # Chroma STFT
        chroma = librosa.feature.chroma_stft(y=y, sr=SR)
        features.extend(np.mean(chroma, axis=1))
        features.extend(np.std(chroma, axis=1))

        # Mel Spectrogram (log-mel)
        mel = librosa.feature.melspectrogram(y=y, sr=SR, n_mels=N_MELS)
        log_mel = librosa.power_to_db(mel)
        features.extend(np.mean(log_mel, axis=1))
        features.extend(np.std(log_mel, axis=1))
        features.extend(np.min(log_mel, axis=1))
        features.extend(np.max(log_mel, axis=1))

        # Spectral Contrast
        contrast = librosa.feature.spectral_contrast(y=y, sr=SR)
        features.extend(np.mean(contrast, axis=1))
        features.extend(np.std(contrast, axis=1))

        # Tonnetz
        tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=SR)
        features.extend(np.mean(tonnetz, axis=1))
        features.extend(np.std(tonnetz, axis=1))

        # Spectral centroid
        centroid = librosa.feature.spectral_centroid(y=y, sr=SR)
        features.append(np.mean(centroid))
        features.append(np.std(centroid))

        # Spectral bandwidth
        bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=SR)
        features.append(np.mean(bandwidth))
        features.append(np.std(bandwidth))

        # Spectral rolloff
        rolloff = librosa.feature.spectral_rolloff(y=y, sr=SR)
        features.append(np.mean(rolloff))
        features.append(np.std(rolloff))

        # Zero crossing rate
        zcr = librosa.feature.zero_crossing_rate(y)
        features.append(np.mean(zcr))
        features.append(np.std(zcr))

        # RMS energy
        rms = librosa.feature.rms(y=y)
        features.append(np.mean(rms))
        features.append(np.std(rms))

        return features
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        # Return NaNs for all features if error
        n_features = (N_MFCC*4 + N_MFCC*2 + 12*2 + N_MELS*4 + 7*2 + 6*2 + 2*5)
        return [np.nan] * n_features

# Build feature column names
feature_names = (
    [f"mfcc_mean_{i+1}" for i in range(N_MFCC)] +
    [f"mfcc_std_{i+1}" for i in range(N_MFCC)] +
    [f"mfcc_min_{i+1}" for i in range(N_MFCC)] +
    [f"mfcc_max_{i+1}" for i in range(N_MFCC)] +

    [f"mfcc_delta_mean_{i+1}" for i in range(N_MFCC)] +
    [f"mfcc_delta_std_{i+1}" for i in range(N_MFCC)] +

    [f"chroma_mean_{i+1}" for i in range(12)] +
    [f"chroma_std_{i+1}" for i in range(12)] +

    [f"mel_mean_{i+1}" for i in range(N_MELS)] +
    [f"mel_std_{i+1}" for i in range(N_MELS)] +
    [f"mel_min_{i+1}" for i in range(N_MELS)] +
    [f"mel_max_{i+1}" for i in range(N_MELS)] +

    [f"contrast_mean_{i+1}" for i in range(7)] +
    [f"contrast_std_{i+1}" for i in range(7)] +

    [f"tonnetz_mean_{i+1}" for i in range(6)] +
    [f"tonnetz_std_{i+1}" for i in range(6)] +

    ["centroid_mean", "centroid_std"] +
    ["bandwidth_mean", "bandwidth_std"] +
    ["rolloff_mean", "rolloff_std"] +
    ["zcr_mean", "zcr_std"] +
    ["rms_mean", "rms_std"]
)

df = pd.read_csv(labels_path)
features = [extract_features(row['audio_path']) for _, row in df.iterrows()]
feat_df = pd.DataFrame(features, columns=feature_names)
result_df = pd.concat([df, feat_df], axis=1)
result_df.to_csv(FEATURES_PATH, index=False)
print(f"Advanced feature extraction complete. Saved to {FEATURES_PATH} with shape {result_df.shape}")

Advanced feature extraction complete. Saved to D:\X-ITE Pain\advanced_features.csv with shape (3117, 654)


In [62]:
"""
Step 3: Stratified Train/Test Split for low_pain and medium_pain

- Loads mfcc_features.csv produced in Step 2.
- Performs a stratified split (so both classes are balanced in train and test).
- Saves train_features.csv and test_features.csv in your BASE_DIR.

Instructions:
- Make sure BASE_DIR is set correctly.
- Run this script after Step 2.
"""

import pandas as pd
from sklearn.model_selection import train_test_split

BASE_DIR = r'D:\X-ITE Pain'  # Change if necessary
FEATURES_PATH = f"{BASE_DIR}\\advanced_features.csv"
TRAIN_PATH = f"{BASE_DIR}\\train_features.csv"
TEST_PATH = f"{BASE_DIR}\\test_features.csv"

# Load features
df = pd.read_csv(FEATURES_PATH)

# Drop rows with missing features (if any)
df = df.dropna()

# Stratified split
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['pain_level'],
    random_state=42,
)

# Save splits
train_df.to_csv(TRAIN_PATH, index=False)
test_df.to_csv(TEST_PATH, index=False)
print(f"Train set: {len(train_df)} samples | Test set: {len(test_df)} samples")
print(f"Saved: {TRAIN_PATH} and {TEST_PATH}")

Train set: 2493 samples | Test set: 624 samples
Saved: D:\X-ITE Pain\train_features.csv and D:\X-ITE Pain\test_features.csv


In [78]:
"""
Step 4: Train and evaluate an XGBoost model for low_pain vs. medium_pain (with label encoding)

- Loads train_features.csv and test_features.csv from Step 3.
- Encodes string labels to integers (required for XGBoost).
- Standardizes features using scikit-learn's StandardScaler.
- Trains an XGBoost classifier.
- Prints classification report and confusion matrix (with original string labels).
- Saves the trained model and the label encoder.

Instructions:
- Make sure you have installed: xgboost, scikit-learn, joblib, pandas, numpy
- Update BASE_DIR if your data is elsewhere.
- Run this script after Step 3.
"""

import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib

BASE_DIR = r'D:\X-ITE Pain'  # Change if necessary
TRAIN_PATH = f"{BASE_DIR}\\train_features.csv"
TEST_PATH = f"{BASE_DIR}\\test_features.csv"
MODEL_PATH = f"{BASE_DIR}\\xgb_audio_model.joblib"
ENCODER_PATH = f"{BASE_DIR}\\label_encoder.joblib"

# Load train/test sets
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

# Identify feature columns (exclude metadata)
exclude_cols = {'pain_level', 'subject', 'file_name', 'audio_path'}
feature_cols = [c for c in train_df.columns if c not in exclude_cols]

X_train = train_df[feature_cols].values
X_test = test_df[feature_cols].values

# Encode labels
le = LabelEncoder()
y_train = le.fit_transform(train_df['pain_level'].values)
y_test = le.transform(test_df['pain_level'].values)

# Build pipeline: scaler + XGBoost
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', XGBClassifier(
        use_label_encoder=False,
        eval_metric='mlogloss',
        n_estimators=300,
        max_depth=6,
        learning_rate=0.03,
        n_jobs=-1,
        random_state=42
    ))
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Convert predictions back to string labels for reporting
y_pred_labels = le.inverse_transform(y_pred)
y_test_labels = le.inverse_transform(y_test)

print("\nClassification Report:\n", classification_report(y_test_labels, y_pred_labels))
print("\nConfusion Matrix:\n", confusion_matrix(y_test_labels, y_pred_labels))

# Save the trained model and label encoder
#joblib.dump(pipeline, MODEL_PATH)
#joblib.dump(le, ENCODER_PATH)
#print(f"\nModel saved as {MODEL_PATH}")
#print(f"Label encoder saved as {ENCODER_PATH}")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Classification Report:
               precision    recall  f1-score   support

    low_pain       0.59      0.57      0.58       312
 medium_pain       0.59      0.61      0.60       312

    accuracy                           0.59       624
   macro avg       0.59      0.59      0.59       624
weighted avg       0.59      0.59      0.59       624


Confusion Matrix:
 [[178 134]
 [122 190]]


In [79]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier
import joblib

# === Paths ===
TRAIN_FEATURES_PATH = "train_features.csv"
TEST_FEATURES_PATH = "test_features.csv"
FINAL_MODEL_PATH = "xgb_final_trained_all_features.joblib"
FINAL_ENCODER_PATH = "label_encoder_final.joblib"
FINAL_SCALER_PATH = "scaler_final.joblib"

# === Load train and test data ===
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

# === Combine train and test for full data retraining ===
full_df = pd.concat([train_df, test_df], ignore_index=True)

# === Separate features and labels ===
feature_cols = [col for col in full_df.columns if col not in ['pain_level', 'subject', 'file_name', 'audio_path']]
X_full = full_df[feature_cols].values
y_full = full_df['pain_level']

# === Encode labels ===
le = LabelEncoder()
y_full_encoded = le.fit_transform(y_full)

# === Standardize features ===
scaler = StandardScaler()
X_full_scaled = scaler.fit_transform(X_full)

# === Train final model ===
final_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
final_model.fit(X_full_scaled, y_full_encoded)

# === Save model, encoder, and scaler ===
joblib.dump(final_model, FINAL_MODEL_PATH)
joblib.dump(le, FINAL_ENCODER_PATH)
joblib.dump(scaler, FINAL_SCALER_PATH)

print(f"Final model saved as: {FINAL_MODEL_PATH}")
print(f"Label encoder saved as: {FINAL_ENCODER_PATH}")
print(f"Scaler saved as: {FINAL_SCALER_PATH}")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Final model saved as: xgb_final_trained_all_features.joblib
Label encoder saved as: label_encoder_final.joblib
Scaler saved as: scaler_final.joblib


In [None]:
"""
Step 5: Prepare and Unlabel Your Test Set for Prediction

- Scans the new test dataset directory with the following structure:
    D:\test\<pain level>\<Subject name>\<audio file>
- Extracts subject, file name, and full audio path into a new CSV.
- Ignores the pain level column (or sets as 'unknown') for prediction.
- Output: test_unlabelled.csv with columns: subject, file_name, audio_path

Instructions:
- Update TEST_BASE_DIR if your test data is elsewhere.
- This file will be used for feature extraction and prediction.
"""

import os
import pandas as pd

TEST_BASE_DIR = r'D:\test'  # Update if necessary
rows = []

# Loop through the directory structure
for pain_level in os.listdir(TEST_BASE_DIR):
    pain_dir = os.path.join(TEST_BASE_DIR, pain_level)
    if not os.path.isdir(pain_dir):
        continue
    for subject in os.listdir(pain_dir):
        subject_dir = os.path.join(pain_dir, subject)
        if not os.path.isdir(subject_dir):
            continue
        for f in os.listdir(subject_dir):
            if f.lower().endswith('.wav'):
                rows.append({
                    'subject': subject,
                    'file_name': f,
                    'audio_path': os.path.join(subject_dir, f)
                    # Optionally, add 'pain_level': pain_level  # If you want to keep actual for later comparison
                })

# Save as CSV (no label column for prediction)
test_unlabelled_path = os.path.join(TEST_BASE_DIR, 'test_unlabelled.csv')
df = pd.DataFrame(rows)
df.to_csv(test_unlabelled_path, index=False)
print(f"Created {test_unlabelled_path} with {len(df)} files ready for feature extraction and prediction.")

In [None]:
"""
Step 6: Extract Features from Unlabelled Test Audio

- Loads test_unlabelled.csv from Step 5.
- For each audio file, extracts the same features as used for training:
    - MFCCs (mean, std, min, max, 13 coefficients)
    - Delta MFCCs (mean, std, 13 coefficients)
    - Chroma STFT (mean, std, 12 coefficients)
    - Mel Spectrogram (mean, std, min, max, 128 bands)
    - Spectral Contrast (mean, std, 7 bands)
    - Tonnetz (mean, std, 6 dimensions)
    - Spectral Centroid (mean, std)
    - Spectral Bandwidth (mean, std)
    - Spectral Rolloff (mean, std)
    - Zero Crossing Rate (mean, std)
    - RMS energy (mean, std)
- Concatenates all features into one row per audio file.
- Saves results as test_unlabelled_features.csv in your test data directory.

Instructions:
- Requires: librosa, numpy, pandas
- Update TEST_BASE_DIR if your data is elsewhere.
- Run after Step 5.
"""

import pandas as pd
import numpy as np
import librosa
import os

TEST_BASE_DIR = r'D:\test'  # Update if necessary
UNLABELLED_CSV = os.path.join(TEST_BASE_DIR, 'test_unlabelled.csv')
FEATURES_OUT = os.path.join(TEST_BASE_DIR, 'test_unlabelled_features.csv')
SR = 44100
N_MFCC = 13
N_MELS = 128

def extract_features(audio_path):
    try:
        y, _ = librosa.load(audio_path, sr=SR)
        features = []

        # MFCCs
        mfcc = librosa.feature.mfcc(y=y, sr=SR, n_mfcc=N_MFCC)
        features.extend(np.mean(mfcc, axis=1))
        features.extend(np.std(mfcc, axis=1))
        features.extend(np.min(mfcc, axis=1))
        features.extend(np.max(mfcc, axis=1))

        # Delta MFCCs
        mfcc_delta = librosa.feature.delta(mfcc)
        features.extend(np.mean(mfcc_delta, axis=1))
        features.extend(np.std(mfcc_delta, axis=1))

        # Chroma STFT
        chroma = librosa.feature.chroma_stft(y=y, sr=SR)
        features.extend(np.mean(chroma, axis=1))
        features.extend(np.std(chroma, axis=1))

        # Mel Spectrogram (log-mel)
        mel = librosa.feature.melspectrogram(y=y, sr=SR, n_mels=N_MELS)
        log_mel = librosa.power_to_db(mel)
        features.extend(np.mean(log_mel, axis=1))
        features.extend(np.std(log_mel, axis=1))
        features.extend(np.min(log_mel, axis=1))
        features.extend(np.max(log_mel, axis=1))

        # Spectral Contrast
        contrast = librosa.feature.spectral_contrast(y=y, sr=SR)
        features.extend(np.mean(contrast, axis=1))
        features.extend(np.std(contrast, axis=1))

        # Tonnetz
        tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=SR)
        features.extend(np.mean(tonnetz, axis=1))
        features.extend(np.std(tonnetz, axis=1))

        # Spectral centroid
        centroid = librosa.feature.spectral_centroid(y=y, sr=SR)
        features.append(np.mean(centroid))
        features.append(np.std(centroid))

        # Spectral bandwidth
        bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=SR)
        features.append(np.mean(bandwidth))
        features.append(np.std(bandwidth))

        # Spectral rolloff
        rolloff = librosa.feature.spectral_rolloff(y=y, sr=SR)
        features.append(np.mean(rolloff))
        features.append(np.std(rolloff))

        # Zero crossing rate
        zcr = librosa.feature.zero_crossing_rate(y)
        features.append(np.mean(zcr))
        features.append(np.std(zcr))

        # RMS energy
        rms = librosa.feature.rms(y=y)
        features.append(np.mean(rms))
        features.append(np.std(rms))

        return features
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        # Return NaNs for all features if error
        n_features = (N_MFCC*4 + N_MFCC*2 + 12*2 + N_MELS*4 + 7*2 + 6*2 + 2*5)
        return [np.nan] * n_features

# Build feature column names
feature_names = (
    [f"mfcc_mean_{i+1}" for i in range(N_MFCC)] +
    [f"mfcc_std_{i+1}" for i in range(N_MFCC)] +
    [f"mfcc_min_{i+1}" for i in range(N_MFCC)] +
    [f"mfcc_max_{i+1}" for i in range(N_MFCC)] +

    [f"mfcc_delta_mean_{i+1}" for i in range(N_MFCC)] +
    [f"mfcc_delta_std_{i+1}" for i in range(N_MFCC)] +

    [f"chroma_mean_{i+1}" for i in range(12)] +
    [f"chroma_std_{i+1}" for i in range(12)] +

    [f"mel_mean_{i+1}" for i in range(N_MELS)] +
    [f"mel_std_{i+1}" for i in range(N_MELS)] +
    [f"mel_min_{i+1}" for i in range(N_MELS)] +
    [f"mel_max_{i+1}" for i in range(N_MELS)] +

    [f"contrast_mean_{i+1}" for i in range(7)] +
    [f"contrast_std_{i+1}" for i in range(7)] +

    [f"tonnetz_mean_{i+1}" for i in range(6)] +
    [f"tonnetz_std_{i+1}" for i in range(6)] +

    ["centroid_mean", "centroid_std"] +
    ["bandwidth_mean", "bandwidth_std"] +
    ["rolloff_mean", "rolloff_std"] +
    ["zcr_mean", "zcr_std"] +
    ["rms_mean", "rms_std"]
)

# Load the unlabelled test metadata
df = pd.read_csv(UNLABELLED_CSV)
features = [extract_features(row['audio_path']) for _, row in df.iterrows()]
feat_df = pd.DataFrame(features, columns=feature_names)
result_df = pd.concat([df, feat_df], axis=1)
result_df.to_csv(FEATURES_OUT, index=False)
print(f"Advanced feature extraction for test set complete. Saved to {FEATURES_OUT} with shape {result_df.shape}")

In [None]:
"""
Step 7: Predict Pain Levels on Unlabelled Test Audio Features

- Loads extracted features from test_unlabelled_features.csv (from Step 6).
- Loads the trained final model, label encoder, and scaler.
- Applies **exact same scaling and feature order** as during training.
- Predicts pain levels for each unlabelled sample.
- Saves results as test_predictions.csv in the test directory, including:
    subject, file_name, audio_path, predicted_pain_level

Instructions:
- Make sure test_unlabelled_features.csv, model, scaler, and label encoder are in the correct paths.
- Run after Step 6.
"""

import os
import pandas as pd
import joblib

# === Paths ===
TEST_BASE_DIR = r'D:\test'
FEATURES_PATH = os.path.join(TEST_BASE_DIR, 'test_unlabelled_features.csv')
MODEL_PATH = r'D:\X-ITE Pain\xgb_final_trained_all_features.joblib'
ENCODER_PATH = r'D:\X-ITE Pain\label_encoder_final.joblib'
SCALER_PATH = r'D:\X-ITE Pain\scaler_final.joblib'
OUTPUT_PATH = os.path.join(TEST_BASE_DIR, 'test_predictions.csv')

# === Load features ===
df = pd.read_csv(FEATURES_PATH)

# === Load model, scaler, and encoder ===
model = joblib.load(MODEL_PATH)
scaler = joblib.load(SCALER_PATH)
le = joblib.load(ENCODER_PATH)

# === Identify feature columns (exclude metadata) ===
exclude_cols = {'subject', 'file_name', 'audio_path'}
feature_cols = [c for c in df.columns if c not in exclude_cols]

# === Prepare features ===
X = df[feature_cols].values
X_scaled = scaler.transform(X)

# === Predict ===
y_pred = model.predict(X_scaled)
y_pred_labels = le.inverse_transform(y_pred)

# === Save predictions ===
results_df = df[['subject', 'file_name', 'audio_path']].copy()
results_df['predicted_pain_level'] = y_pred_labels
results_df.to_csv(OUTPUT_PATH, index=False)
print(f"Predictions saved to {OUTPUT_PATH}")