In [12]:
import numpy as np
import pandas as pd
import os
import glob
from scipy.io import loadmat
from scipy.signal import butter, filtfilt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import CalibratedClassifierCV
import matplotlib.pyplot as plt
import seaborn as sns

# Data directory and parameters
# -----------------------------
# Root folder of the dataset. The loading code in this notebook walks
# DATA_ROOT/<pain folder>/<subject folder>/*.mat.
# NOTE(review): absolute, machine-specific Windows path -- adjust per machine.
DATA_ROOT = r'D:\bio_s'
# Maps each pain-level sub-folder name to the label written to the 'pain' column.
PAIN_LEVELS = {'low_pain': 'PL_1', 'med_pain': 'PL_2'}
# Prefixes for the per-channel feature columns; index-aligned with EMG_COLS.
EMG_LABELS = ['corrugator', 'zygomaticus']
EMG_COLS = [0, 1]  # Column indices into mat['data']: use only first two (corrugator, zygomaticus)
# (low, high) band-pass corner frequencies handed to apply_bandpass;
# presumably in Hz, i.e. the same unit as FS -- TODO confirm.
BANDPASS = (20, 450)
FS = 1000  # Sampling rate (presumably Hz -- TODO confirm)
# Absolute-amplitude limit used by artifact_rejection on the filtered signal.
# NOTE(review): the feature preview printed by the loading cell shows filtered
# extrema around 1e-2..1e-1; if the recordings are stored in those units, a
# limit of 1000 never rejects anything -- confirm the signal units.
ARTIFACT_THRESHOLD = 1000  # uV, adjust as needed

In [13]:
# Step 2: Helper Functions
# ------------------------

from scipy.signal import butter, filtfilt
import numpy as np

def butter_bandpass(lowcut, highcut, fs, order=4):
    """
    Build the coefficients of a Butterworth band-pass filter.

    Both corner frequencies are expressed as a fraction of the Nyquist
    frequency (half of `fs`) before being handed to `scipy.signal.butter`.

    Parameters:
        lowcut: lower corner frequency, in the same unit as `fs`.
        highcut: upper corner frequency, in the same unit as `fs`.
        fs: sampling rate.
        order: order argument forwarded to `butter`.

    Returns:
        (b, a): numerator and denominator coefficient arrays.
    """
    nyquist = 0.5 * fs
    normalized_band = [cutoff / nyquist for cutoff in (lowcut, highcut)]
    numerator, denominator = butter(order, normalized_band, btype='band')
    return numerator, denominator

def apply_bandpass(data, fs, lowcut=20, highcut=450, order=4):
    """
    Band-pass filter `data` with a Butterworth filter applied via `filtfilt`.

    Parameters:
        data: signal to filter (passed to `scipy.signal.filtfilt` unchanged).
        fs: sampling rate, in the same unit as the corner frequencies.
        lowcut: lower corner frequency (default 20).
        highcut: upper corner frequency (default 450).
        order: filter order forwarded to `butter_bandpass`. Defaults to 4,
            which is the order `butter_bandpass` used implicitly before this
            parameter existed, so existing calls are unaffected.

    Returns:
        The filtered signal as returned by `filtfilt`.

    Raises:
        ValueError: propagated from SciPy, e.g. for corner frequencies outside
            (0, fs/2) or a signal too short for `filtfilt`'s default padding.
    """
    b, a = butter_bandpass(lowcut, highcut, fs, order=order)
    # filtfilt runs the filter forward and then backward over the signal.
    return filtfilt(b, a, data)

def artifact_rejection(emg_signal, threshold=1000):
    """
    Tell whether a segment should be treated as artifacted.

    Returns True when at least one sample has an absolute value strictly
    greater than `threshold`, and False otherwise.
    """
    magnitudes = np.abs(emg_signal)
    exceeds_limit = magnitudes > threshold
    return np.any(exceeds_limit)

def extract_features(emg_signal, fs):
    """
    Extracts time and frequency domain features from an EMG signal segment.

    Parameters:
        emg_signal: 1-D sequence of samples (converted with `np.asarray`, so
            plain lists are accepted as well as arrays).
        fs: sampling rate used to scale the FFT frequency axis.

    Returns:
        dict with, in this order: 'mean', 'std', 'rms', 'max', 'min',
        'median', 'abs_mean', 'zero_cross' (Python int), 'waveform_length',
        'mean_freq', 'median_freq'. The two spectral features are computed
        from the magnitude of the real FFT; both are 0.0 when the spectrum
        carries no energy (e.g. an all-zero segment), instead of NaN.
    """
    emg_signal = np.asarray(emg_signal)

    feats = {}
    feats['mean'] = np.mean(emg_signal)
    feats['std'] = np.std(emg_signal)
    feats['rms'] = np.sqrt(np.mean(emg_signal ** 2))
    feats['max'] = np.max(emg_signal)
    feats['min'] = np.min(emg_signal)
    feats['median'] = np.median(emg_signal)
    feats['abs_mean'] = np.mean(np.abs(emg_signal))
    # A sign change between neighbours makes their product negative.
    # Cast to a plain int so the value matches the later definition of this
    # function in the notebook.
    feats['zero_cross'] = int(np.count_nonzero(emg_signal[:-1] * emg_signal[1:] < 0))
    feats['waveform_length'] = np.sum(np.abs(np.diff(emg_signal)))

    # Frequency-domain features
    fft_vals = np.abs(np.fft.rfft(emg_signal))
    fft_freq = np.fft.rfftfreq(len(emg_signal), 1/fs)
    total = np.sum(fft_vals)
    if total > 0:
        # Magnitude-weighted average frequency.
        feats['mean_freq'] = np.sum(fft_freq * fft_vals) / total
        # First bin at which the cumulative magnitude reaches half the total.
        median_freq_idx = np.where(np.cumsum(fft_vals) >= total / 2)[0][0]
        feats['median_freq'] = fft_freq[median_freq_idx]
    else:
        # Empty spectrum: avoid the 0/0 division that would yield NaN.
        feats['mean_freq'] = 0.0
        feats['median_freq'] = 0.0
    return feats

In [14]:
# Step 3: Data Loading & Feature Table Construction
# ------------------------------------------------

import os
import glob
from scipy.io import loadmat
import pandas as pd

def process_subject_segments(subject_folder, pain_label, artifact_threshold=ARTIFACT_THRESHOLD):
    """
    For a given subject and pain level, process all .mat files:
      - Bandpass filter both EMG channels
      - Reject segments with artifacts
      - Extract features for both channels
      - Return list of feature dicts (one per segment)

    Parameters:
        subject_folder: directory holding one subject's `*.mat` segment files;
            its final path component is used as the subject id.
        pain_label: value stored under the 'pain' key of every returned row.
        artifact_threshold: amplitude limit passed to `artifact_rejection`.

    Returns:
        list of dicts with keys 'subject', 'segment_file', 'pain' followed by
        '<channel label>_<feature>' entries. Segments in which any channel is
        flagged as artifacted are left out. Files are visited in sorted name
        order so the row order is reproducible.

    Files that fail to load or process are reported with a printed message and
    skipped; the exception is not re-raised.
    """
    # normpath drops a trailing separator, which would otherwise make
    # basename() return an empty subject id.
    subject_id = os.path.basename(os.path.normpath(subject_folder))
    feature_rows = []
    # glob makes no ordering promise; sort so repeated runs agree.
    for mat_path in sorted(glob.glob(os.path.join(subject_folder, '*.mat'))):
        try:
            mat = loadmat(mat_path)
            data = mat['data']  # original author's note: shape (n_samples, 5)
            row = {
                'subject': subject_id,
                'segment_file': os.path.basename(mat_path),
                'pain': pain_label
            }
            for i, ch in enumerate(EMG_COLS):
                emg_filt = apply_bandpass(data[:, ch], FS, BANDPASS[0], BANDPASS[1])
                if artifact_rejection(emg_filt, artifact_threshold):
                    # One bad channel discards the whole segment.
                    break
                for k, v in extract_features(emg_filt, FS).items():
                    row[f'{EMG_LABELS[i]}_{k}'] = v
            else:
                # Reached only when no channel triggered the break above.
                feature_rows.append(row)
        except Exception as e:
            # Deliberate best effort: report the file and keep going.
            print(f"Error processing {mat_path}: {e}")
    return feature_rows

# Aggregate for all subjects and both pain levels
# Walks DATA_ROOT/<pain folder>/<subject folder> and collects one feature row
# per accepted segment. Loop variables stay in the notebook's global namespace.
all_features = []
for pain_folder, label in PAIN_LEVELS.items():
    folder_path = os.path.join(DATA_ROOT, pain_folder)
    # Sorted so subjects are visited in a stable, name-based order.
    # os.listdir raises if the pain-level folder does not exist.
    for subj in sorted(os.listdir(folder_path)):
        subj_folder = os.path.join(folder_path, subj)
        # Ignore stray files that sit next to the subject directories.
        if os.path.isdir(subj_folder):
            print(f"Processing {subj_folder}...")
            feat_rows = process_subject_segments(subj_folder, label)
            all_features.extend(feat_rows)

# Build DataFrame
# One row per segment; columns come from the keys of the row dicts.
df_features = pd.DataFrame(all_features)
print("Feature DataFrame shape:", df_features.shape)
# Last expression of the cell: rendered as the cell's output preview.
df_features.head()

Processing D:\bio_s\low_pain\S008...
Processing D:\bio_s\low_pain\S009...
Processing D:\bio_s\low_pain\S012...
Processing D:\bio_s\low_pain\S019...
Processing D:\bio_s\low_pain\S026...
Processing D:\bio_s\low_pain\S034...
Processing D:\bio_s\low_pain\S036...
Processing D:\bio_s\low_pain\S045...
Processing D:\bio_s\low_pain\S049...
Processing D:\bio_s\low_pain\S051...
Processing D:\bio_s\low_pain\S052...
Processing D:\bio_s\low_pain\S057...
Processing D:\bio_s\low_pain\S068...
Processing D:\bio_s\low_pain\S069...
Processing D:\bio_s\low_pain\S070...
Processing D:\bio_s\low_pain\S072...
Processing D:\bio_s\low_pain\S076...
Processing D:\bio_s\low_pain\S082...
Processing D:\bio_s\low_pain\S087...
Processing D:\bio_s\low_pain\S095...
Processing D:\bio_s\low_pain\S107...
Processing D:\bio_s\low_pain\S110...
Processing D:\bio_s\low_pain\S114...
Processing D:\bio_s\low_pain\S126...
Processing D:\bio_s\low_pain\S129...
Processing D:\bio_s\low_pain\S134...
Processing D:\bio_s\med_pain\S008...
P

Unnamed: 0,subject,segment_file,pain,corrugator_mean,corrugator_std,corrugator_rms,corrugator_max,corrugator_min,corrugator_median,corrugator_abs_mean,...,zygomaticus_std,zygomaticus_rms,zygomaticus_max,zygomaticus_min,zygomaticus_median,zygomaticus_abs_mean,zygomaticus_zero_cross,zygomaticus_waveform_length,zygomaticus_mean_freq,zygomaticus_median_freq
0,S008,S008_0.mat,PL_1,-1.262862e-05,0.012943,0.012943,0.070712,-0.096127,0.000888,0.008459,...,0.004881,0.004881,0.033626,-0.034432,0.000245,0.002999,3284,26.311347,162.596924,139.3
1,S008,S008_10.mat,PL_1,3.296219e-07,0.001211,0.001211,0.006373,-0.007387,-4.4e-05,0.000933,...,0.009383,0.009383,0.064049,-0.058743,-0.000193,0.006405,2749,49.457604,159.198364,126.7
2,S008,S008_100.mat,PL_1,-3.065771e-06,0.005499,0.005499,0.028406,-0.032078,0.00036,0.003943,...,0.001688,0.001688,0.011885,-0.008169,-5.1e-05,0.001242,2740,9.698282,162.496185,127.9
3,S008,S008_102.mat,PL_1,-1.011531e-05,0.004827,0.004827,0.023616,-0.02128,4.3e-05,0.003596,...,0.001527,0.001527,0.02073,-0.013734,-2.6e-05,0.001134,2558,8.506985,164.467671,128.8
4,S008,S008_103.mat,PL_1,-1.658919e-06,0.004719,0.004719,0.06827,-0.089671,2e-05,0.002037,...,0.00646,0.00646,0.100476,-0.086019,-6e-05,0.003055,3162,25.39838,182.650932,165.8


In [15]:
import numpy as np
from scipy.signal import butter, filtfilt

def butter_bandpass(lowcut, highcut, fs, order=4):
    """
    Design a Butterworth band-pass filter and return its (b, a) coefficients.

    `lowcut` and `highcut` are divided by the Nyquist frequency (0.5 * fs) to
    obtain the normalized band passed to `scipy.signal.butter`.
    """
    half_rate = 0.5 * fs
    return butter(order, [lowcut / half_rate, highcut / half_rate], btype='band')

def apply_bandpass(data, fs, lowcut=20, highcut=450, order=4):
    """
    Filter `data` with the band-pass designed by `butter_bandpass`.

    The coefficients are applied with `scipy.signal.filtfilt`, and its result
    is returned unchanged.
    """
    coefficients = butter_bandpass(lowcut, highcut, fs, order=order)
    return filtfilt(*coefficients, data)

def artifact_rejection(emg_signal, threshold=1000):
    """
    Report whether any sample of the signal exceeds the artifact threshold.

    The comparison is on absolute values and is strict (> threshold).
    """
    return (np.absolute(emg_signal) > threshold).any()

def extract_features(emg_signal, fs):
    """
    Compute amplitude and spectral descriptors for one EMG segment.

    Returns: dict mapping feature name to value, in the order
    mean, std, rms, max, min, median, abs_mean, zero_cross, waveform_length,
    mean_freq, median_freq. The two spectral entries fall back to 0.0 when the
    FFT magnitude does not sum to a positive value.
    """
    feats = {
        'mean': np.mean(emg_signal),
        'std': np.std(emg_signal),
        'rms': np.sqrt(np.mean(emg_signal ** 2)),
        'max': np.max(emg_signal),
        'min': np.min(emg_signal),
        'median': np.median(emg_signal),
        'abs_mean': np.mean(np.abs(emg_signal)),
        # Neighbouring samples with opposite signs have a negative product.
        'zero_cross': int(np.count_nonzero((emg_signal[:-1] * emg_signal[1:]) < 0)),
        'waveform_length': np.sum(np.abs(np.diff(emg_signal))),
    }

    # Spectral descriptors from the magnitude of the real FFT.
    magnitude = np.abs(np.fft.rfft(emg_signal))
    bin_freqs = np.fft.rfftfreq(len(emg_signal), 1/fs)
    magnitude_sum = np.sum(magnitude)
    if not magnitude_sum > 0:
        feats['mean_freq'] = 0.0
        feats['median_freq'] = 0.0
        return feats

    feats['mean_freq'] = np.sum(bin_freqs * magnitude) / magnitude_sum
    # First bin whose cumulative magnitude reaches half of the total.
    reached_half = np.cumsum(magnitude) >= magnitude_sum / 2
    feats['median_freq'] = bin_freqs[np.flatnonzero(reached_half)[0]]
    return feats

def extract_features_from_mat_file(mat, fs, artifact_threshold=1000, emg_cols=(0, 1), emg_labels=('corrugator', 'zygomaticus'), bandpass=(20, 450)):
    """
    Given a loaded .mat file dict, returns features for the selected EMG channels.

    Parameters:
        mat: mapping with a 'data' entry that is indexed as data[:, column].
        fs: sampling rate passed to the filter and the feature extractor.
        artifact_threshold: amplitude limit passed to `artifact_rejection`.
        emg_cols: column indices of the channels to process.
        emg_labels: column-name prefixes, index-aligned with `emg_cols`.
        bandpass: (low, high) corner frequencies for `apply_bandpass`. The
            default (20, 450) equals the values that were previously implied
            by `apply_bandpass`'s own defaults.

    Returns:
        dict of '<label>_<feature>' -> value covering every channel, or None
        if an artifact is found in any channel.

    Raises:
        KeyError: if `mat` has no 'data' entry.
    """
    data = mat['data']
    features = {}
    for i, ch in enumerate(emg_cols):
        emg_filt = apply_bandpass(data[:, ch], fs, bandpass[0], bandpass[1])
        if artifact_rejection(emg_filt, artifact_threshold):
            return None  # artifact: skip this segment
        for k, v in extract_features(emg_filt, fs).items():
            features[f"{emg_labels[i]}_{k}"] = v
    return features

# Usage example (requires scipy.io.loadmat and a loaded .mat file):
# from scipy.io import loadmat
# mat = loadmat('bio_s/low_pain/S001/S001_0.mat')
# feats = extract_features_from_mat_file(mat, fs=1000)
# print(feats)

In [16]:
import os
import glob
import pandas as pd
from scipy.io import loadmat

# Make sure to import all functions from feature_extraction.py here or paste them above this code:
# - apply_bandpass
# - artifact_rejection
# - extract_features

def extract_features_from_dataset(
    data_root,
    pain_levels,
    emg_cols,
    emg_labels,
    fs=1000,
    bandpass=(20, 450),
    artifact_threshold=1000
):
    """
    Build the per-segment feature table for the whole dataset.

    Walks data_root/<pain folder>/<subject>/*.mat for every entry of
    `pain_levels` (folder name -> label), filters each selected channel,
    drops segments in which any channel is flagged by `artifact_rejection`,
    and collects the features of the remaining segments.

    Missing pain folders and non-directory entries are skipped. A file that
    raises while being loaded or processed is reported with a printed message
    and skipped.

    Returns a DataFrame with one row per segment.
    """
    def _segment_files():
        # Yields (subject, label, path) in the same order as nested loops would.
        for pain_folder, pain_label in pain_levels.items():
            level_dir = os.path.join(data_root, pain_folder)
            if not os.path.isdir(level_dir):
                continue
            for subject in sorted(os.listdir(level_dir)):
                subject_dir = os.path.join(level_dir, subject)
                if os.path.isdir(subject_dir):
                    for path in glob.glob(os.path.join(subject_dir, '*.mat')):
                        yield subject, pain_label, path

    records = []
    for subject, pain_label, mat_path in _segment_files():
        try:
            mat = loadmat(mat_path)
            record = {
                'subject': subject,
                'segment_file': os.path.basename(mat_path),
                'pain': pain_label
            }
            for i, ch in enumerate(emg_cols):
                filtered = apply_bandpass(mat['data'][:, ch], fs, bandpass[0], bandpass[1])
                if artifact_rejection(filtered, artifact_threshold):
                    # Artifact in one channel: discard the whole segment.
                    break
                record.update(
                    (f"{emg_labels[i]}_{k}", v)
                    for k, v in extract_features(filtered, fs).items()
                )
            else:
                # No channel triggered the break: keep the segment.
                records.append(record)
        except Exception as e:
            print(f"Error processing {mat_path}: {e}")

    return pd.DataFrame(records)

# Usage example:
# df_features = extract_features_from_dataset(
#     DATA_ROOT, PAIN_LEVELS, EMG_COLS, EMG_LABELS, FS, BANDPASS, ARTIFACT_THRESHOLD
# )
# print(df_features.shape)
# df_features.head()

In [17]:
import os
import glob
import pandas as pd
from scipy.io import loadmat

# Make sure to import all functions from feature_extraction.py here or paste them above this code:
# - apply_bandpass
# - artifact_rejection
# - extract_features

def extract_features_from_dataset(
    data_root,
    pain_levels,
    emg_cols,
    emg_labels,
    fs=1000,
    bandpass=(20, 450),
    artifact_threshold=1000
):
    """
    Loops through all subjects, segments, and pain levels in the dataset
    and extracts features for each valid segment.

    Parameters:
        data_root: folder containing one sub-folder per pain level.
        pain_levels: mapping of pain-level folder name -> label stored in the
            'pain' column.
        emg_cols: column indices of mat['data'] to process.
        emg_labels: column-name prefixes, index-aligned with `emg_cols`.
        fs: sampling rate passed to the filter and the feature extractor.
        bandpass: (low, high) corner frequencies for `apply_bandpass`.
        artifact_threshold: amplitude limit passed to `artifact_rejection`.

    Returns a DataFrame with one row per segment. Rows follow the iteration
    order of `pain_levels`, then sorted subject name, then sorted segment file
    name, so repeated runs produce the same row order.

    Pain-level folders that do not exist and entries that are not directories
    are skipped. A file that raises while being loaded or processed is
    reported with a printed message and skipped; the exception is not
    re-raised.
    """
    all_rows = []
    for pain_folder, pain_label in pain_levels.items():
        folder_path = os.path.join(data_root, pain_folder)
        if not os.path.isdir(folder_path):
            continue
        for subj in sorted(os.listdir(folder_path)):
            subj_folder = os.path.join(folder_path, subj)
            if not os.path.isdir(subj_folder):
                continue
            # glob makes no ordering promise; sort to match the sorted
            # subject loop and keep the table reproducible.
            for mat_path in sorted(glob.glob(os.path.join(subj_folder, '*.mat'))):
                try:
                    # Look the array up once instead of once per channel.
                    data = loadmat(mat_path)['data']
                    row = {
                        'subject': subj,
                        'segment_file': os.path.basename(mat_path),
                        'pain': pain_label
                    }
                    for i, ch in enumerate(emg_cols):
                        emg_filt = apply_bandpass(data[:, ch], fs, bandpass[0], bandpass[1])
                        if artifact_rejection(emg_filt, artifact_threshold):
                            # One bad channel discards the whole segment.
                            break
                        for k, v in extract_features(emg_filt, fs).items():
                            row[f"{emg_labels[i]}_{k}"] = v
                    else:
                        # Reached only when no channel triggered the break.
                        all_rows.append(row)
                except Exception as e:
                    # Deliberate best effort: report the file and keep going.
                    print(f"Error processing {mat_path}: {e}")

    return pd.DataFrame(all_rows)

# Usage example:
# df_features = extract_features_from_dataset(
#     DATA_ROOT, PAIN_LEVELS, EMG_COLS, EMG_LABELS, FS, BANDPASS, ARTIFACT_THRESHOLD
# )
# print(df_features.shape)
# df_features.head()

In [18]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.calibration import CalibratedClassifierCV

def run_leave_one_subject_out_rf(df_features, emg_labels=('corrugator', 'zygomaticus')):
    """
    Runs Random Forest with leave-one-subject-out cross-validation.

    For each fold one subject is held out. The scaler and the calibrated
    Random Forest are created and fitted on the training subjects of that fold
    only, so no statistics of the held-out subject reach the model.

    Parameters:
        df_features: DataFrame with 'subject' and 'pain' columns plus feature
            columns whose names contain one of `emg_labels`.
        emg_labels: substrings used to pick the feature columns.

    Returns:
        preds: predicted labels (float array holding 0/1)
        probs: predicted probabilities of the positive class ('PL_2')
        y_true: true labels (1 for 'PL_2', 0 otherwise)
        groups: subject IDs

    Prints the accuracy of every fold, then overall accuracy, ROC-AUC and the
    confusion matrix over all held-out predictions.
    """
    # Select features
    feature_cols = [c for c in df_features.columns if any(e in c for e in emg_labels)]
    X = df_features[feature_cols].values
    y = (df_features['pain'] == 'PL_2').astype(int).values  # 1 for medium pain, 0 for low pain
    groups = df_features['subject'].values

    # Leave-One-Group-Out (subject as group)
    logo = LeaveOneGroupOut()

    preds = np.zeros(len(y))
    probs = np.zeros(len(y))
    for fold, (train_idx, test_idx) in enumerate(logo.split(X, y, groups)):
        # Fit the scaler on the training subjects only, then apply the same
        # transform to the held-out subject (avoids train/test leakage).
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X[train_idx])
        X_test = scaler.transform(X[test_idx])

        # Fresh estimator per fold so no fitted state carries over.
        # NOTE(review): the inner cv=3 calibration split is not subject-aware;
        # segments of one training subject can land on both sides of it.
        rf = RandomForestClassifier(n_estimators=100, random_state=42)
        cal_rf = CalibratedClassifierCV(rf, method='isotonic', cv=3)
        cal_rf.fit(X_train, y[train_idx])

        preds[test_idx] = cal_rf.predict(X_test)
        probs[test_idx] = cal_rf.predict_proba(X_test)[:,1]
        print(f"Fold {fold+1}: Subject {groups[test_idx][0]} - Acc: {accuracy_score(y[test_idx], preds[test_idx]):.3f}")

    print("Overall Accuracy:", accuracy_score(y, preds))
    print("ROC-AUC:", roc_auc_score(y, probs))
    print("Confusion Matrix:\n", confusion_matrix(y, preds))

    return preds, probs, y, groups

# Example usage:
# preds, probs, y_true, groups = run_leave_one_subject_out_rf(df_features)