In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# -----------------------------------------------------------
# 1. Load TRAIN and TEST datasets
# -----------------------------------------------------------
train_path = "Classification_Combined_Data/S1_S2_train_data.csv"
test_path  = "Classification_Combined_Data/S1_S2_test_data.csv"

df_train = pd.read_csv(train_path)
df_test  = pd.read_csv(test_path)

# -----------------------------------------------------------
# 2. Apply label mapping to both
# -----------------------------------------------------------
label_map = {
    'Not Drowsy': 'alert',
    'Slight': 'drowsy',
    'Moderate': 'drowsy',
    'Very': 'drowsy'
}

df_train["MappedLabel"] = df_train["Label"].map(label_map)
df_test["MappedLabel"]  = df_test["Label"].map(label_map)

# -----------------------------------------------------------
# 3. Encode target labels (alert=0, drowsy=1)
# -----------------------------------------------------------
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train["MappedLabel"])
y_test  = label_encoder.transform(df_test["MappedLabel"])

# -----------------------------------------------------------
# 4. Select numeric features
# -----------------------------------------------------------
exclude_cols = ["Label", "MappedLabel", "ID", "Study"]
feature_cols = [c for c in df_train.columns if c not in exclude_cols]

X_train = df_train[feature_cols]
X_test  = df_test[feature_cols]

# -----------------------------------------------------------
# 5. Scale features (fit on train, transform on test)
# -----------------------------------------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

# -----------------------------------------------------------
# 6. Fit unsupervised 2-component GMM on TRAIN DATA ONLY
# -----------------------------------------------------------
gmm = GaussianMixture(
    n_components=2,
    covariance_type='full',
    random_state=42
)
gmm.fit(X_train_scaled)

# -----------------------------------------------------------
# 7. Predict cluster labels on TRAIN (for alignment)
# -----------------------------------------------------------
train_cluster_labels = gmm.predict(X_train_scaled)

# -----------------------------------------------------------
# 8. Align cluster IDs to true labels using TRAIN accuracy
# -----------------------------------------------------------
acc0 = accuracy_score(y_train, train_cluster_labels)
acc1 = accuracy_score(y_train, 1 - train_cluster_labels)

# cluster → label mapping
if acc1 > acc0:
    cluster_to_label = lambda c: 1 - c
else:
    cluster_to_label = lambda c: c

# -----------------------------------------------------------
# 9. Predict on TEST
# -----------------------------------------------------------
test_clusters = gmm.predict(X_test_scaled)
test_preds = cluster_to_label(test_clusters)

# -----------------------------------------------------------
# 10. Evaluate TEST accuracy
# -----------------------------------------------------------
print("=== TEST SET RESULTS ===")
print("Accuracy:", accuracy_score(y_test, test_preds))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, test_preds))
print("\nClassification Report:")
print(classification_report(y_test, test_preds, target_names=label_encoder.classes_))

# -----------------------------------------------------------
# 11. Posterior probabilities on TEST set
# -----------------------------------------------------------
probs_test = gmm.predict_proba(X_test_scaled)

df_test["GMM_prob_alert"] = probs_test[:, 0]
df_test["GMM_prob_drowsy"] = probs_test[:, 1]
df_test["GMM_pred_cluster"] = test_clusters
df_test["GMM_pred_label"] = label_encoder.inverse_transform(test_preds)

df_test.head()

=== TEST SET RESULTS ===
Accuracy: 0.5046683046683047

Confusion Matrix:
[[372 163]
 [845 655]]

Classification Report:
              precision    recall  f1-score   support

       alert       0.31      0.70      0.42       535
      drowsy       0.80      0.44      0.57      1500

    accuracy                           0.50      2035
   macro avg       0.55      0.57      0.49      2035
weighted avg       0.67      0.50      0.53      2035



Unnamed: 0,window_start,ID,Study,Label,EAR_mean_mean,MAR_inner_mean,MAR_outer_mean,AU01_r_mean,AU15_r_mean,AU25_r_mean,...,gaze_angle_y_std,swAngle_std,laneDevPosition_std,laneDev_OffsetfrmLaneCentre_std,speed_std,MappedLabel,GMM_prob_alert,GMM_prob_drowsy,GMM_pred_cluster,GMM_pred_label
0,1638561000.0,10.0,S1,Not Drowsy,0.280226,0.020549,0.303724,0.077756,0.133311,0.223478,...,0.038516,1.446996,0.0,0.697119,2.509008,alert,1.82497e-11,1.0,1,drowsy
1,1638561000.0,10.0,S1,Not Drowsy,0.275627,0.016681,0.298697,0.135278,0.115778,0.293422,...,0.049447,1.021389,0.0,1.1149,3.21946,alert,1.071764e-08,1.0,1,drowsy
2,1638561000.0,10.0,S1,Not Drowsy,0.277547,0.013587,0.298186,0.104289,0.105111,0.266167,...,0.045153,1.907755,0.0,1.670019,3.594871,alert,1.096096e-06,0.999999,1,drowsy
3,1638561000.0,10.0,S1,Not Drowsy,0.283759,0.012794,0.297106,0.075489,0.132756,0.258267,...,0.031922,1.634922,0.0,1.563995,2.562208,alert,1.181323e-05,0.999988,1,drowsy
4,1638561000.0,10.0,S1,Not Drowsy,0.2844,0.010559,0.292257,0.086489,0.105122,0.274722,...,0.03304,0.698894,0.0,0.817669,3.651178,alert,7.423996e-05,0.999926,1,drowsy


In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.mixture import GaussianMixture
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE

# ===========================================
# 1. LOAD TRAIN + TEST DATA
# ===========================================
train_path = "Classification_Combined_Data/S1_S2_train_data.csv"
test_path  = "Classification_Combined_Data/S1_S2_test_data.csv"

df_train = pd.read_csv(train_path)
df_test  = pd.read_csv(test_path)

# ===========================================
# 2. APPLY LABEL MAPPING
# ===========================================
label_map = {
    'Not Drowsy': 'alert',
    'Slight': 'drowsy',
    'Moderate': 'drowsy',
    'Very': 'drowsy'
}

df_train["MappedLabel"] = df_train["Label"].map(label_map)
df_test["MappedLabel"]  = df_test["Label"].map(label_map)

# ===========================================
# 3. ENCODE LABELS (alert=0, drowsy=1)
# ===========================================
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train["MappedLabel"])
y_test  = label_encoder.transform(df_test["MappedLabel"])

# ===========================================
# 4. SELECT NUMERIC FEATURE COLUMNS
# ===========================================
exclude_cols = ["Label", "MappedLabel", "ID", "Study"]
feature_cols = [col for col in df_train.columns if col not in exclude_cols]

X_train = df_train[feature_cols]
X_test  = df_test[feature_cols]

# ===========================================
# 5. STANDARDIZE FEATURES (fit on train ONLY)
# ===========================================
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

# ===========================================
# 6. SMOTE on the TRAIN SET only
# ===========================================
sm = SMOTE(random_state=42)
X_train_bal, y_train_bal = sm.fit_resample(X_train_scaled, y_train)

print("Training balance after SMOTE:")
print(pd.Series(y_train_bal).value_counts())

# ===========================================
# 7. TRAIN SEPARATE GMMs (SUPERVISED)
#    One GMM for each class
# ===========================================
X_train_alert   = X_train_bal[y_train_bal == 0]
X_train_drowsy  = X_train_bal[y_train_bal == 1]

gmm_alert = GaussianMixture(
    n_components=2,
    covariance_type='full',
    random_state=42
)
gmm_drowsy = GaussianMixture(
    n_components=2,
    covariance_type='full',
    random_state=42
)

gmm_alert.fit(X_train_alert)
gmm_drowsy.fit(X_train_drowsy)

# ===========================================
# 8. CLASSIFICATION USING BAYES RULE
#    p(x | class) * P(class)
# ===========================================
# class priors from balanced training set
prior_alert  = (y_train_bal == 0).mean()
prior_drowsy = (y_train_bal == 1).mean()

# likelihoods from GMM
log_lik_alert  = gmm_alert.score_samples(X_test_scaled)
log_lik_drowsy = gmm_drowsy.score_samples(X_test_scaled)

# convert log-likelihoods + priors to posterior probabilities
log_posterior_alert  = log_lik_alert  + np.log(prior_alert)
log_posterior_drowsy = log_lik_drowsy + np.log(prior_drowsy)

# prediction: choose class with larger posterior
y_pred = np.where(log_posterior_alert > log_posterior_drowsy, 0, 1)

# ===========================================
# 9. EVALUATION
# ===========================================
print("\n=== TEST SET RESULTS ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# ===========================================
# 10. SAVE PROBABILITIES & PREDICTIONS
# ===========================================
# convert log posterior to normalized probabilities
posterior_alert = np.exp(log_posterior_alert)
posterior_drowsy = np.exp(log_posterior_drowsy)
posterior_sum = posterior_alert + posterior_drowsy

df_test["GMM_prob_alert"] = posterior_alert / posterior_sum
df_test["GMM_prob_drowsy"] = posterior_drowsy / posterior_sum
df_test["GMM_pred"] = y_pred
df_test["GMM_pred_label"] = label_encoder.inverse_transform(y_pred)

df_test.head()

Training balance after SMOTE:
0    7316
1    7316
Name: count, dtype: int64

=== TEST SET RESULTS ===
Accuracy: 0.7562653562653563

Confusion Matrix:
[[  89  446]
 [  50 1450]]

Classification Report:
              precision    recall  f1-score   support

       alert       0.64      0.17      0.26       535
      drowsy       0.76      0.97      0.85      1500

    accuracy                           0.76      2035
   macro avg       0.70      0.57      0.56      2035
weighted avg       0.73      0.76      0.70      2035



Unnamed: 0,window_start,ID,Study,Label,EAR_mean_mean,MAR_inner_mean,MAR_outer_mean,AU01_r_mean,AU15_r_mean,AU25_r_mean,...,gaze_angle_y_std,swAngle_std,laneDevPosition_std,laneDev_OffsetfrmLaneCentre_std,speed_std,MappedLabel,GMM_prob_alert,GMM_prob_drowsy,GMM_pred,GMM_pred_label
0,1638561000.0,10.0,S1,Not Drowsy,0.280226,0.020549,0.303724,0.077756,0.133311,0.223478,...,0.038516,1.446996,0.0,0.697119,2.509008,alert,2.483629e-08,1.0,1,drowsy
1,1638561000.0,10.0,S1,Not Drowsy,0.275627,0.016681,0.298697,0.135278,0.115778,0.293422,...,0.049447,1.021389,0.0,1.1149,3.21946,alert,1.594678e-08,1.0,1,drowsy
2,1638561000.0,10.0,S1,Not Drowsy,0.277547,0.013587,0.298186,0.104289,0.105111,0.266167,...,0.045153,1.907755,0.0,1.670019,3.594871,alert,3.57702e-13,1.0,1,drowsy
3,1638561000.0,10.0,S1,Not Drowsy,0.283759,0.012794,0.297106,0.075489,0.132756,0.258267,...,0.031922,1.634922,0.0,1.563995,2.562208,alert,4.691436e-12,1.0,1,drowsy
4,1638561000.0,10.0,S1,Not Drowsy,0.2844,0.010559,0.292257,0.086489,0.105122,0.274722,...,0.03304,0.698894,0.0,0.817669,3.651178,alert,2.186152e-05,0.999978,1,drowsy
