In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# -----------------------------------------------------------
# 1. Load TRAIN and TEST datasets
# -----------------------------------------------------------
train_path = "Classification_Combined_Data/S1_S2_train_data.csv"
test_path  = "Classification_Combined_Data/S1_S2_test_data.csv"

df_train1 = pd.read_csv(train_path)
#df_test  = pd.read_csv(test_path)

# train only on ID = 5.0 fist 80%
df_train = df_train1[df_train1["ID"] == 8.0].sample(frac=0.8, random_state=42)
#test is last 20% of train
df_test = df_train1[df_train1["ID"] == 8.0].drop(df_train.index)

# # train only on ID = 5.0 fist 80%
# df_train = df_train1[df_train1["ID"] == 11.0].sample(frac=0.8, random_state=42)
# #test is last 20% of train
# df_test = df_train1[df_train1["ID"] == 11.0].drop(df_train.index)

#for both train and test, only rows where labsl is Not Drowsy or Slight
df_train = df_train[df_train["Label"].isin(["Not Drowsy", "Slight", "Moderate", "Very"])]
df_test = df_test[df_test["Label"].isin(["Not Drowsy", "Slight", "Moderate", "Very"])]
# -----------------------------------------------------------
# 2. Apply label mapping to both
# -----------------------------------------------------------
label_map = {
    'Not Drowsy': 'alert',
    'Slight': 'drowsy',
    'Moderate': 'drowsy',
    'Very': 'drowsy'
}

df_train["MappedLabel"] = df_train["Label"].map(label_map)
df_test["MappedLabel"]  = df_test["Label"].map(label_map)

# -----------------------------------------------------------
# 3. Encode target labels (alert=0, drowsy=1)
# -----------------------------------------------------------
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train["MappedLabel"])
y_test  = label_encoder.transform(df_test["MappedLabel"])

# -----------------------------------------------------------
# 4. Select numeric features
# -----------------------------------------------------------
exclude_cols = ["Label", "MappedLabel", "ID", "Study"]
feature_cols = [c for c in df_train.columns if c not in exclude_cols]

X_train = df_train[feature_cols]
X_test  = df_test[feature_cols]

# -----------------------------------------------------------
# 5. Scale features (fit on train, transform on test)
# -----------------------------------------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

# -----------------------------------------------------------
# 6. Fit unsupervised 2-component GMM on TRAIN DATA ONLY
# -----------------------------------------------------------
gmm = GaussianMixture(
    n_components=2,
    covariance_type='full',
    random_state=42
)
gmm.fit(X_train_scaled)

# -----------------------------------------------------------
# 7. Predict cluster labels on TRAIN (for alignment)
# -----------------------------------------------------------
train_cluster_labels = gmm.predict(X_train_scaled)

# -----------------------------------------------------------
# 8. Align cluster IDs to true labels using TRAIN accuracy
# -----------------------------------------------------------
acc0 = accuracy_score(y_train, train_cluster_labels)
acc1 = accuracy_score(y_train, 1 - train_cluster_labels)

# cluster → label mapping
if acc1 > acc0:
    cluster_to_label = lambda c: 1 - c
else:
    cluster_to_label = lambda c: c

# -----------------------------------------------------------
# 9. Predict on TEST
# -----------------------------------------------------------
test_clusters = gmm.predict(X_test_scaled)
test_preds = cluster_to_label(test_clusters)

# -----------------------------------------------------------
# 10. Evaluate TEST accuracy
# -----------------------------------------------------------
print("=== TEST SET RESULTS ===")
print("Accuracy:", accuracy_score(y_test, test_preds))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, test_preds))
print("\nClassification Report:")
print(classification_report(y_test, test_preds, target_names=label_encoder.classes_))

# -----------------------------------------------------------
# 11. Posterior probabilities on TEST set
# -----------------------------------------------------------
epsilon = 0.005 # to avoid exact 0 or 1 probabilities
probs_test = gmm.predict_proba(X_test_scaled)
# probs_test = np.clip(probs_test, epsilon, 1 - epsilon)
# probs_test = gmm.predict_proba(X_test_scaled)

df_test["GMM_prob_alert"] = probs_test[:, 0]
df_test["GMM_prob_drowsy"] = probs_test[:, 1]
df_test["GMM_pred_cluster"] = test_clusters
df_test["GMM_pred_label"] = label_encoder.inverse_transform(test_preds)

df_test.head(n=50)

=== TEST SET RESULTS ===
Accuracy: 0.6857142857142857

Confusion Matrix:
[[ 9  0]
 [22 39]]

Classification Report:
              precision    recall  f1-score   support

       alert       0.29      1.00      0.45         9
      drowsy       1.00      0.64      0.78        61

    accuracy                           0.69        70
   macro avg       0.65      0.82      0.61        70
weighted avg       0.91      0.69      0.74        70



Unnamed: 0,window_start,ID,Study,Label,EAR_mean_mean,MAR_inner_mean,MAR_outer_mean,AU01_r_mean,AU15_r_mean,AU25_r_mean,...,gaze_angle_y_std,swAngle_std,laneDevPosition_std,laneDev_OffsetfrmLaneCentre_std,speed_std,MappedLabel,GMM_prob_alert,GMM_prob_drowsy,GMM_pred_cluster,GMM_pred_label
8192,1638303000.0,8.0,S1,Not Drowsy,0.266397,0.011886,0.292544,0.110267,0.097711,0.223156,...,0.026392,0.974891,0.0,0.661129,0.563935,alert,1.0,7.532607e-24,0,alert
8204,1638303000.0,8.0,S1,Not Drowsy,0.266797,0.015164,0.303821,0.071756,0.105878,0.253522,...,0.036921,0.589138,1.208826,0.491517,1.510266,alert,1.0,1.108593e-20,0,alert
8211,1638303000.0,8.0,S1,Not Drowsy,0.270684,0.019598,0.299074,0.016667,0.003444,0.125967,...,0.022225,0.66887,0.0,0.996734,0.093211,alert,1.0,8.565484e-21,0,alert
8212,1638303000.0,8.0,S1,Not Drowsy,0.267413,0.022363,0.301796,0.033222,0.007689,0.141178,...,0.024176,1.045369,0.0,0.785328,0.113196,alert,1.0,8.62674e-15,0,alert
8225,1638303000.0,8.0,S1,Not Drowsy,0.267375,0.015742,0.290849,0.003122,0.038733,0.100367,...,0.032088,0.415481,0.0,0.480843,1.001029,alert,1.0,4.608877e-32,0,alert
8234,1638303000.0,8.0,S1,Not Drowsy,0.265671,0.01604,0.296774,0.007922,0.033433,0.044911,...,0.036918,1.653659,0.0,0.923296,0.878322,alert,1.0,1.0275379999999999e-19,0,alert
8239,1638303000.0,8.0,S1,Not Drowsy,0.264889,0.018619,0.285581,0.029222,0.036878,0.134756,...,0.031547,2.635597,0.0,1.033407,0.157191,alert,0.9999999,9.663673e-08,0,alert
8240,1638303000.0,8.0,S1,Not Drowsy,0.263228,0.023281,0.297947,0.011211,0.021889,0.157578,...,0.032007,1.808569,0.0,0.639711,0.089769,alert,1.0,3.23028e-09,0,alert
8241,1638303000.0,8.0,S1,Not Drowsy,0.268577,0.022861,0.307031,0.019856,0.017378,0.278133,...,0.032071,0.874246,0.0,0.501827,0.290161,alert,0.9999906,9.351701e-06,0,alert
8243,1638303000.0,8.0,S1,Slight,0.259239,0.025918,0.302042,0.048833,0.092789,0.200856,...,0.038108,0.901499,0.0,0.450414,0.201679,drowsy,1.0,2.421475e-13,0,alert


In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.mixture import GaussianMixture
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE

# ===========================================
# 1. LOAD TRAIN + TEST DATA
# ===========================================
train_path = "Classification_Combined_Data/S1_S2_train_data.csv"
test_path  = "Classification_Combined_Data/S1_S2_test_data.csv"

df_train1 = pd.read_csv(train_path)
#df_test  = pd.read_csv(test_path)

# # train only on ID = 5.0 fist 80%
# df_train = df_train1[df_train1["ID"] == 11.0].sample(frac=0.8, random_state=42)
# #test is last 20% of train
# df_test = df_train1[df_train1["ID"] ==11.0].drop(df_train.index)

# train only on ID = 5.0 fist 80%
df_train = df_train1[df_train1["ID"] == 8.0].sample(frac=0.8, random_state=42)
#test is last 20% of train
df_test = df_train1[df_train1["ID"] == 8.0].drop(df_train.index)

#for both train and test, only rows where labsl is Not Drowsy or Slight
df_train = df_train[df_train["Label"].isin(["Not Drowsy", "Slight", "Moderate", "Very"])]
df_test = df_test[df_test["Label"].isin(["Not Drowsy", "Slight", "Moderate", "Very"])]

# ===========================================
# 2. APPLY LABEL MAPPING
# ===========================================
label_map = {
    'Not Drowsy': 'alert',
    'Slight': 'drowsy',
    'Moderate': 'drowsy',
    'Very': 'drowsy'
}

df_train["MappedLabel"] = df_train["Label"].map(label_map)
df_test["MappedLabel"]  = df_test["Label"].map(label_map)

# ===========================================
# 3. ENCODE LABELS (alert=0, drowsy=1)
# ===========================================
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train["MappedLabel"])
y_test  = label_encoder.transform(df_test["MappedLabel"])

# ===========================================
# 4. SELECT NUMERIC FEATURE COLUMNS
# ===========================================
exclude_cols = ["Label", "MappedLabel", "ID", "Study"]
feature_cols = [col for col in df_train.columns if col not in exclude_cols]

X_train = df_train[feature_cols]
X_test  = df_test[feature_cols]

# ===========================================
# 5. STANDARDIZE FEATURES (fit on train ONLY)
# ===========================================
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

# ===========================================
# 6. SMOTE on the TRAIN SET only
# ===========================================
sm = SMOTE(random_state=42)
X_train_bal, y_train_bal = sm.fit_resample(X_train_scaled, y_train)

print("Training balance after SMOTE:")
print(pd.Series(y_train_bal).value_counts())

# ===========================================
# 7. TRAIN SEPARATE GMMs (SUPERVISED)
#    One GMM for each class
# ===========================================
X_train_alert   = X_train_bal[y_train_bal == 0]
X_train_drowsy  = X_train_bal[y_train_bal == 1]

gmm_alert = GaussianMixture(
    n_components=2,
    covariance_type='full',
    random_state=42
)
gmm_drowsy = GaussianMixture(
    n_components=2,
    covariance_type='full',
    random_state=42
)

gmm_alert.fit(X_train_alert)
gmm_drowsy.fit(X_train_drowsy)

# ===========================================
# 8. CLASSIFICATION USING BAYES RULE
#    p(x | class) * P(class)
# ===========================================
# class priors from balanced training set
prior_alert  = (y_train_bal == 0).mean()
prior_drowsy = (y_train_bal == 1).mean()

# likelihoods from GMM
log_lik_alert  = gmm_alert.score_samples(X_test_scaled)
log_lik_drowsy = gmm_drowsy.score_samples(X_test_scaled)

# convert log-likelihoods + priors to posterior probabilities
log_posterior_alert  = log_lik_alert  + np.log(prior_alert)
log_posterior_drowsy = log_lik_drowsy + np.log(prior_drowsy)

# prediction: choose class with larger posterior
y_pred = np.where(log_posterior_alert > log_posterior_drowsy, 0, 1)

# ===========================================
# 9. EVALUATION
# ===========================================
print("\n=== TEST SET RESULTS ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# ===========================================
# 10. SAVE PROBABILITIES & PREDICTIONS
# ===========================================
# convert log posterior to normalized probabilities
posterior_alert = np.exp(log_posterior_alert)
posterior_drowsy = np.exp(log_posterior_drowsy)
posterior_sum = posterior_alert + posterior_drowsy

df_test["GMM_prob_alert"] = posterior_alert / posterior_sum
df_test["GMM_prob_drowsy"] = posterior_drowsy / posterior_sum
df_test["GMM_pred"] = y_pred
df_test["GMM_pred_label"] = label_encoder.inverse_transform(y_pred)

df_test.head()

#display confusion matrix
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

y_pred = (model.predict(X_test) > 0.5).astype(int)
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Alert", "Drowsy"])
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.show()

Training balance after SMOTE:
1    237
0    237
Name: count, dtype: int64

=== TEST SET RESULTS ===
Accuracy: 0.8714285714285714

Confusion Matrix:
[[ 0  9]
 [ 0 61]]

Classification Report:
              precision    recall  f1-score   support

       alert       0.00      0.00      0.00         9
      drowsy       0.87      1.00      0.93        61

    accuracy                           0.87        70
   macro avg       0.44      0.50      0.47        70
weighted avg       0.76      0.87      0.81        70



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


NameError: name 'model' is not defined

In [None]:
# Keep classes separate for gmm
# alert vs slightly, or slightly vs moderate, or even alert vs moderate

# try changing parameters of gmm for sliding window approach
# maybe per minute change in consecutive labels
# try doing it by participant instead of all participants together
# state transitions from alert to slightly, or slightly to moderate
# See if there are newer gmm/hmm or dbscan approaches for better results