# Binary

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from scipy.optimize import linear_sum_assignment

# -----------------------------
# CONFIG
# -----------------------------
# Seed passed to every GaussianMixture fitted below.
RANDOM_STATE = 42

# Hyper-parameter grid: number of mixture components x covariance structure.
COVTYPE_GRID = ["full", "tied", "diag", "spherical"]
COMPONENT_GRID = [2, 3, 4, 6, 8, 10]

# Input CSVs (train / test split prepared beforehand).
test_path = "Classification_Combined_Data/S1_S2_test_data.csv"
train_path = "Classification_Combined_Data/S1_S2_train_data.csv"

# Collapse the four raw ratings into a binary target:
# "Not Drowsy" -> alert, every other rating -> drowsy.
label_map = {"Not Drowsy": "alert"}
label_map.update(dict.fromkeys(["Slight", "Moderate", "Very"], "drowsy"))

# -----------------------------
# 1) Load
# -----------------------------
# Rows whose "Label" is not one of these four ratings are dropped.
keep = ["Not Drowsy", "Slight", "Moderate", "Very"]

df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

# .copy() detaches the filtered frames so adding columns below is safe.
df_train = df_train.loc[df_train["Label"].isin(keep)].copy()
df_test = df_test.loc[df_test["Label"].isin(keep)].copy()

# Binary target derived from the raw rating.
df_train["MappedLabel"] = df_train["Label"].map(label_map)
df_test["MappedLabel"] = df_test["Label"].map(label_map)

# -----------------------------
# 2) Encode labels (EVAL ONLY)
# -----------------------------
# The encoder learns its classes from the training labels and is then
# reused, unchanged, for the test labels.
le = LabelEncoder().fit(df_train["MappedLabel"])
y_train = le.transform(df_train["MappedLabel"])
y_test = le.transform(df_test["MappedLabel"])

# -----------------------------
# 3) Features
# -----------------------------
# Every column that is not a label or an identifier/bookkeeping field
# is used as a feature; the column list is taken from the training frame.
exclude_cols = ["Label", "MappedLabel", "ID", "Study", "window_start"]
feature_cols = df_train.columns[~df_train.columns.isin(exclude_cols)].tolist()

X_train = df_train.loc[:, feature_cols].to_numpy()
X_test = df_test.loc[:, feature_cols].to_numpy()

# -----------------------------
# 4) Scale (fit on train only)
# -----------------------------
# Scaling statistics come from the training matrix only; the test matrix
# is transformed with those same statistics.
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

# -----------------------------
# 5) Manual grid search (rank by BIC, show BIC/AIC for each)
# -----------------------------
results = []  # one dict per (K, cov_type) configuration, failed fits included
best = None  # (bic, aic, K, cov_type, fitted_model)

for K in COMPONENT_GRID:
    for cov_type in COVTYPE_GRID:
        row = {"K": K, "cov_type": cov_type}
        try:
            candidate = GaussianMixture(
                n_components=K,
                covariance_type=cov_type,
                n_init=20,
                init_params="kmeans",
                max_iter=1000,
                tol=1e-5,
                reg_covar=1e-5,
                random_state=RANDOM_STATE,
            ).fit(X_train_s)

            # Both criteria are computed on the (scaled) training data.
            bic = candidate.bic(X_train_s)
            aic = candidate.aic(X_train_s)
        except Exception as e:
            # Best-effort grid: a failing configuration is recorded with NaN
            # scores and its error message, then the search continues.
            row.update({"BIC": np.nan, "AIC": np.nan, "error": str(e)})
            results.append(row)
            continue

        row.update({"BIC": bic, "AIC": aic})
        results.append(row)

        # Lower BIC wins; an exact BIC tie is broken by lower AIC.
        is_better = (
            best is None
            or bic < best[0]
            or (bic == best[0] and aic < best[1])
        )
        if is_better:
            best = (bic, aic, K, cov_type, candidate)

df_results = pd.DataFrame(results).sort_values(by=["BIC", "AIC"])
print("=== GRID RESULTS (ranked by BIC then AIC) ===")
display(df_results)

# -----------------------------
# 6) Evaluate best model only (unsupervised fit + train-only mapping)
# -----------------------------
# `best` is still None when every grid configuration raised during fitting.
# Fail with an explicit message instead of an opaque tuple-unpacking TypeError.
if best is None:
    raise RuntimeError(
        "No GaussianMixture configuration could be fitted; "
        "inspect the 'error' column of df_results."
    )

best_bic, best_aic, best_K, best_cov, best_gmm = best
print("\n=== BEST MODEL ===")
print(f"K={best_K}, covariance_type={best_cov}, BIC={best_bic:.2f}, AIC={best_aic:.2f}")

# Hard cluster assignments from the selected mixture.
train_clusters = best_gmm.predict(X_train_s)
test_clusters = best_gmm.predict(X_test_s)

# Contingency table built from TRAIN data only:
# counts[k, j] = number of training rows in cluster k with encoded label j.
n_labels = len(le.classes_)
counts = np.zeros((best_K, n_labels), dtype=int)
# np.add.at accumulates correctly when the same (cluster, label) pair repeats.
np.add.at(counts, (train_clusters, y_train), 1)

# Hungarian assignment for the one-to-one part. linear_sum_assignment
# minimises total cost, so subtracting from the maximum makes it maximise the
# matched counts. On a rectangular table it pairs only min(best_K, n_labels)
# clusters, each with a distinct label.
# NOTE(review): this step optimises the total matched count, so a paired
# cluster may receive a label that is not its own majority label - confirm
# this is the intended mapping when best_K > n_labels.
cost = counts.max() - counts
row_ind, col_ind = linear_sum_assignment(cost)
cluster_to_label = {int(r): int(c) for r, c in zip(row_ind, col_ind)}

# If K > n_labels, map leftover clusters to majority label within that cluster
unassigned = set(range(best_K)) - set(cluster_to_label)
for cluster in unassigned:
    if counts[cluster].sum() == 0:
        # Cluster received no training rows: fall back to the overall
        # most frequent training label.
        cluster_to_label[cluster] = int(np.bincount(y_train).argmax())
    else:
        cluster_to_label[cluster] = int(counts[cluster].argmax())

y_pred_test = np.array([cluster_to_label[c] for c in test_clusters])

# Pin the label set so the confusion matrix keeps a fixed shape and the
# report stays aligned with `target_names` even if a class never occurs in
# y_test or in the predictions.
all_labels = np.arange(n_labels)

print("\n--- TEST RESULTS (best model only) ---")
print("Accuracy:", accuracy_score(y_test, y_pred_test))
print("Macro F1:", f1_score(y_test, y_pred_test, average="macro"))
print("Weighted F1:", f1_score(y_test, y_pred_test, average="weighted"))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_test, labels=all_labels))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_test, labels=all_labels, target_names=le.classes_))

# -----------------------------
# 7) Optional: write per-cluster posteriors for best model
# -----------------------------
# Per-row posterior probability of each mixture component.
probs_test = best_gmm.predict_proba(X_test_s)  # (n_test, best_K)

# assign() returns a new frame, so df_test itself is left untouched.
df_out = df_test.assign(
    GMM_cluster=test_clusters,
    GMM_pred_label=le.inverse_transform(y_pred_test),
)

# One output column per component, taken column-by-column from probs_test.
for k, component_probs in enumerate(probs_test.T):
    df_out[f"GMM_prob_cluster_{k}"] = component_probs

df_out.head(20)

=== GRID RESULTS (ranked by BIC then AIC) ===


Unnamed: 0,K,cov_type,BIC,AIC
20,10,full,347820.620585,286702.772082
16,8,full,358965.631263,310072.77232
12,6,full,380624.546353,343956.67697
8,4,full,427495.785826,403052.906003
4,3,full,453953.376295,435622.991252
22,10,diag,534294.472206,528551.140873
18,8,diag,549933.549078,545340.303871
0,2,full,567217.019027,554999.128764
14,6,diag,582528.165936,579085.006856
10,4,diag,652363.46166,650070.388706



=== BEST MODEL ===
K=10, covariance_type=full, BIC=347820.62, AIC=286702.77

--- TEST RESULTS (best model only) ---
Accuracy: 0.7331695331695331
Macro F1: 0.428355704958384
Weighted F1: 0.6263000819595678

Confusion Matrix:
[[   3  532]
 [  11 1489]]

Classification Report:
              precision    recall  f1-score   support

       alert       0.21      0.01      0.01       535
      drowsy       0.74      0.99      0.85      1500

    accuracy                           0.73      2035
   macro avg       0.48      0.50      0.43      2035
weighted avg       0.60      0.73      0.63      2035



Unnamed: 0,window_start,ID,Study,Label,EAR_mean_mean,MAR_inner_mean,MAR_outer_mean,AU01_r_mean,AU15_r_mean,AU25_r_mean,...,GMM_prob_cluster_0,GMM_prob_cluster_1,GMM_prob_cluster_2,GMM_prob_cluster_3,GMM_prob_cluster_4,GMM_prob_cluster_5,GMM_prob_cluster_6,GMM_prob_cluster_7,GMM_prob_cluster_8,GMM_prob_cluster_9
0,1638561000.0,10.0,S1,Not Drowsy,0.280226,0.020549,0.303724,0.077756,0.133311,0.223478,...,3.5594270000000004e-22,0.999991,1.789729e-08,0.0,9.633363e-08,5.998171e-06,2.805346e-06,0.0,0.0,2.653694e-45
1,1638561000.0,10.0,S1,Not Drowsy,0.275627,0.016681,0.298697,0.135278,0.115778,0.293422,...,1.080773e-13,0.937194,0.001463136,0.0,0.05345555,3.75695e-05,0.007849603,0.0,0.0,5.385534e-40
2,1638561000.0,10.0,S1,Not Drowsy,0.277547,0.013587,0.298186,0.104289,0.105111,0.266167,...,5.317304e-15,0.040253,0.9557639,0.0,1.720303e-06,4.474707e-06,0.003976518,0.0,0.0,6.690503e-36
3,1638561000.0,10.0,S1,Not Drowsy,0.283759,0.012794,0.297106,0.075489,0.132756,0.258267,...,1.386825e-18,0.001189,0.9791121,0.0,1.176011e-05,2.468798e-06,0.01968511,0.0,0.0,4.162456e-35
4,1638561000.0,10.0,S1,Not Drowsy,0.2844,0.010559,0.292257,0.086489,0.105122,0.274722,...,1.435943e-21,2e-06,0.9815815,0.0,0.0001018605,0.0008595108,0.01745483,0.0,0.0,6.778006e-37
5,1638561000.0,10.0,S1,Not Drowsy,0.290036,0.011303,0.288014,0.146106,0.102792,0.204294,...,1.202571e-19,0.01079,0.2091111,0.0,0.000257007,0.7061261,0.07371536,0.0,0.0,9.38588e-40
6,1638561000.0,10.0,S1,Not Drowsy,0.287672,0.011005,0.288053,0.134294,0.113403,0.119028,...,9.349807e-24,0.973492,0.0002086634,0.0,2.757593e-07,0.02628286,1.665775e-05,0.0,0.0,1.722418e-43
7,1638561000.0,10.0,S1,Not Drowsy,0.280214,0.009513,0.289294,0.028344,0.069233,0.057411,...,2.6490969999999998e-24,0.049453,0.0004225029,0.0,1.320074e-09,0.8942392,0.05588522,0.0,0.0,4.641545e-40
8,1638561000.0,10.0,S1,Not Drowsy,0.278645,0.011739,0.291496,0.071678,0.139056,0.056378,...,1.704421e-20,0.995259,1.759816e-06,0.0,1.264829e-10,0.004621612,0.000117404,0.0,0.0,1.574802e-42
9,1638561000.0,10.0,S1,Not Drowsy,0.278539,0.014288,0.292226,0.074922,0.178344,0.130789,...,0.303541,0.0,0.0153715,0.0,0.0,0.0,0.6810875,0.0,0.0,1.733167e-39


In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from scipy.optimize import linear_sum_assignment

# -----------------------------
# CONFIG
# -----------------------------
# Seed passed to every GaussianMixture fitted below.
RANDOM_STATE = 42

# Hyper-parameter grid: components per class-model x covariance structure.
COVTYPE_GRID = ["full", "tied", "diag", "spherical"]
COMPONENT_GRID = [2, 3, 4, 6, 8, 10]

# Input CSVs (train / test split prepared beforehand).
test_path = "Classification_Combined_Data/S1_S2_test_data.csv"
train_path = "Classification_Combined_Data/S1_S2_train_data.csv"

# Collapse the four raw ratings into a binary target:
# "Not Drowsy" -> alert, every other rating -> drowsy.
label_map = {"Not Drowsy": "alert"}
label_map.update(dict.fromkeys(["Slight", "Moderate", "Very"], "drowsy"))

# -----------------------------
# 1) Load
# -----------------------------
# Rows whose "Label" is not one of these four ratings are dropped.
keep = ["Not Drowsy", "Slight", "Moderate", "Very"]

df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

# .copy() detaches the filtered frames so adding columns below is safe.
df_train = df_train.loc[df_train["Label"].isin(keep)].copy()
df_test = df_test.loc[df_test["Label"].isin(keep)].copy()

# Binary target derived from the raw rating.
df_train["MappedLabel"] = df_train["Label"].map(label_map)
df_test["MappedLabel"] = df_test["Label"].map(label_map)

# -----------------------------
# 2) Encode labels
# -----------------------------
# The encoder learns its classes from the training labels and is then
# reused, unchanged, for the test labels.
le = LabelEncoder().fit(df_train["MappedLabel"])
y_train = le.transform(df_train["MappedLabel"])
y_test = le.transform(df_test["MappedLabel"])

# -----------------------------
# 3) Features
# -----------------------------
# Every column that is not a label or an identifier/bookkeeping field
# is used as a feature; the column list is taken from the training frame.
exclude_cols = ["Label", "MappedLabel", "ID", "Study", "window_start"]
feature_cols = df_train.columns[~df_train.columns.isin(exclude_cols)].tolist()

X_train = df_train.loc[:, feature_cols].to_numpy()
X_test = df_test.loc[:, feature_cols].to_numpy()

# -----------------------------
# 4) Scale (fit on train only)
# -----------------------------
# Scaling statistics come from the training matrix only; the test matrix
# is transformed with those same statistics.
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

# -----------------------------
# 5) SUPERVISED "GMM": one GMM per class (generative classifier)
#    Score(x|class) + prior(class) -> choose best class
#    We'll tune: covariance_type (shared), n_components per class (same K for simplicity)
# -----------------------------
def fit_class_gmms(X, y, K, cov_type):
    """Fit one Gaussian mixture per class and estimate the class priors.

    Args:
        X: Feature matrix; the rows of each class are selected with a
            boolean mask built from ``y``.
        y: Class label of every row of ``X``.
        K: Number of mixture components, shared by all class models.
        cov_type: ``covariance_type`` passed to every ``GaussianMixture``.

    Returns:
        A ``(class_models, class_priors)`` pair of dicts keyed by class
        label: the fitted mixture for that class, and the fraction of rows
        of ``X`` that carry that label.
    """
    # Identical settings for every class model; only the data differs.
    gmm_settings = dict(
        n_components=K,
        covariance_type=cov_type,
        n_init=20,
        init_params="kmeans",
        max_iter=1000,
        tol=1e-5,
        reg_covar=1e-5,
        random_state=RANDOM_STATE,
    )

    n_total = len(X)
    class_models, class_priors = {}, {}
    for label in np.unique(y):
        class_rows = X[y == label]
        class_models[label] = GaussianMixture(**gmm_settings).fit(class_rows)
        # Empirical prior: share of the supplied rows belonging to this class.
        class_priors[label] = len(class_rows) / n_total
    return class_models, class_priors

def predict_class_gmms(X, class_models, class_priors):
    """Classify rows by the highest joint log-score over the class models.

    Args:
        X: Feature matrix passed to each model's ``score_samples``.
        class_models: Dict mapping class label to a fitted model exposing
            ``score_samples(X)``.
        class_priors: Dict mapping class label to its prior probability.

    Returns:
        A ``(pred, scores)`` pair. ``scores`` has one column per class, in
        sorted label order, holding ``log p(x|y=c) + log p(y=c)``; ``pred``
        holds, for each row, the label whose column is largest.
    """
    classes = sorted(class_models)

    score_columns = []
    for label in classes:
        log_likelihood = class_models[label].score_samples(X)
        score_columns.append(log_likelihood + np.log(class_priors[label]))
    scores = np.column_stack(score_columns)

    # argmax returns column positions; translate them back to class labels.
    winners = scores.argmax(axis=1)
    pred = np.array([classes[j] for j in winners])
    return pred, scores

results = []  # one dict per (K, cov_type) configuration, failed fits included
best = None  # (macro_f1, bic, aic, K, cov_type, models, priors)

# NOTE(review): configurations are ranked by macro F1 measured on the TEST
# split, so the test scores reported for the winning model are optimistically
# biased. Consider selecting on a validation split carved out of the training
# data and touching the test split only once, for the final report.
for K in COMPONENT_GRID:
    for cov_type in COVTYPE_GRID:
        row = {"K": K, "cov_type": cov_type}
        try:
            models, priors = fit_class_gmms(X_train_s, y_train, K, cov_type)

            # Use TRAIN AIC/BIC summed across class-models as a comparable
            # score; each class model is scored on its own class's rows.
            bic = sum(m.bic(X_train_s[y_train == c]) for c, m in models.items())
            aic = sum(m.aic(X_train_s[y_train == c]) for c, m in models.items())

            # Evaluate on test (since now supervised)
            y_pred_test, _ = predict_class_gmms(X_test_s, models, priors)
            macro_f1 = f1_score(y_test, y_pred_test, average="macro")
        except Exception as e:
            # Best-effort grid: a failing configuration is recorded with NaN
            # scores and its error message, then the search continues.
            row.update({"BIC": np.nan, "AIC": np.nan, "macro_f1_test": np.nan, "error": str(e)})
            results.append(row)
            continue

        row.update({"BIC": bic, "AIC": aic, "macro_f1_test": macro_f1})
        results.append(row)

        # Pick best by macro F1 (tie-breaker: lower BIC)
        is_better = (
            best is None
            or macro_f1 > best[0]
            or (macro_f1 == best[0] and bic < best[1])
        )
        if is_better:
            best = (macro_f1, bic, aic, K, cov_type, models, priors)

df_results = pd.DataFrame(results).sort_values(
    by=["macro_f1_test", "BIC"],
    ascending=[False, True],
)
print("=== GRID RESULTS (ranked by macro F1 on TEST, tie-breaker BIC) ===")
display(df_results)

# -----------------------------
# 6) Report best model only
# -----------------------------
# `best` is still None when every grid configuration raised during fitting.
# Fail with an explicit message instead of an opaque tuple-unpacking TypeError.
if best is None:
    raise RuntimeError(
        "No class-conditional GMM configuration could be fitted; "
        "inspect the 'error' column of df_results."
    )

best_f1, best_bic, best_aic, best_K, best_cov, best_models, best_priors = best
print("\n=== BEST SUPERVISED GMM CLASSIFIER ===")
print(f"K={best_K}, covariance_type={best_cov}, test macro F1={best_f1:.4f}, BIC={best_bic:.2f}, AIC={best_aic:.2f}")

# Re-score the test split with the selected models; the per-class
# log-scores are kept for the posterior computation that follows.
y_pred_test, scores_test = predict_class_gmms(X_test_s, best_models, best_priors)

# Pin the label set so the confusion matrix keeps a fixed shape and the
# report stays aligned with `target_names` even if a class never occurs in
# y_test or in the predictions.
all_labels = np.arange(len(le.classes_))

print("\n--- TEST RESULTS (best model only) ---")
print("Accuracy:", accuracy_score(y_test, y_pred_test))
print("Macro F1:", f1_score(y_test, y_pred_test, average="macro"))
print("Weighted F1:", f1_score(y_test, y_pred_test, average="weighted"))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_test, labels=all_labels))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_test, labels=all_labels, target_names=le.classes_))

# Optional: class posteriors (softmax over log-scores)
# Subtracting each row's maximum before exponentiating avoids overflow and
# leaves the normalised result unchanged.
row_max = np.max(scores_test, axis=1, keepdims=True)
unnormalised = np.exp(scores_test - row_max)
probs_test = unnormalised / np.sum(unnormalised, axis=1, keepdims=True)

# assign() returns a new frame, so df_test itself is left untouched.
df_out = df_test.assign(GMM_pred_label=le.inverse_transform(y_pred_test))

# Columns of probs_test follow the sorted class keys, the same order
# predict_class_gmms used to build the score matrix.
class_names = le.inverse_transform(sorted(best_models.keys()))
for idx, class_name in enumerate(class_names):
    df_out[f"GMM_prob_{class_name}"] = probs_test[:, idx]

df_out.head(20)

=== GRID RESULTS (ranked by macro F1 on TEST, tie-breaker BIC) ===


Unnamed: 0,K,cov_type,BIC,AIC,macro_f1_test
2,2,diag,813940.071871,811960.416981,0.616045
21,10,tied,625864.841608,610753.066085,0.601448
1,2,tied,701378.713426,690300.023641,0.597437
3,2,spherical,900880.673471,899860.106044,0.59584
0,2,full,555085.486502,533924.082373,0.594744
9,4,tied,663776.04268,651689.081461,0.591603
6,3,diag,682374.156127,679398.525797,0.587377
14,6,diag,590258.08198,584294.525326,0.584671
17,8,tied,640082.656085,625979.151997,0.580702
13,6,tied,653620.986412,640525.753758,0.571461



=== BEST SUPERVISED GMM CLASSIFIER ===
K=2, covariance_type=diag, test macro F1=0.6160, BIC=813940.07, AIC=811960.42

--- TEST RESULTS (best model only) ---
Accuracy: 0.6943488943488944
Macro F1: 0.6160451899253305
Weighted F1: 0.6982683398369074

Confusion Matrix:
[[ 247  288]
 [ 334 1166]]

Classification Report:
              precision    recall  f1-score   support

       alert       0.43      0.46      0.44       535
      drowsy       0.80      0.78      0.79      1500

    accuracy                           0.69      2035
   macro avg       0.61      0.62      0.62      2035
weighted avg       0.70      0.69      0.70      2035



Unnamed: 0,window_start,ID,Study,Label,EAR_mean_mean,MAR_inner_mean,MAR_outer_mean,AU01_r_mean,AU15_r_mean,AU25_r_mean,...,gaze_angle_x_std,gaze_angle_y_std,swAngle_std,laneDevPosition_std,laneDev_OffsetfrmLaneCentre_std,speed_std,MappedLabel,GMM_pred_label,GMM_prob_alert,GMM_prob_drowsy
0,1638561000.0,10.0,S1,Not Drowsy,0.280226,0.020549,0.303724,0.077756,0.133311,0.223478,...,0.063595,0.038516,1.446996,0.0,0.697119,2.509008,alert,drowsy,0.00205,0.99795
1,1638561000.0,10.0,S1,Not Drowsy,0.275627,0.016681,0.298697,0.135278,0.115778,0.293422,...,0.084975,0.049447,1.021389,0.0,1.1149,3.21946,alert,drowsy,0.00227,0.99773
2,1638561000.0,10.0,S1,Not Drowsy,0.277547,0.013587,0.298186,0.104289,0.105111,0.266167,...,0.106225,0.045153,1.907755,0.0,1.670019,3.594871,alert,drowsy,0.001408,0.998592
3,1638561000.0,10.0,S1,Not Drowsy,0.283759,0.012794,0.297106,0.075489,0.132756,0.258267,...,0.114074,0.031922,1.634922,0.0,1.563995,2.562208,alert,drowsy,0.005941,0.994059
4,1638561000.0,10.0,S1,Not Drowsy,0.2844,0.010559,0.292257,0.086489,0.105122,0.274722,...,0.072349,0.03304,0.698894,0.0,0.817669,3.651178,alert,drowsy,0.040479,0.959521
5,1638561000.0,10.0,S1,Not Drowsy,0.290036,0.011303,0.288014,0.146106,0.102792,0.204294,...,0.117877,0.038245,1.068776,0.0,0.997173,0.526519,alert,drowsy,0.02116,0.97884
6,1638561000.0,10.0,S1,Not Drowsy,0.287672,0.011005,0.288053,0.134294,0.113403,0.119028,...,0.122487,0.036576,1.558166,0.0,1.377826,0.491708,alert,drowsy,0.022885,0.977115
7,1638561000.0,10.0,S1,Not Drowsy,0.280214,0.009513,0.289294,0.028344,0.069233,0.057411,...,0.015264,0.030694,1.05517,0.0,1.11817,0.644544,alert,drowsy,0.146503,0.853497
8,1638561000.0,10.0,S1,Not Drowsy,0.278645,0.011739,0.291496,0.071678,0.139056,0.056378,...,0.028549,0.04722,1.062337,0.0,0.78812,1.237312,alert,drowsy,0.011235,0.988765
9,1638561000.0,10.0,S1,Not Drowsy,0.278539,0.014288,0.292226,0.074922,0.178344,0.130789,...,0.029635,0.040656,0.965429,0.250996,1.083526,1.430386,alert,drowsy,0.012597,0.987403
