# Binary

In [5]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from scipy.optimize import linear_sum_assignment

# -----------------------------
# CONFIG
# -----------------------------
train_path = "Classification_Combined_Data/S1_S2_train_data_60hz.csv"
test_path  = "Classification_Combined_Data/S1_S2_test_data_60hz.csv"

# Grid
# COMPONENT_GRID = [2, 3, 4, 6, 8, 10]
# COVTYPE_GRID   = ["full", "tied", "diag", "spherical"]

COMPONENT_GRID = [10]
COVTYPE_GRID   = ["full"]

RANDOM_STATE = 42

label_map = {
    'Not Drowsy': 'alert',
    'Slight': 'drowsy',
    'Moderate': 'drowsy',
    'Very': 'drowsy'
}

# -----------------------------
# 1) Load
# -----------------------------
df_train = pd.read_csv(train_path)
df_test  = pd.read_csv(test_path)

keep = ["Not Drowsy", "Slight", "Moderate", "Very"]
df_train = df_train[df_train["Label"].isin(keep)].copy()
df_test  = df_test[df_test["Label"].isin(keep)].copy()

df_train["MappedLabel"] = df_train["Label"].map(label_map)
df_test["MappedLabel"]  = df_test["Label"].map(label_map)

# -----------------------------
# 2) Encode labels (EVAL ONLY)
# -----------------------------
le = LabelEncoder()
y_train = le.fit_transform(df_train["MappedLabel"])
y_test  = le.transform(df_test["MappedLabel"])

# -----------------------------
# 3) Features
# -----------------------------
exclude_cols = ["Label", "MappedLabel", "ID", "Study", "window_start"]
feature_cols = [c for c in df_train.columns if c not in exclude_cols]

X_train = df_train[feature_cols].to_numpy()
X_test  = df_test[feature_cols].to_numpy()

# -----------------------------
# 4) Scale (fit on train only)
# -----------------------------
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s  = scaler.transform(X_test)

# -----------------------------
# 5) Manual grid search (rank by BIC, show BIC/AIC for each)
# -----------------------------
results = []
best = None  # (bic, aic, K, cov_type, fitted_model)

for K in COMPONENT_GRID:
    for cov_type in COVTYPE_GRID:
        try:
            gmm = GaussianMixture(
                n_components=K,
                covariance_type=cov_type,
                n_init=20,
                init_params="kmeans",
                max_iter=1000,
                tol=1e-5,
                reg_covar=1e-5,
                random_state=RANDOM_STATE
            )
            gmm.fit(X_train_s)

            bic = gmm.bic(X_train_s)
            aic = gmm.aic(X_train_s)

            results.append({"K": K, "cov_type": cov_type, "BIC": bic, "AIC": aic})

            if best is None or bic < best[0] or (bic == best[0] and aic < best[1]):
                best = (bic, aic, K, cov_type, gmm)

        except Exception as e:
            results.append({"K": K, "cov_type": cov_type, "BIC": np.nan, "AIC": np.nan, "error": str(e)})

df_results = pd.DataFrame(results).sort_values(["BIC", "AIC"], ascending=True)
print("=== GRID RESULTS (ranked by BIC then AIC) ===")
display(df_results)

# -----------------------------
# 6) Evaluate best model only (unsupervised fit + train-only mapping)
# -----------------------------
best_bic, best_aic, best_K, best_cov, best_gmm = best
print("\n=== BEST MODEL ===")
print(f"K={best_K}, covariance_type={best_cov}, BIC={best_bic:.2f}, AIC={best_aic:.2f}")

train_clusters = best_gmm.predict(X_train_s)
test_clusters  = best_gmm.predict(X_test_s)

n_labels = len(le.classes_)
counts = np.zeros((best_K, n_labels), dtype=int)
for c, y in zip(train_clusters, y_train):
    counts[c, y] += 1

# Hungarian assignment for one-to-one part
cost = counts.max() - counts
row_ind, col_ind = linear_sum_assignment(cost)
cluster_to_label = {r: c for r, c in zip(row_ind, col_ind)}

# If K > n_labels, map leftover clusters to majority label within that cluster
unassigned = set(range(best_K)) - set(cluster_to_label.keys())
for c in unassigned:
    if counts[c].sum() == 0:
        cluster_to_label[c] = int(np.bincount(y_train).argmax())
    else:
        cluster_to_label[c] = int(counts[c].argmax())

y_pred_test = np.array([cluster_to_label[c] for c in test_clusters])

print("\n--- TEST RESULTS (best model only) ---")
print("Accuracy:", accuracy_score(y_test, y_pred_test))
print("Macro F1:", f1_score(y_test, y_pred_test, average="macro"))
print("Weighted F1:", f1_score(y_test, y_pred_test, average="weighted"))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_test, target_names=le.classes_))

# -----------------------------
# 7) Optional: write per-cluster posteriors for best model
# -----------------------------
probs_test = best_gmm.predict_proba(X_test_s)  # (n_test, best_K)

df_out = df_test.copy()
df_out["GMM_cluster"] = test_clusters
df_out["GMM_pred_label"] = le.inverse_transform(y_pred_test)

for k in range(best_K):
    df_out[f"GMM_prob_cluster_{k}"] = probs_test[:, k]

df_out.head(20)

KeyboardInterrupt: 

In [4]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from scipy.optimize import linear_sum_assignment

# -----------------------------
# CONFIG
# -----------------------------
train_path = "Classification_Combined_Data/S1_S2_train_data_60hz.csv"
test_path  = "Classification_Combined_Data/S1_S2_test_data_60hz.csv"

# COMPONENT_GRID = [2, 3, 4, 6, 8, 10]
# COVTYPE_GRID   = ["full", "tied", "diag", "spherical"]

COMPONENT_GRID = [2]
COVTYPE_GRID   = ["full"]

RANDOM_STATE   = 42

label_map = {
    'Not Drowsy': 'alert',
    'Slight': 'drowsy',
    'Moderate': 'drowsy',
    'Very': 'drowsy'
}

# -----------------------------
# 1) Load
# -----------------------------
df_train = pd.read_csv(train_path)
df_test  = pd.read_csv(test_path)

keep = ["Not Drowsy", "Slight", "Moderate", "Very"]
df_train = df_train[df_train["Label"].isin(keep)].copy()
df_test  = df_test[df_test["Label"].isin(keep)].copy()

df_train["MappedLabel"] = df_train["Label"].map(label_map)
df_test["MappedLabel"]  = df_test["Label"].map(label_map)

# -----------------------------
# 2) Encode labels
# -----------------------------
le = LabelEncoder()
y_train = le.fit_transform(df_train["MappedLabel"])
y_test  = le.transform(df_test["MappedLabel"])

# -----------------------------
# 3) Features
# -----------------------------
exclude_cols = ["Label", "MappedLabel", "ID", "Study", "window_start"]
feature_cols = [c for c in df_train.columns if c not in exclude_cols]

X_train = df_train[feature_cols].to_numpy()
X_test  = df_test[feature_cols].to_numpy()

# -----------------------------
# 4) Scale (fit on train only)
# -----------------------------
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s  = scaler.transform(X_test)

# -----------------------------
# 5) SUPERVISED "GMM": one GMM per class (generative classifier)
#    Score(x|class) + prior(class) -> choose best class
#    We'll tune: covariance_type (shared), n_components per class (same K for simplicity)
# -----------------------------
def fit_class_gmms(X, y, K, cov_type):
    class_models = {}
    class_priors = {}
    for cls in np.unique(y):
        Xc = X[y == cls]
        gmm = GaussianMixture(
            n_components=K,
            covariance_type=cov_type,
            n_init=20,
            init_params="kmeans",
            max_iter=1000,
            tol=1e-5,
            reg_covar=1e-5,
            random_state=RANDOM_STATE
        )
        gmm.fit(Xc)
        class_models[cls] = gmm
        class_priors[cls] = len(Xc) / len(X)
    return class_models, class_priors

def predict_class_gmms(X, class_models, class_priors):
    classes = sorted(class_models.keys())
    # log p(x|y=c) + log p(y=c)
    scores = np.column_stack([
        class_models[c].score_samples(X) + np.log(class_priors[c])
        for c in classes
    ])
    pred = np.array([classes[i] for i in np.argmax(scores, axis=1)])
    return pred, scores

results = []
best = None  # (metric, K, cov_type, models, priors)

for K in COMPONENT_GRID:
    for cov_type in COVTYPE_GRID:
        try:
            models, priors = fit_class_gmms(X_train_s, y_train, K, cov_type)

            # Use TRAIN AIC/BIC summed across class-models as a comparable score
            bic = sum(models[c].bic(X_train_s[y_train == c]) for c in models)
            aic = sum(models[c].aic(X_train_s[y_train == c]) for c in models)

            # Evaluate on test (since now supervised)
            y_pred_test, _ = predict_class_gmms(X_test_s, models, priors)
            macro_f1 = f1_score(y_test, y_pred_test, average="macro")

            results.append({"K": K, "cov_type": cov_type, "BIC": bic, "AIC": aic, "macro_f1_test": macro_f1})

            # Pick best by macro F1 (tie-breaker: lower BIC)
            if best is None or macro_f1 > best[0] or (macro_f1 == best[0] and bic < best[1]):
                best = (macro_f1, bic, aic, K, cov_type, models, priors)

        except Exception as e:
            results.append({"K": K, "cov_type": cov_type, "BIC": np.nan, "AIC": np.nan, "macro_f1_test": np.nan, "error": str(e)})

df_results = pd.DataFrame(results).sort_values(["macro_f1_test", "BIC"], ascending=[False, True])
print("=== GRID RESULTS (ranked by macro F1 on TEST, tie-breaker BIC) ===")
display(df_results)

# -----------------------------
# 6) Report best model only
# -----------------------------
best_f1, best_bic, best_aic, best_K, best_cov, best_models, best_priors = best
print("\n=== BEST SUPERVISED GMM CLASSIFIER ===")
print(f"K={best_K}, covariance_type={best_cov}, test macro F1={best_f1:.4f}, BIC={best_bic:.2f}, AIC={best_aic:.2f}")

y_pred_test, scores_test = predict_class_gmms(X_test_s, best_models, best_priors)

print("\n--- TEST RESULTS (best model only) ---")
print("Accuracy:", accuracy_score(y_test, y_pred_test))
print("Macro F1:", f1_score(y_test, y_pred_test, average="macro"))
print("Weighted F1:", f1_score(y_test, y_pred_test, average="weighted"))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_test, target_names=le.classes_))

# Optional: class posteriors (softmax over log-scores)
probs_test = np.exp(scores_test - scores_test.max(axis=1, keepdims=True))
probs_test = probs_test / probs_test.sum(axis=1, keepdims=True)

df_out = df_test.copy()
df_out["GMM_pred_label"] = le.inverse_transform(y_pred_test)
for idx, cls in enumerate(sorted(best_models.keys())):
    df_out[f"GMM_prob_{le.inverse_transform([cls])[0]}"] = probs_test[:, idx]

df_out.head(20)

=== GRID RESULTS (ranked by macro F1 on TEST, tie-breaker BIC) ===


Unnamed: 0,K,cov_type,BIC,AIC,macro_f1_test
0,2,full,56585510.0,56574180.0,0.530594



=== BEST SUPERVISED GMM CLASSIFIER ===
K=2, covariance_type=full, test macro F1=0.5306, BIC=56585508.48, AIC=56574175.74

--- TEST RESULTS (best model only) ---
Accuracy: 0.6625126055862407
Macro F1: 0.5305940635752153
Weighted F1: 0.6489584529210202

Confusion Matrix:
[[ 21464  63548]
 [ 45886 193363]]

Classification Report:
              precision    recall  f1-score   support

       alert       0.32      0.25      0.28     85012
      drowsy       0.75      0.81      0.78    239249

    accuracy                           0.66    324261
   macro avg       0.54      0.53      0.53    324261
weighted avg       0.64      0.66      0.65    324261



Unnamed: 0,ID,Label,UNIX,EAR_mean,MAR_inner,MAR_outer,AU01_r,AU15_r,AU25_r,AU26_r,...,gaze_angle_y,swAngle,laneDevPosition,laneDev_OffsetfrmLaneCentre,speed,Study,MappedLabel,GMM_pred_label,GMM_prob_alert,GMM_prob_drowsy
0,10.0,Not Drowsy,1638561000.0,0.271028,0.007927,0.296455,0.0,0.006667,0.12,0.146667,...,0.39,4.725,1.0,-1.412129,60.865053,S1,alert,drowsy,5.6e-05,0.999944
1,10.0,Not Drowsy,1638561000.0,0.276723,0.00984,0.298315,0.0,0.05,0.146667,0.066667,...,0.397667,4.725,1.0,-1.420328,60.710593,S1,alert,drowsy,6.4e-05,0.999936
2,10.0,Not Drowsy,1638561000.0,0.274216,0.008958,0.294492,0.0,0.186667,0.38,0.2,...,0.387,4.725,1.0,-1.4035,60.580803,S1,alert,drowsy,4e-05,0.99996
3,10.0,Not Drowsy,1638561000.0,0.273122,0.008595,0.29909,0.0,0.086667,0.45,0.203333,...,0.402667,4.65,1.0,-1.363394,60.451293,S1,alert,drowsy,5.8e-05,0.999942
4,10.0,Not Drowsy,1638561000.0,0.270864,0.008658,0.298286,0.0,0.046667,0.296667,0.573333,...,0.397333,4.05,1.0,-1.297956,60.31404,S1,alert,drowsy,5.2e-05,0.999948
5,10.0,Not Drowsy,1638561000.0,0.272033,0.008661,0.29408,0.0,0.0,0.266667,0.316667,...,0.395,3.225,1.0,-1.214741,60.185303,S1,alert,drowsy,6.8e-05,0.999932
6,10.0,Not Drowsy,1638561000.0,0.271703,0.005547,0.292723,0.0,0.023333,0.25,0.006667,...,0.398,3.15,1.0,-1.213028,60.040973,S1,alert,drowsy,0.000123,0.999877
7,10.0,Not Drowsy,1638561000.0,0.269833,0.009218,0.296644,0.0,0.273333,0.09,0.143333,...,0.406333,3.15,1.0,-1.228132,59.913003,S1,alert,drowsy,6e-05,0.99994
8,10.0,Not Drowsy,1638561000.0,0.272838,0.010466,0.304193,0.0,0.36,0.04,0.23,...,0.401,3.0,1.0,-1.225552,59.769407,S1,alert,drowsy,2.7e-05,0.999973
9,10.0,Not Drowsy,1638561000.0,0.273754,0.011152,0.304829,0.0,0.14,0.0,0.073333,...,0.407667,2.7,1.0,-1.208202,59.642097,S1,alert,drowsy,5.4e-05,0.999946
