In [7]:
# ===============================
# 0. Imports
# ===============================
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    average_precision_score,
    precision_recall_fscore_support
)

# ===============================
# 1. Load data
# ===============================
df = pd.read_csv("EN_combined_train_with_labels.csv.gz")
print(df.shape)
display(df.head())

# ===============================
# 2. Define target and features
# ===============================

# Target: psd_label_dcr (convert bool -> int)
y = df["psd_label_dcr"].astype(int)

# Columns we do NOT want as features
drop_cols = [
    "id",
    "energy_label",
    "psd_label_low_avse",
    "psd_label_high_avse",
    "psd_label_dcr",   # target
    "psd_label_lq",
]

# If you want a "baseline without Jade features", you can uncomment this:
# drop_cols += ["AvsE", "GradAreaRatio", "GradWidthMain", "HFER"]

feature_cols = [c for c in df.columns if c not in drop_cols]
X = df[feature_cols]

print("Feature columns:", feature_cols)
print("X shape:", X.shape, "   y shape:", y.shape)

print("\nClass balance for psd_label_dcr:")
print(y.value_counts())
print(y.value_counts(normalize=True))

# ===============================
# 3. Train–test split (stratified)
# ===============================
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

print("\nTrain size:", X_train.shape[0], " Test size:", X_test.shape[0])
print("Train class balance:")
print(y_train.value_counts(normalize=True))
print("Test class balance:")
print(y_test.value_counts(normalize=True))


# ===============================
# 4. Helper: evaluation function
# ===============================
def evaluate_classifier(name, model, X_test, y_test):
    """
    Prints metrics appropriate for imbalanced classification:
    - confusion matrix
    - precision, recall, f1 for each class
    - ROC AUC
    - PR AUC (Average Precision)
    """
    y_pred = model.predict(X_test)
    
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
    else:
        y_score = None

    print("\n" + "="*70)
    print(f"Evaluation: {name}")
    print("="*70)

    print("Confusion matrix (rows = true, cols = predicted):")
    print(confusion_matrix(y_test, y_pred))

    print("\nClassification report:")
    print(classification_report(y_test, y_pred, target_names=["neg (0)", "pos (1)"]))

    if y_score is not None:
        roc = roc_auc_score(y_test, y_score)
        ap  = average_precision_score(y_test, y_score)
        print(f"ROC AUC: {roc:.4f}")
        print(f"PR AUC (Average Precision): {ap:.4f}")
    else:
        print("No probability scores available for ROC/PR AUC.")


# ===============================
# 5. Baseline Logistic Regression
# ===============================
baseline_clf = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),   # handle NaNs
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(
        max_iter=1000,
        n_jobs=-1,
        class_weight=None,    # pure baseline, no class balancing
        solver="lbfgs"
    )),
])

baseline_clf.fit(X_train, y_train)
evaluate_classifier("Baseline Logistic Regression (no class balancing)", baseline_clf, X_test, y_test)


# ===============================
# 6. Tuned Logistic Regression (class_weight='balanced')
# ===============================
logreg_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(
        max_iter=1000,
        n_jobs=-1,
        class_weight="balanced",   # upweight minority class
        solver="lbfgs"
    )),
])

param_grid_logreg = {
    "logreg__C": [0.01, 0.1, 1.0, 10.0]
}

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

logreg_cv = GridSearchCV(
    estimator=logreg_pipe,
    param_grid=param_grid_logreg,
    scoring="average_precision",   # PR AUC is good for imbalanced data
    cv=cv,
    n_jobs=-1,
    verbose=2
)

logreg_cv.fit(X_train, y_train)

print("\nBest params (LogReg):", logreg_cv.best_params_)
print("Best CV PR AUC (LogReg):", logreg_cv.best_score_)

best_logreg = logreg_cv.best_estimator_
evaluate_classifier("LogReg (class_weight balanced, tuned C)", best_logreg, X_test, y_test)


# ===============================
# 7. Random Forest (class_weight='balanced') with tuning
# ===============================
rf_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),   # still need to handle NaNs
    ("rf", RandomForestClassifier(
        class_weight="balanced",
        n_estimators=200,
        n_jobs=-1,
        random_state=42
    )),
])

param_grid_rf = {
    "rf__n_estimators": [100, 200],
    "rf__max_depth": [None, 10, 20],
    "rf__min_samples_split": [2, 10],
    "rf__min_samples_leaf": [1, 5],
}

rf_cv = GridSearchCV(
    estimator=rf_pipe,
    param_grid=param_grid_rf,
    scoring="average_precision",
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
    n_jobs=-1,
    verbose=2
)

rf_cv.fit(X_train, y_train)

print("\nBest params (RF):", rf_cv.best_params_)
print("Best CV PR AUC (RF):", rf_cv.best_score_)

best_rf = rf_cv.best_estimator_
evaluate_classifier("Random Forest (class_weight balanced, tuned)", best_rf, X_test, y_test)


# ===============================
# 8. (Optional) Summary table for report
# ===============================
results = []

def record_result(name, model, X_test, y_test):
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
    else:
        y_score = None

    y_pred = model.predict(X_test)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average="binary", zero_division=0
    )
    roc = roc_auc_score(y_test, y_score) if y_score is not None else np.nan
    ap  = average_precision_score(y_test, y_score) if y_score is not None else np.nan

    results.append({
        "model": name,
        "precision_pos": precision,
        "recall_pos": recall,
        "f1_pos": f1,
        "roc_auc": roc,
        "pr_auc": ap,
    })

record_result("Baseline Logistic", baseline_clf, X_test, y_test)
record_result("LogReg balanced + tuned", best_logreg, X_test, y_test)
record_result("RandomForest balanced + tuned", best_rf, X_test, y_test)

results_df = pd.DataFrame(results)
display(results_df)


(1040000, 29)


Unnamed: 0,id,energy_label,psd_label_low_avse,psd_label_high_avse,psd_label_dcr,psd_label_lq,tp0,ED,HWP,LQ80,...,late_over_early,tdrift99,tfr,peak_count,gbn,bpr,AvsE,GradAreaRatio,GradWidthMain,HFER
0,0_train_0,582.364295,False,True,True,True,957,3409.0,2299.0,-717094.898532,...,0.98791,85.0,0.142357,3,1.198436,0.059642,0.519805,1.0,76.0,0.034772
1,1_train_0,250.159995,False,True,True,True,948,3404.0,2446.0,-331957.541919,...,0.988301,87.0,0.15116,3,1.381123,0.061975,0.531775,1.0,46.0,0.037621
2,2_train_0,1212.323954,False,True,False,True,965,3411.0,2262.0,-425532.152706,...,0.987491,95.0,0.142606,3,1.361857,0.050813,0.343676,1.0,90.0,0.035552
3,3_train_0,240.87811,False,True,True,False,927,3408.0,2833.0,-306980.459766,...,0.98845,116.0,0.133192,2,1.165654,0.062954,0.483144,1.0,56.0,0.035093
4,4_train_0,285.124189,False,True,True,False,958,3406.0,2397.0,-362746.925366,...,0.988541,94.0,0.145504,7,1.390918,0.055013,0.377738,1.0,53.0,0.035971


Feature columns: ['tp0', 'ED', 'HWP', 'LQ80', 'PPR', 'SC', 'current_skewness', 'spectral_centroid_power', 'tail_charge_diff', 'current_kurtosis', 'total_power', 'time_to_main_peak', 'time_to_peak', 'late_over_early', 'tdrift99', 'tfr', 'peak_count', 'gbn', 'bpr', 'AvsE', 'GradAreaRatio', 'GradWidthMain', 'HFER']
X shape: (1040000, 23)    y shape: (1040000,)

Class balance for psd_label_dcr:
psd_label_dcr
1    1019930
0      20070
Name: count, dtype: int64
psd_label_dcr
1    0.980702
0    0.019298
Name: proportion, dtype: float64

Train size: 832000  Test size: 208000
Train class balance:
psd_label_dcr
1    0.980702
0    0.019298
Name: proportion, dtype: float64
Test class balance:
psd_label_dcr
1    0.980702
0    0.019298
Name: proportion, dtype: float64

Evaluation: Baseline Logistic Regression (no class balancing)
Confusion matrix (rows = true, cols = predicted):
[[    85   3929]
 [    34 203952]]

Classification report:
              precision    recall  f1-score   support

     neg




Best params (LogReg): {'logreg__C': 10.0}
Best CV PR AUC (LogReg): 0.9899394187097043

Evaluation: LogReg (class_weight balanced, tuned C)
Confusion matrix (rows = true, cols = predicted):
[[  2110   1904]
 [ 38534 165452]]

Classification report:
              precision    recall  f1-score   support

     neg (0)       0.05      0.53      0.09      4014
     pos (1)       0.99      0.81      0.89    203986

    accuracy                           0.81    208000
   macro avg       0.52      0.67      0.49    208000
weighted avg       0.97      0.81      0.88    208000

ROC AUC: 0.7175
PR AUC (Average Precision): 0.9896
Fitting 3 folds for each of 24 candidates, totalling 72 fits


  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
Traceback (most recent call last):
  File "/Users/jadechoi/miniforge3/lib/python3.10/sit

KeyboardInterrupt: 