In [12]:
# ==============================================================
# PROJECT: Therapeutic Failure Phenotype Discovery
# PHASE: Classification Data Prep + Multi-Class Balancing
# ==============================================================

import os
import numpy as np
import pandas as pd
import joblib

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# --------------------------------------------------------------
# PATHS
# --------------------------------------------------------------
BASE_DIR   = r"D:\ML_Project"
DATA_PATH  = os.path.join(BASE_DIR, "data", "processed", "PROJECT_CLUSTERED_SEVERITY_DATA.csv")
MODELS_DIR = os.path.join(BASE_DIR, "models")

os.makedirs(MODELS_DIR, exist_ok=True)

print("Loaded preprocessing + balancing script.")








Loaded preprocessing + balancing script.


In [13]:
# --------------------------------------------------------------
# 1) LOAD DATA
# --------------------------------------------------------------
df = pd.read_csv(DATA_PATH, low_memory=False)
print("Loaded rows:", len(df))

# Convert clusters → phenotype
cluster_map = {
    "Cluster_0": "Critical_Failure",
    "Cluster_1": "Hospitalization_Failure",
    "Cluster_2": "SideEffect_Failure"
}
df["failure_phenotype_label"] = df["failure_phenotype_label"].replace(cluster_map)

# Drop useless columns
drop_cols = [
    "primaryid","caseid","caseversion","fda_dt_parsed",
    "severity_category","severity_weight",
    "failure_phenotype","is_failure",
    "all_reaction_pts"
]
df.drop(columns=[c for c in drop_cols if c in df.columns], inplace=True)

print("Final feature frame:", df.shape)



Loaded rows: 249048
Final feature frame: (249048, 21)


In [14]:
# --------------------------------------------------------------
# 2) DEFINE X, y
# --------------------------------------------------------------
X = df.drop(columns=["failure_phenotype_label"])
y = df["failure_phenotype_label"]

joblib.dump(X.columns.tolist(), os.path.join(MODELS_DIR, "training_feature_names.joblib"))
print("Saved training_feature_names.joblib")


# --------------------------------------------------------------
# 3) LABEL ENCODE TARGET
# --------------------------------------------------------------
le = LabelEncoder()
y_enc = le.fit_transform(y)
joblib.dump(le, os.path.join(MODELS_DIR, "label_encoder.joblib"))
print("Saved label_encoder.joblib")


# --------------------------------------------------------------
# 4) SCALE NUMERIC FEATURES
# --------------------------------------------------------------
num_cols = [c for c in X.columns if np.issubdtype(X[c].dtype, np.number)]

scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

joblib.dump(scaler, os.path.join(MODELS_DIR, "scaler.joblib"))
print("Saved scaler.joblib")


# --------------------------------------------------------------
# 5) TRAIN/TEST SPLIT
# --------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.2, stratify=y_enc, random_state=42
)

joblib.dump(X_test, os.path.join(MODELS_DIR, "X_test.joblib"))
joblib.dump(y_test, os.path.join(MODELS_DIR, "y_test.joblib"))

print("Saved X_test.joblib & y_test.joblib")
print("Train:", X_train.shape, "Test:", X_test.shape)


# --------------------------------------------------------------
# 6) MULTI-CLASS BALANCING (Correct SMOTE + UnderSample)
# --------------------------------------------------------------
print("\nApplying multi-class SMOTE + undersampling...")

# Count classes
unique, counts = np.unique(y_train, return_counts=True)
class_counts = dict(zip(unique, counts))
print("Original class distribution:", class_counts)

# --------------------
# SMOTE Strategy
# Oversample all classes to the size of the largest class.
# --------------------
max_target = max(class_counts.values())

smote_strategy = {cls: max_target for cls in class_counts.keys()}
print("SMOTE strategy:", smote_strategy)

oversample = SMOTE(
    sampling_strategy=smote_strategy,
    random_state=42
)

# --------------------
# Under-sample Strategy:
# Reduce ALL classes to 85% of max size (prevents overfitting)
# --------------------
under_strategy = {cls: int(max_target * 0.85) for cls in class_counts.keys()}
print("Under-sampling strategy:", under_strategy)

undersample = RandomUnderSampler(
    sampling_strategy=under_strategy,
    random_state=42
)

Saved training_feature_names.joblib
Saved label_encoder.joblib
Saved scaler.joblib
Saved X_test.joblib & y_test.joblib
Train: (199238, 20) Test: (49810, 20)

Applying multi-class SMOTE + undersampling...
Original class distribution: {np.int64(0): np.int64(169291), np.int64(1): np.int64(14130), np.int64(2): np.int64(15817)}
SMOTE strategy: {np.int64(0): np.int64(169291), np.int64(1): np.int64(169291), np.int64(2): np.int64(169291)}
Under-sampling strategy: {np.int64(0): 143897, np.int64(1): 143897, np.int64(2): 143897}


In [15]:
# --------------------
# Pipeline
# --------------------
balance_pipe = Pipeline([
    ("smote", oversample),
    ("under", undersample)
])

X_train_bal, y_train_bal = balance_pipe.fit_resample(X_train, y_train)

print("Balanced class distribution:", dict(zip(*np.unique(y_train_bal, return_counts=True))))
print("Balanced shapes:", X_train_bal.shape, y_train_bal.shape)


# --------------------------------------------------------------
# SAVE BALANCED DATA
# --------------------------------------------------------------
joblib.dump(X_train_bal, os.path.join(MODELS_DIR, "X_train_bal.joblib"))
joblib.dump(y_train_bal, os.path.join(MODELS_DIR, "y_train_bal.joblib"))

print("\nBalanced training data saved:")
print(" → X_train_bal.joblib")
print(" → y_train_bal.joblib")

print("\nPREPROCESSING + BALANCING COMPLETE.")

Balanced class distribution: {np.int64(0): np.int64(143897), np.int64(1): np.int64(143897), np.int64(2): np.int64(143897)}
Balanced shapes: (431691, 20) (431691,)

Balanced training data saved:
 → X_train_bal.joblib
 → y_train_bal.joblib

PREPROCESSING + BALANCING COMPLETE.


In [16]:

print("\n\n=== RUNNING UNTUNED BASELINE MODELS (for selection) ===")

import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import f1_score, accuracy_score, average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Load balanced + test data
X_train_bal = joblib.load(os.path.join(MODELS_DIR, "X_train_bal.joblib"))
y_train_bal = joblib.load(os.path.join(MODELS_DIR, "y_train_bal.joblib"))
X_test      = joblib.load(os.path.join(MODELS_DIR, "X_test.joblib"))
y_test      = joblib.load(os.path.join(MODELS_DIR, "y_test.joblib"))

n_classes = len(np.unique(y_train_bal))


# --------------------------------------------------------------
# DEFINE UN-TUNED MODELS
# --------------------------------------------------------------
models = {
    "Logistic Regression": LogisticRegression(max_iter=4000, class_weight="balanced"),

    "Decision Tree": DecisionTreeClassifier(),

    "Random Forest": RandomForestClassifier(class_weight="balanced"),

    "XGBoost": XGBClassifier(
        objective="multi:softprob",
        num_class=n_classes,
        tree_method="hist",
        device="cuda",
        eval_metric="mlogloss"
    ),

    "LightGBM": LGBMClassifier(
        objective="multiclass",
        num_class=n_classes,
        device="gpu"
    ),
}

results = []

# --------------------------------------------------------------
# TRAIN AND EVALUATE UNTUNED MODELS
# --------------------------------------------------------------
for name, model in models.items():
    print("\n" + "-"*70)
    print(f"Training Untuned Model → {name}")

    model.fit(X_train_bal, y_train_bal)
    y_pred = model.predict(X_test)

    try:
        y_proba = model.predict_proba(X_test)
    except:
        # some models like LinearSVC do not support predict_proba
        y_proba = model.decision_function(X_test)

    f1 = f1_score(y_test, y_pred, average="macro")
    acc = accuracy_score(y_test, y_pred)
    pr_auc = average_precision_score(y_test, y_proba, average="macro")

    print(f"Untuned F1: {f1:.4f} | Acc: {acc:.4f} | PR-AUC: {pr_auc:.4f}")

    results.append([name, f1, acc, pr_auc])

# Convert into DataFrame
df_results = pd.DataFrame(results, columns=["Model", "F1", "Accuracy", "PR-AUC"])
display(df_results)

# --------------------------------------------------------------
# SELECT TOP 4 MODELS (based on F1)
# --------------------------------------------------------------
top5 = df_results.sort_values("F1", ascending=False).head(5)["Model"].tolist()
print("\nTop 5 models selected:", top5)

# Save the actual models
top5_models = {m: models[m] for m in top5}
joblib.dump(top5_models, os.path.join(MODELS_DIR, "top5_untuned_models.joblib"))

# Save the names for the next notebook
joblib.dump(top5, os.path.join(MODELS_DIR, "top5_model_names.joblib"))

print("\nSaved:")
print(" → top5_untuned_models.joblib")
print(" → top5_model_names.joblib")

print("\nBASELINE MODEL SELECTION COMPLETE.")




=== RUNNING UNTUNED BASELINE MODELS (for selection) ===

----------------------------------------------------------------------
Training Untuned Model → Logistic Regression
Untuned F1: 0.7684 | Acc: 0.8225 | PR-AUC: 0.8059

----------------------------------------------------------------------
Training Untuned Model → Decision Tree
Untuned F1: 0.8012 | Acc: 0.8876 | PR-AUC: 0.7446

----------------------------------------------------------------------
Training Untuned Model → Random Forest
Untuned F1: 0.8079 | Acc: 0.8901 | PR-AUC: 0.8048

----------------------------------------------------------------------
Training Untuned Model → XGBoost
Untuned F1: 0.8068 | Acc: 0.8757 | PR-AUC: 0.8394

----------------------------------------------------------------------
Training Untuned Model → LightGBM
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1059
[LightGBM] [Info] Number of data points in the train set: 431691, number of used features: 20
[LightGBM] [Info] Us

Unnamed: 0,Model,F1,Accuracy,PR-AUC
0,Logistic Regression,0.768411,0.822526,0.805856
1,Decision Tree,0.801228,0.887573,0.744584
2,Random Forest,0.807902,0.890062,0.804781
3,XGBoost,0.80675,0.875708,0.839386
4,LightGBM,0.806385,0.876511,0.837157



Top 5 models selected: ['Random Forest', 'XGBoost', 'LightGBM', 'Decision Tree', 'Logistic Regression']

Saved:
 → top5_untuned_models.joblib
 → top5_model_names.joblib

BASELINE MODEL SELECTION COMPLETE.
