In [1]:
# Complete ANN project template for CSC3034 Assignment 2
#- Data loading & basic EDA
#- Preprocessing (missing values, encoding, scaling)
#- Optional PCA
#- Train/test split
#- Build and train ANN (scikit-learn MLPClassifier)
#- Model evaluation: accuracy, confusion matrix, AUC-ROC (binary/multiclass)
#- Cross-validation (StratifiedKFold)
#- Repetitive testing + Optuna hyperparameter tuning
#- Visualization: learning curves, confusion matrix, ROC curve
#- Save model and artifacts

#USAGE:
#    - Edit DATA_PATH, TARGET_COL and basic settings below (marked TODO)
#    - Run: python src/main.py
#    - Or import functions into notebooks (recommended for step-by-step work)

In [2]:
# pip install optuna

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    roc_auc_score,
    roc_curve
)
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPClassifier

ModuleNotFoundError: No module named 'numpy'

In [None]:
# Optuna is an optional dependency: record in OPTUNA whether the import
# succeeded so later cells can skip hyperparameter tuning when it is absent.
try:
    import optuna
except ImportError:
    OPTUNA = False
    print("Optuna not installed — skipping hyperparameter tuning.")
else:
    OPTUNA = True

In [None]:
# ================================
# 1. CONFIG
# ================================
# Input CSV and the name of its label column.
DATA_PATH = "diabetes.csv"   # TODO — change to your file path
TARGET_COL = "Outcome"

# Columns in which a literal 0 is converted to NaN by replace_zeros().
ZERO_AS_MISSING = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

# Folder that receives the saved figures; created here if it does not exist.
SAVE_DIR = "outputs"
os.makedirs(SAVE_DIR, exist_ok=True)

In [None]:
# ================================
# 2. LOAD DATA
# ================================
def load_data(path):
    """Read the CSV at ``path``, show a preview of its first rows, and return it.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded dataset, unmodified.

    Notes
    -----
    As a side effect this sets the global pandas option
    ``display.colheader_justify`` to ``'center'``.
    """
    df = pd.read_csv(path)
    print("Dataset loaded:", df.shape)
    # --- Formatting options ---
    pd.set_option('display.colheader_justify', 'center')

    # `display` is provided by IPython/Jupyter and is not defined in a plain
    # `python` run, so resolve it defensively instead of assuming it exists.
    try:
        show = display
    except NameError:
        show = None

    if show is None:
        # Script mode: fall back to a plain-text preview rather than crashing.
        print(df.head())
    else:
        # Notebook mode: centred, bordered HTML table of the first rows.
        styled_df = df.head().style.set_properties(**{'text-align': 'center'}).set_table_styles([
            {"selector": "th", "props": [("border", "1px solid black"), ("text-align", "center")]},
            {"selector": "td", "props": [("border", "1px solid black"), ("text-align", "center")]}
        ])
        show(styled_df)
    return df

In [None]:
# ================================
# 3. CLEANING (Zero → Missing)
# ================================
def replace_zeros(df):
    """Return a new frame where zeros in the ZERO_AS_MISSING columns become NaN.

    The input frame is left untouched. A per-column count of missing values
    in the result is printed before returning.
    """
    # Build the NaN-substituted version of every listed column, then swap
    # them into a fresh frame in one step.
    nulled = {name: df[name].replace(0, np.nan) for name in ZERO_AS_MISSING}
    cleaned = df.assign(**nulled)
    print("\nMissing values after zero→NaN replacement:\n", cleaned.isna().sum())
    return cleaned

In [None]:
# ================================
# 4. IMPUTE + SCALE
# ================================
def preprocess(df):
    """Split ``df`` into features/labels, then median-impute and standardise.

    Both transformers are fitted on every row of ``df`` that is passed in.

    Returns
    -------
    tuple
        ``(X_scaled, y, feature_names, imputer, scaler)`` where ``X_scaled``
        is the imputed and scaled feature array, ``y`` the TARGET_COL values,
        ``feature_names`` the feature column names as a list, and
        ``imputer`` / ``scaler`` the fitted transformers.
    """
    features = df.drop(columns=[TARGET_COL])
    labels = df[TARGET_COL].values

    # Median imputation feeds directly into standard scaling.
    imputer = SimpleImputer(strategy="median")
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(imputer.fit_transform(features))

    return X_scaled, labels, features.columns.tolist(), imputer, scaler

In [None]:
# ================================
# 5. BASIC EDA (SAVED FIGURES)
# ================================
def run_eda(df):
    """Draw three exploratory figures for ``df`` and save each into SAVE_DIR.

    Figures produced: a count plot of TARGET_COL, histograms of the frame's
    columns, and a heatmap of ``df.corr()``. Each is written to a PNG file,
    shown, and then closed.
    """
    def _save_show_close(filename):
        # Shared tail for every figure: write to disk, render, release.
        plt.savefig(os.path.join(SAVE_DIR, filename))
        plt.show()
        plt.close()

    # Outcome distribution
    plt.figure(figsize=(5,4))
    sns.countplot(x=TARGET_COL, data=df)
    plt.title("Outcome Distribution (0 = No Diabetes, 1 = Diabetes)")
    _save_show_close("outcome_distribution.png")

    # Histograms
    df.hist(bins=20, figsize=(12,10))
    plt.tight_layout()
    _save_show_close("histograms.png")

    # Correlation heatmap
    plt.figure(figsize=(10,8))
    sns.heatmap(df.corr(), annot=False, cmap="coolwarm")
    plt.title("Correlation Heatmap")
    _save_show_close("correlation_heatmap.png")

In [None]:
# ================================
# 6. TRAIN ANN (SKLEARN)
# ================================
def train_ann(X_train, y_train, X_test, y_test, params, run_name):
    """Fit an MLPClassifier, report test-set metrics, and save diagnostic plots.

    ``params`` must provide the keys ``'hidden_layer_sizes'``, ``'activation'``
    and ``'learning_rate_init'``, e.g.::

        {
            'hidden_layer_sizes': (32, 32),
            'activation': 'relu',
            'learning_rate_init': 0.001
        }

    ``run_name`` is forwarded to ``plot_confusion`` and ``plot_roc``.

    Returns
    -------
    tuple
        ``(fitted_model, accuracy, auc)`` measured on ``X_test`` / ``y_test``.
    """
    clf = MLPClassifier(
        hidden_layer_sizes=params["hidden_layer_sizes"],
        activation=params["activation"],
        learning_rate_init=params["learning_rate_init"],
        max_iter=2000,
        random_state=42
    )
    clf.fit(X_train, y_train)

    # Hard labels for accuracy / confusion matrix; column 1 of the
    # probability output is used as the score for AUC and the ROC curve.
    predicted = clf.predict(X_test)
    scores = clf.predict_proba(X_test)[:,1]

    acc = accuracy_score(y_test, predicted)
    auc = roc_auc_score(y_test, scores)
    cm = confusion_matrix(y_test, predicted)

    print("\n===== FINAL RESULTS =====")
    print("Accuracy:", acc)
    print("AUC:", auc)
    print("\nConfusion Matrix:")
    print(cm)
    print("\nClassification Report:")
    print(classification_report(y_test, predicted))

    # save plots
    plot_confusion(cm, run_name)
    plot_roc(y_test, scores, run_name)

    return clf, acc, auc

In [None]:
# ================================
# 7. CROSS VALIDATION
# ================================
def run_cross_validation(X, y, params, folds=5):
    """Evaluate an MLPClassifier with stratified k-fold cross-validation.

    A fresh classifier is built from ``params`` (keys ``'hidden_layer_sizes'``,
    ``'activation'``, ``'learning_rate_init'``) for each of the ``folds``
    shuffled, stratified splits. Per-fold accuracy and AUC, followed by their
    means, are printed; nothing is returned.
    """
    splitter = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
    accs = []
    aucs = []

    for fold_num, (train_idx, test_idx) in enumerate(splitter.split(X, y), start=1):
        print(f"\n===== Fold {fold_num}/{folds} =====")
        held_X, held_y = X[test_idx], y[test_idx]

        clf = MLPClassifier(
            hidden_layer_sizes=params["hidden_layer_sizes"],
            activation=params["activation"],
            learning_rate_init=params["learning_rate_init"],
            max_iter=2000,
            random_state=42
        )
        clf.fit(X[train_idx], y[train_idx])

        # Score this fold on the rows that were held out of training.
        fold_pred = clf.predict(held_X)
        fold_prob = clf.predict_proba(held_X)[:,1]
        accs.append(accuracy_score(held_y, fold_pred))
        aucs.append(roc_auc_score(held_y, fold_prob))

    print("\n===== CROSS VALIDATION RESULTS =====")
    print("Accuracies per fold:", accs)
    print("AUC per fold:", aucs)
    print("Mean Accuracy:", np.mean(accs))
    print("Mean AUC:", np.mean(aucs))

In [None]:
# ================================
# 8. HYPERPARAMETER TUNING (Optuna)
# ================================
def optuna_objective(trial, X, y):
    """Score one Optuna trial: sample MLP settings and return the hold-out AUC.

    The trial suggests ``hidden_layer_sizes``, ``activation`` and
    ``learning_rate_init``; a classifier with those settings is trained on
    80% of ``X, y`` (stratified, fixed seed) and scored on the remaining 20%.
    """
    # Values are sampled in this order: layers, activation, learning rate.
    candidate = {
        "hidden_layer_sizes": trial.suggest_categorical(
            "hidden_layer_sizes", [(16,), (32,), (32,32), (64,32)]
        ),
        "activation": trial.suggest_categorical("activation", ["relu", "tanh"]),
        "learning_rate_init": trial.suggest_float(
            "learning_rate_init", 1e-4, 1e-2, log=True
        ),
    }

    X_fit, X_val, y_fit, y_val = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    clf = MLPClassifier(max_iter=2000, random_state=42, **candidate)
    clf.fit(X_fit, y_fit)

    return roc_auc_score(y_val, clf.predict_proba(X_val)[:,1])


def run_optuna(X, y, trials=20, seed=42):
    """Search MLP hyperparameters with Optuna, maximising ``optuna_objective``.

    Parameters
    ----------
    X, y : array-like
        Features and labels forwarded to ``optuna_objective`` on every trial.
    trials : int, default 20
        Number of trials to run.
    seed : int or None, default 42
        Seed for the TPE sampler so repeated runs propose the same sequence
        of trials. Pass ``None`` for an unseeded (non-reproducible) search.

    Returns
    -------
    dict or None
        ``study.best_params`` for the best trial, or ``None`` when Optuna is
        not installed.
    """
    if not OPTUNA:
        print("Optuna unavailable.")
        return None

    # TPE is Optuna's default sampler; it is constructed explicitly only so
    # that it can be seeded, making the search reproducible.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(
        lambda trial: optuna_objective(trial, X, y),
        n_trials=trials,
        show_progress_bar=True
    )

    print("\n===== OPTUNA BEST PARAMETERS =====")
    print(study.best_params)

    return study.best_params

In [None]:
# ================================
# 9. PLOTTING FUNCTIONS
# ================================
def plot_confusion(cm, run):
    """Render confusion matrix ``cm`` as an annotated heatmap and save it.

    The figure is written to ``<SAVE_DIR>/<run>_confusion.png``, shown, and
    then closed.
    """
    fig, ax = plt.subplots(figsize=(6,5))
    sns.heatmap(cm, annot=True, cmap="Blues", fmt="d", ax=ax)
    ax.set_title("Confusion Matrix")
    fig.savefig(os.path.join(SAVE_DIR, f"{run}_confusion.png"))
    plt.show()
    plt.close(fig)

def plot_roc(y_true, y_prob, run):
    """Plot the ROC curve for scores ``y_prob`` against labels ``y_true``.

    The AUC is shown in the legend and a dashed grey diagonal is drawn for
    reference. The figure is written to ``<SAVE_DIR>/<run>_roc.png``, shown,
    and then closed.
    """
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    auc_label = f"AUC = {roc_auc_score(y_true,y_prob):.3f}"

    fig, ax = plt.subplots(figsize=(6,5))
    ax.plot(fpr, tpr, label=auc_label)
    ax.plot([0,1], [0,1], "--", color="gray")
    ax.set_title("ROC Curve")
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.legend()
    fig.savefig(os.path.join(SAVE_DIR, f"{run}_roc.png"))
    plt.show()
    plt.close(fig)

In [None]:
# ================================
# 10. MAIN PIPELINE
# ================================
def main():
    """Run the whole pipeline: load, explore, clean, split, tune, train, report.

    The train/test split is made on the cleaned rows *before* any transformer
    is fitted. The imputer and scaler are fitted on the training rows only and
    then applied to the test rows, and the optional cross-validation and
    Optuna steps see only the training portion. This keeps the held-out test
    set out of every fitting and model-selection step, so the final metrics
    are not optimistically biased.
    """

    # 1. Load dataset
    df = load_data(DATA_PATH)

    # 2. EDA
    run_eda(df)

    # 3. Replace zero values
    df_clean = replace_zeros(df)

    # 4. Train-test split on the raw (un-imputed, un-scaled) rows
    train_df, test_df = train_test_split(
        df_clean, test_size=0.2, stratify=df_clean[TARGET_COL], random_state=42
    )

    # 5. Preprocess: fit imputer + scaler on the training rows only, then
    #    reuse the fitted transformers on the test rows.
    X_train, y_train, feature_names, imputer, scaler = preprocess(train_df)
    X_test = scaler.transform(
        imputer.transform(test_df.drop(columns=[TARGET_COL]))
    )
    y_test = test_df[TARGET_COL].values

    # 6. Default ANN Settings
    params = {
        "hidden_layer_sizes": (32, 32),        # two hidden layers with 32 neurons
        "activation": "relu",
        "learning_rate_init": 0.001
    }

    # 7. Cross-validation (optional) — training portion only
    run_cv = False  # TODO: set True if needed
    if run_cv:
        run_cross_validation(X_train, y_train, params)

    # 8. Optuna tuning (optional) — training portion only, so the test set
    #    is never used to choose hyperparameters
    use_optuna = False  # TODO: set True if you want tuning
    if OPTUNA and use_optuna:
        best = run_optuna(X_train, y_train, trials=20)
        if best:
            params = best

    # 9. Final model training
    model, acc, auc = train_ann(
        X_train, y_train,
        X_test, y_test,
        params,
        run_name="final_pima"
    )

    print("\nDONE — check outputs folder for results!")


if __name__ == "__main__":
    main()