<a href="https://colab.research.google.com/github/akpanitorobong/7135CEM-Modelling_and_Optimization_Under_Uncertainty/blob/main/7135_CEM_Task_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Install Libraries**

In [None]:
print("Installing Libraries...")

# Install necessary packages
!pip install ucimlrepo  # For dataset handling
!pip install imbalanced-learn  # For class balancing
!pip install scikit-optimize

# Import Required Libraries
import pandas as pd  # For data manipulation
import numpy as np  # For numerical operations
import matplotlib.pyplot as plt  # For data visualization
import seaborn as sns  # Enhanced visualization
import time  # For tracking execution time

from ucimlrepo import fetch_ucirepo  # For dataset import

from imblearn.under_sampling import RandomUnderSampler# Class balancing

# Feature Scaling & Dimensionality Reduction
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Model Training & Evaluation
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.utils import resample
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

#HyperParameter Tuning
from sklearn.model_selection import GridSearchCV
from skopt import BayesSearchCV # Import BayesSearchCV from skopt

# Evaluation Metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix

print("Libraries imported successfully!")

Installing Libraries...
Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7
Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-25.1.0-py3-none-any.whl.metadata (12 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-25.1.0-py3-none-any.whl (26 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-25.1.0 scikit-optimize-0.10.2
Libraries imported successfully!


# **Load and Inspect Dataset**

In [None]:
# **Load and Inspect Dataset**
print("Loading dataset...")

# Fetch dataset from UCI Repository
cdc_diabetes_health_indicators = fetch_ucirepo(id=891)

# Extract features and target variables
X = cdc_diabetes_health_indicators.data.features
y = cdc_diabetes_health_indicators.data.targets

# Combine for easy data handling
df = pd.concat([X, y], axis=1)
df.to_csv('diabetes_health_indicators.csv', index=False)

print("Dataset loaded successfully.")
print("\nDataset Info:\n")
df.info()
print("\nFirst 5 Rows:\n")
df.head()
#df = df.sample(1000)

Loading dataset...
Dataset loaded successfully.

Dataset Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype
---  ------                --------------   -----
 0   HighBP                253680 non-null  int64
 1   HighChol              253680 non-null  int64
 2   CholCheck             253680 non-null  int64
 3   BMI                   253680 non-null  int64
 4   Smoker                253680 non-null  int64
 5   Stroke                253680 non-null  int64
 6   HeartDiseaseorAttack  253680 non-null  int64
 7   PhysActivity          253680 non-null  int64
 8   Fruits                253680 non-null  int64
 9   Veggies               253680 non-null  int64
 10  HvyAlcoholConsump     253680 non-null  int64
 11  AnyHealthcare         253680 non-null  int64
 12  NoDocbcCost           253680 non-null  int64
 13  GenHlth               253680 non-null  int64
 14  MentHlth            

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
0,1,1,1,40,1,0,0,0,0,1,...,0,5,18,15,1,0,9,4,3,0
1,0,0,0,25,1,0,0,1,0,0,...,1,3,0,0,0,0,7,6,1,0
2,1,1,1,28,0,0,0,0,1,0,...,1,5,30,30,1,0,9,4,8,0
3,1,0,1,27,0,0,0,1,1,1,...,0,2,0,0,0,0,11,3,6,0
4,1,1,1,24,0,0,0,1,1,1,...,0,2,3,0,0,0,11,5,4,0


# **Preprocessing**

In [None]:
# Remove unnecessary column if present
if "Unnamed: 0" in df.columns:
    df.drop(columns=["Unnamed: 0"], inplace=True)
    print("Column 'Unnamed: 0' removed.")

# Check for and remove duplicate rows
no_of_duplicates = df.duplicated().sum()
df = df.drop_duplicates()
print(f"Number of duplicate rows removed: {no_of_duplicates}")

df = df.dropna()
print("Rows with missing values removed.")

df = df.reset_index(drop=True)
print("Index reset.")

df.head()

Number of duplicate rows removed: 24206
Rows with missing values removed.
Index reset.


Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
0,1,1,1,40,1,0,0,0,0,1,...,0,5,18,15,1,0,9,4,3,0
1,0,0,0,25,1,0,0,1,0,0,...,1,3,0,0,0,0,7,6,1,0
2,1,1,1,28,0,0,0,0,1,0,...,1,5,30,30,1,0,9,4,8,0
3,1,0,1,27,0,0,0,1,1,1,...,0,2,0,0,0,0,11,3,6,0
4,1,1,1,24,0,0,0,1,1,1,...,0,2,3,0,0,0,11,5,4,0


# **Class Balancing**

In [None]:
# Define target variable
target_col = "Diabetes_binary"
X = df.drop(columns=[target_col])  # Features
y = df[target_col]  # Target variable

#Original Data
df_original = pd.DataFrame(X, columns=X.columns)
df_original[target_col] = y  # Add target column back

# Check class distribution before balancing
print("\nClass Distribution Before Balancing:")
print(y.value_counts(), "\n")

# Apply Random Undersampling to balance the dataset
undersampler = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(X, y)

# Convert back to DataFrame
df_balanced = pd.DataFrame(X_resampled, columns=X.columns)
df_balanced[target_col] = y_resampled  # Add target column back

print("Dataset balanced using Random Undersampling. \nNew class distribution:")
print(df_balanced[target_col].value_counts())


Class Distribution Before Balancing:
Diabetes_binary
0    194377
1     35097
Name: count, dtype: int64 

Dataset balanced using Random Undersampling. 
New class distribution:
Diabetes_binary
0    35097
1    35097
Name: count, dtype: int64


# **Visualization**

In [1]:
 # Plot class distribution before balancing
plt.figure(figsize=(6, 4))
sns.barplot(x=df_original[target_col].value_counts().index,
            y=df_original[target_col].value_counts().values,
            palette="viridis", hue = df_original[target_col].value_counts().index)
plt.xlabel("Diabetes Class (0 = No Diabetes, 1 = Diabetes)")
plt.ylabel("Number of Samples")
plt.title("Class Distribution Before Undersampling")
plt.show()

# Display the first few rows of the cleaned and processed dataset
df_balanced.head()

 # Plot class distribution after balancing
plt.figure(figsize=(6, 4))
sns.barplot(x=df_balanced[target_col].value_counts().index,
            y=df_balanced[target_col].value_counts().values,
            palette="viridis", hue = df_balanced[target_col].value_counts().index)
plt.xlabel("Diabetes Class (0 = No Diabetes, 1 = Diabetes)")
plt.ylabel("Number of Samples")
plt.title("Class Distribution After Undersampling")
plt.show()

# Display the first few rows of the cleaned and processed dataset
df_balanced.head()

NameError: name 'plt' is not defined

# **Standardization**

In [None]:
# Identify numerical columns (excluding categorical features)
num_cols = ["BMI", "MentHlth", "PhysHlth", "Age", "Education", "Income"]

# Standardize numerical features
scaler = StandardScaler()
df_balanced[num_cols] = scaler.fit_transform(df_balanced[num_cols])

print("Numerical features standardized.")
print("Rows affected: ", num_cols)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_balanced.drop(columns=[target_col]),
                                                    df_balanced[target_col],
                                                    test_size=0.2,
                                                    random_state=42)

print("\nData split into training and testing sets.")

Numerical features standardized.
Rows affected:  ['BMI', 'MentHlth', 'PhysHlth', 'Age', 'Education', 'Income']

Data split into training and testing sets.


# **Principal Component Analysis**

In [None]:
# Fit PCA without limiting components
pca_full = PCA().fit(X_train)
print("Total components: " + str(pca_full.n_components_))

# Compute cumulative explained variance
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)

# Find the number of components that retain at least 95% variance
n_components_95 = np.argmax(cumulative_variance >= 0.95) + 1  # +1 since index starts at 0
print(f"Number of components to retain 95% variance: {n_components_95}")

# Apply PCA with optimal number of components
pca = PCA(n_components=n_components_95)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Check total variance retained
total_variance_retained = np.sum(pca.explained_variance_ratio_)
print(f"Total variance retained: {total_variance_retained:.2f}")

Total components: 21
Number of components to retain 95% variance: 15
Total variance retained: 0.96


# **Model Training and Evaluation**


**Model and Dataset Definition**

In [None]:
# Define datasets
datasets = {
    #"Original": (X_train, X_test),
    #"Feature Importance": (X_train_FI, X_test_FI),
    "PCA": (X_train_pca, X_test_pca),
    #"PCA + Feature Importance": (X_train_pca_fi, X_test_pca_fi)
}

# Define models
models = {
    "Gaussian Process Classifier": GaussianProcessClassifier(),
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Support Vector Machine": SVC(probability=True),
    "Gradient Boosting Classifier": GradientBoostingClassifier()
}

# Store results
results = []

# Define sampling percentage for Gaussian Process Classifier due to computational needs
gpc_sample_pct = 0.10
print("\nModels defined")


Models defined


**Models Execution**

In [None]:
# Iterate through datasets and models
for dataset_name, (X_tr, X_te) in datasets.items():
    for model_name, model in models.items():
        start_time = time.time()

        # If Gaussian Process Classifier, sample 10% of the data while keeping class balance
        if model_name == "Gaussian Process Classifier":
            X_tr_sampled, y_tr_sampled = resample(
                X_tr, y_train,
                replace=False,
                stratify=y_train,
                n_samples=int(len(X_tr) * gpc_sample_pct),
                random_state=42
            )
        else:
            X_tr_sampled, y_tr_sampled = X_tr, y_train

        # Train model
        print(f"Training {model_name} on {dataset_name}...")
        model.fit(X_tr_sampled, y_tr_sampled)
        y_pred = model.predict(X_te)
        y_prob = model.predict_proba(X_te)[:, 1] if hasattr(model, "predict_proba") else None

        # Evaluate metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_prob) if y_prob is not None else None
        execution_time = time.time() - start_time

        # Store result
        results.append({
            "Model": f"{model_name}",
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "AUC-ROC": auc,
            "Time (s)": execution_time
        })

Training Gaussian Process Classifier on PCA...
Training Logistic Regression on PCA...
Training Random Forest on PCA...
Training Support Vector Machine on PCA...
Training Gradient Boosting Classifier on PCA...


**Model Results**

# **Initial Results**

In [None]:
# Convert to DataFrame
results_df = pd.DataFrame(results)

# Save to CSV
results_df.to_csv("model_evaluation_results.csv", index=False)

print("Model evaluation complete. Results saved to 'model_evaluation_results.csv'.")

Model evaluation complete. Results saved to 'model_evaluation_results.csv'.


# **Hyperparmeter Tuning on Select Models**


**Grid Search for Logistic Regression & Random Forest**

In [None]:
# Define parameter grids
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'penalty': ['l1', 'l2'],  # Type of regularization
    'solver': ['liblinear']  # Solver for small datasets
}

param_grid_rf = {
    'n_estimators': [50, 100, 200],  # Number of trees
    'max_depth': [None, 10, 20],  # Maximum depth of trees
    'min_samples_split': [2, 5, 10],  # Minimum samples to split
    'min_samples_leaf': [1, 2, 4]  # Minimum samples per leaf
}

# Initialize models
lr = LogisticRegression()
rf = RandomForestClassifier()

# Apply Grid Search
grid_search_lr = GridSearchCV(lr, param_grid_lr, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='roc_auc', n_jobs=-1)

# Fit models (replace X_train, y_train with your dataset variables)
grid_search_lr.fit(X_train_pca, y_train)
grid_search_rf.fit(X_train_pca, y_train)

# Best parameters & scores
print("Best Logistic Regression Parameters:", grid_search_lr.best_params_)
print("Best Logistic Regression AUC-ROC:", grid_search_lr.best_score_)
print("Best Random Forest Parameters:", grid_search_rf.best_params_)
print("Best Random Forest AUC-ROC:", grid_search_rf.best_score_)


Best Logistic Regression Parameters: {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}
Best Logistic Regression AUC-ROC: 0.8035975544746566
Best Random Forest Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Best Random Forest AUC-ROC: 0.8053679003488252


**Bayesian Optimization for SVM & Gradient Boosting**

In [None]:
# Define search spaces
param_space_svm = {
    'C': (0.01, 100, 'log-uniform'),  # Regularization strength
    'gamma': (0.0001, 1, 'log-uniform'),  # Kernel coefficient
    'kernel': ['linear', 'rbf', 'poly']  # Kernel type
}

param_space_gbc = {
    'learning_rate': (0.01, 0.2, 'log-uniform'),  # Step size
    'n_estimators': (50, 500),  # Trees in boosting
    'max_depth': (3, 10),  # Tree depth
    'subsample': (0.5, 1.0),  # Fraction of samples used
    'min_samples_split': (2, 10)  # Minimum samples to split
}

# Initialize models
svm = SVC(probability=True)  # For AUC-ROC
gbc = GradientBoostingClassifier()

# Apply Bayesian Optimization
bayes_search_svm = BayesSearchCV(svm, param_space_svm, cv=5, scoring='roc_auc', n_iter=30, n_jobs=-1)
bayes_search_gbc = BayesSearchCV(gbc, param_space_gbc, cv=5, scoring='roc_auc', n_iter=30, n_jobs=-1)

# Fit models
print("Fitting SVM...")
bayes_search_svm.fit(X_train_pca, y_train)
print("Fitting Gradient Boosting Classifier...")
bayes_search_gbc.fit(X_train_pca, y_train)

# Best parameters & scores
print("Best SVM Parameters:", bayes_search_svm.best_params_)
print("Best SVM AUC-ROC:", bayes_search_svm.best_score_)
print("Best GBC Parameters:", bayes_search_gbc.best_params_)
print("Best GBC AUC-ROC:", bayes_search_gbc.best_score_)

Fitting SVM...


# **Post Tuning Results**

In [None]:
# Retrieve best hyperparameters
best_params_lr = grid_search_lr.best_params_
best_params_rf = grid_search_rf.best_params_
best_params_svm = bayes_search_svm.best_params_
best_params_gbc = bayes_search_gbc.best_params_

print("Best Logistic Regression Parameters:", best_params_lr)
print("Best Random Forest Parameters:", best_params_rf)
print("Best SVM Parameters:", best_params_svm)
print("Best Gradient Boosting Classifier Parameters:", best_params_gbc)

# Define optimized models with best parameters
best_lr = LogisticRegression(**best_params_lr)
best_rf = RandomForestClassifier(**best_params_rf)
best_svm = SVC(probability=True, **best_params_svm)
best_gbc = GradientBoostingClassifier(**best_params_gbc)

# Train models on the full training dataset
best_lr.fit(X_train_pca, y_train)
best_rf.fit(X_train_pca, y_train)
best_svm.fit(X_train_pca, y_train)
best_gbc.fit(X_train_pca, y_train)

# Test models on test data
y_pred_lr = best_lr.predict(X_test_pca)
y_pred_rf = best_rf.predict(X_test_pca)
y_pred_svm = best_svm.predict(X_test_pca)
y_pred_gbc = best_gbc.predict(X_test_pca)

# Get probability scores for AUC-ROC
y_prob_lr = best_lr.predict_proba(X_test_pca)[:, 1]
y_prob_rf = best_rf.predict_proba(X_test_pca)[:, 1]
y_prob_svm = best_svm.predict_proba(X_test_pca)[:, 1]
y_prob_gbc = best_gbc.predict_proba(X_test_pca)[:, 1]

# Evaluate model performance
results = []

def evaluate_model(name, y_test, y_pred, y_prob):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)

    return {
        "Model": name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "AUC-ROC": auc
    }

# Store results for both models
results.append(evaluate_model("Optimized Logistic Regression", y_test, y_pred_lr, y_prob_lr))
results.append(evaluate_model("Optimized Random Forest", y_test, y_pred_rf, y_prob_rf))
results.append(evaluate_model("Optimized SVM", y_test, y_pred_svm, y_prob_svm))
results.append(evaluate_model("Optimized Gradient Boosting Classifier", y_test, y_pred_gbc, y_prob_gbc))

# Convert results to DataFrame
df_results = pd.DataFrame(results)

# Save results to CSV
df_results.to_csv("optimized_model_results.csv", index=False)

print("\nOptimized models evaluated and results saved to 'optimized_model_results.csv'!")