<a href="https://colab.research.google.com/github/akpanitorobong/7135CEM-Modelling_and_Optimization_Under_Uncertainty/blob/main/7135CEM_CW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Imports, Dataset Loading and Preprocessing**

**Necessary Imports**

In [11]:
print("Importing necessary libraries...")
!pip install ucimlrepo
!pip install pgmpy

import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical computations
import seaborn as sns  # For data visualization
import matplotlib.pyplot as plt  # For plotting graphs

from sklearn.model_selection import train_test_split
from sklearn.gaussian_process import GaussianProcessClassifier, GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import MaximumLikelihoodEstimator, BayesianEstimator
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
from ucimlrepo import fetch_ucirepo # For dataset import
from sklearn.model_selection import train_test_split, cross_val_score  # For splitting data and cross-validation
from sklearn.preprocessing import StandardScaler, LabelEncoder  # For feature scaling and encoding categorical variables
from sklearn.ensemble import RandomForestClassifier  # Random Forest model for classification
from sklearn.linear_model import LogisticRegression  # Logistic Regression model for baseline comparison
from sklearn.gaussian_process import GaussianProcessClassifier  # Gaussian Process Classification model
from sklearn.gaussian_process.kernels import RBF, Matern  # RBF and Matérn kernels for Gaussian Process
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc, confusion_matrix  # Evaluation metrics
#Dataset balancing
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler

print("Libraries imported successfully.")

Importing necessary libraries...
Libraries imported successfully.


**Load Dataset**

In [4]:
print("Loading dataset...")
# fetch dataset
cdc_diabetes_health_indicators = fetch_ucirepo(id=891)

# data (as pandas dataframes)
X = cdc_diabetes_health_indicators.data.features
y = cdc_diabetes_health_indicators.data.targets
df = pd.concat([X, y], axis=1)
df.to_csv('diabetes_health_indicators.csv', index=False)
print("Dataset loaded successfully.")
df.head()

Loading dataset...
Dataset loaded successfully.


Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
0,1,1,1,40,1,0,0,0,0,1,...,0,5,18,15,1,0,9,4,3,0
1,0,0,0,25,1,0,0,1,0,0,...,1,3,0,0,0,0,7,6,1,0
2,1,1,1,28,0,0,0,0,1,0,...,1,5,30,30,1,0,9,4,8,0
3,1,0,1,27,0,0,0,1,1,1,...,0,2,0,0,0,0,11,3,6,0
4,1,1,1,24,0,0,0,1,1,1,...,0,2,3,0,0,0,11,5,4,0


**Preprocessing**

In [5]:
# Check for missing values
print("Missing values:")
print(df.isnull().sum())

# Check for duplicates
print("\nDuplicate rows:")
print(df.duplicated().sum())

df.drop_duplicates(inplace=True)

print("\nDuplicate rows removed.")
print("\nDuplicate rows:")
print(df.duplicated().sum())

Missing values:
HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
Diabetes_binary         0
dtype: int64

Duplicate rows:
24206

Duplicate rows removed.

Duplicate rows:
0


# **Exploratory Data Analysis, Dataset Balancing and Splitting**

**EDA**

In [None]:
print("Visualizing data distribution...")
#Countplot for Target Variable
sns.countplot(x= df['Diabetes_binary'])
plt.xlabel("Diabetes Binary")
plt.ylabel("Count")
plt.title("Diabetes Distribution")
plt.show()

# Generate correlation heatmap to check relationships between features
plt.figure(figsize=(6, 6))
sns.heatmap(df.corr(), annot=False, cmap='coolwarm')
plt.title("Feature Correlation Heatmap")
plt.show()

**Scaling, Balancing(Undersampling) and Splitting**

In [None]:
# Feature scaling using StandardScaler to normalize data
scaler = StandardScaler()
features = df.drop(columns=['Diabetes_binary'])  # Input features
target = df['Diabetes_binary']  # Target variable
X_scaled = scaler.fit_transform(features)  # Apply scaling to features
print("Features scaled successfully.")

# Handling class imbalance using Random Under-Sampling
undersampler = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(X_scaled, target)
print("\nClass imbalance handled successfully. \nNew class distribution:", Counter(y_resampled))

# Define sample size
sample_size = 10000

# Stratified sampling to maintain class balance
X_sampled, _, y_sampled, _ = train_test_split(
    X_resampled, y_resampled, train_size=sample_size, stratify=y_resampled, random_state=42
)
print("\nData sampled successfully. \nNew distr size:", Counter(y_sampled))

# Now split into train-test sets
X_train, X_test, y_train, y_test = train_test_split(X_sampled, y_sampled, test_size=0.2, random_state=42)

print("\nDataset split successfully.")

# **Models**

**Model Selection and Implementation**

In [35]:
# 1. Gaussian Process Classification (GPC)
kernel = 1.0 * Matern(length_scale=1.5, nu=1.5)  # Using Matérn kernel for efficiency
gpc = GaussianProcessClassifier(kernel=kernel, random_state=42)
gpc.fit(X_train, np.ravel(y_train))
gpc_preds = gpc.predict(X_test)
gpc_proba = gpc.predict_proba(X_test)[:, 1]
print("GPC Accuracy:", accuracy_score(y_test, gpc_preds))
print("GPC AUC Score:", ro_auc_score(y_test, gpc_proba))
print("GPC F1 Score:", f1_score(y_test, gpc_preds))

GPC Accuracy: 0.7385


NameError: name 'ro_auc_score' is not defined

In [37]:
print("GPC Accuracy:", accuracy_score(y_test, gpc_preds))
#print("GPC AUC Score:", ro_auc_score(y_test, gpc_proba))
print("GPC F1 Score:", f1_score(y_test, gpc_preds))

GPC Accuracy: 0.7385
GPC F1 Score: 0.7474649927571222


In [19]:
# 3. Random Forest (Baseline Model)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, rf_preds))
print("Random Forest F1:", f1_score(y_test, rf_preds))

  return fit_method(estimator, *args, **kwargs)


Random Forest Accuracy: 0.86
Random Forest F1: 0.0


In [1]:
# 5. Gaussian Process Regression (GPR) for Risk Score Prediction
y_continuous = np.random.rand(len(y))  # Placeholder for risk scores
gpr = GaussianProcessRegressor(kernel=1.0 * RBF(length_scale=1.0))
gpr.fit(X_train, y_continuous[:len(X_train)])
gpr_preds = gpr.predict(X_test)
print("GPR Predictions:", gpr_preds[:5])

NameError: name 'np' is not defined