In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import (
    BatchNormalization,
    Conv1D,
    Dense,
    Dropout,
    Flatten,
    ReLU,
)
from tensorflow.keras.models import Sequential

# NOTE(review): RandomSearch and KerasClassifier are used further down but no
# import for them is visible in this notebook. Add the matching imports
# (presumably keras_tuner and a Keras scikit-learn wrapper) -- confirm which
# packages the runtime provides.
warnings.filterwarnings("ignore")

In [None]:
# Pin every random number generator the notebook touches (NumPy, TensorFlow and
# the standard library) to the same seed.
import random

import tensorflow as tf
from numpy.random import seed

seed(42)
tf.random.set_seed(42)
random.seed(42)

In [None]:
# Mount Google Drive inside the Colab runtime; the CSV and the cached SHAP
# explainer used below are read from paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the ';'-separated CSV from the mounted Drive folder and preview the
# first ten rows.
file_path = '/content/drive/My Drive/DS340/CHD_men.csv'
male_data = pd.read_csv(filepath_or_buffer=file_path, sep=";")

male_data.head(n=10)

In [None]:
def gender_model(hp):
    """Build and compile the tunable 1-D CNN for the binary `cardio` target.

    The structure is based on the CNN from "paper 2", with these deviations:
    the convolutions use ``padding='same'`` (the paper uses no padding), no
    pooling layer follows the convolutions, and the output is a single sigmoid
    unit instead of softmax because the task is binary classification.

    Layer stack, in order:
      1. Dense (tuned width) -> BatchNormalization -> ReLU -> Dropout (tuned rate)
      2. Conv1D, 2 filters, kernel 4, stride 2 -> BatchNormalization -> ReLU
      3. Conv1D, 4 filters, kernel 6, stride 2 -> BatchNormalization -> ReLU
      4. Flatten
      5. Dense (tuned width) -> BatchNormalization -> ReLU -> Dropout (tuned rate)
      6. Dense with 1 sigmoid unit

    Args:
        hp: Hyperparameter container passed in by the tuner. Its ``Int`` and
            ``Float`` methods are used to declare the two dense widths
            (32..512, step 32) and the two dropout rates (0.1..0.5, step 0.05).

    Returns:
        The model compiled with the Adam optimizer, binary cross-entropy loss
        and an accuracy metric.
    """
    net = Sequential()

    def add_norm_and_activation():
        # Every dense/conv layer below is followed by the same BN + ReLU pair.
        net.add(BatchNormalization())
        net.add(ReLU())

    # Dense block 1. The input is (11, 1): 11 features with one channel each.
    # The hyperparameter name 'Dense1_neuonrs' is misspelled; it is left as-is
    # so the hyperparameter keeps the same name across tuner runs.
    first_width = hp.Int('Dense1_neuonrs', min_value=32, max_value=512, step=32)
    net.add(Dense(units=first_width, input_shape=(11, 1)))
    add_norm_and_activation()
    net.add(Dropout(hp.Float('Dense1_dropout', min_value=0.1, max_value=0.5, step=0.05)))

    # Two strided convolutions: (filters, kernel_size) = (2, 4) then (4, 6).
    for n_filters, kernel in ((2, 4), (4, 6)):
        net.add(Conv1D(filters=n_filters,
                       kernel_size=kernel,
                       strides=2,
                       padding='same'))
        add_norm_and_activation()

    # Collapse the (steps, filters) conv output into one vector per sample so
    # the dense head produces a single prediction per sample.
    net.add(Flatten())

    # Dense block 2.
    second_width = hp.Int('Dense2_neurons', min_value=32, max_value=512, step=32)
    net.add(Dense(units=second_width))
    add_norm_and_activation()
    net.add(Dropout(hp.Float('Dense2_dropout', min_value=0.1, max_value=0.5, step=0.05)))

    # Output: one sigmoid unit giving the probability of the positive class.
    net.add(Dense(units=1, activation='sigmoid'))

    net.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return net

In [None]:
# The target is the `cardio` column; every other column except `gender` is
# used as a feature.
y_male = male_data['cardio']
X_male = male_data.drop(columns=['gender', 'cardio'])

# Hold out 20% of the rows for testing, stratified on the label.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X_male,
    y_male,
    test_size=0.2,
    random_state=42,
    stratify=y_male,
)

# Carve 25% of the remaining rows off as a validation set (60/20/20 overall),
# again stratified on the label.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.25,
    random_state=42,
    stratify=y_train_full,
)

In [None]:
# Standardise the features. The scaler is fitted on the training split only and
# the same fitted scaler is then applied to the validation and test splits, so
# no statistics from held-out data reach the model.
# (The scaler must be referenced by the name it was created under,
# `scaler_male`; calling an undefined `scaler` raises NameError on a fresh kernel.)
scaler_male = StandardScaler()
X_train = scaler_male.fit_transform(X_train)
X_val = scaler_male.transform(X_val)
X_test = scaler_male.transform(X_test)

In [None]:
# Add a trailing axis of size 1 to each split so the arrays match the model's
# (features, 1) input shape.
X_train, X_val, X_test = (
    split.reshape(split.shape[0], split.shape[1], 1)
    for split in (X_train, X_val, X_test)
)

In [None]:
# Random search over the hyperparameters declared in gender_model, ranked by
# validation accuracy: 3 trials, each executed 3 times.
tuner = RandomSearch(
    gender_model,
    objective='val_accuracy',
    max_trials=3,
    executions_per_trial=3,
    directory='./DS340_CHD',
    project_name='tuner_male',
)

# Run the search: 10 epochs per execution, scored on the validation split.
tuner.search(
    X_train,
    y_train,
    epochs=10,
    validation_data=(X_val, y_val),
)

# Take the top-ranked model and report its loss and accuracy on the test split.
best_model = tuner.get_best_models(num_models=1)[0]
test_loss, test_acc = best_model.evaluate(X_test, y_test)
print(f'Test Loss (after tuning): {test_loss}, Test Accuracy (after tuning): {test_acc}')

In [None]:
# Hyperparameters of the best trial from the tuner search. They must be fetched
# explicitly: build_best_model reads `best_hps`, and without this line the name
# is undefined when the classifier tries to build its model.
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]


def build_best_model():
    """Return a new, untrained model built from the tuner's best hyperparameters.

    Each call goes through ``tuner.hypermodel.build``, so every estimator in
    the bagging ensemble gets its own freshly built model.
    """
    return tuner.hypermodel.build(best_hps)


# Wrap the Keras model so it can be used as a scikit-learn estimator.
model = KerasClassifier(build_fn=build_best_model, epochs=20, batch_size=32, verbose=0)

# Bagging ensemble of 10 such classifiers.
bagging_model = BaggingClassifier(estimator=model, n_estimators=10, random_state=42)

# Flatten X_train for the BaggingClassifier: one flat feature row per sample.
X_train_flat = X_train.reshape(X_train.shape[0], -1)

# Fit bagging with flattened data
bagging_model.fit(X_train_flat, y_train)

In [None]:
# Score the bagged ensemble on the held-out test split.
from sklearn.metrics import accuracy_score

# The ensemble was fitted on flattened rows, so flatten the test samples the
# same way before predicting.
y_pred = bagging_model.predict(X_test.reshape(len(X_test), -1))

test_acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Test Accuracy: {test_acc}')

In [None]:
import shap

# Work with 50-row subsamples of the train and test splits for the SHAP
# analysis below.
X_train_sample, X_test_sample = (
    shap.sample(split, 50) for split in (X_train, X_test)
)

In [None]:
# Flatten each sampled instance to a single feature row (drops the trailing
# channel axis) so it can be passed to the bagging model and the explainers.
X_train_reshaped = X_train_sample.reshape(len(X_train_sample), -1)
X_test_reshaped = X_test_sample.reshape(len(X_test_sample), -1)

In [None]:
import joblib
import os

drive_path = '/content/drive/My Drive/DS340/'  # Replace with your folder name
explainer_file = os.path.join(drive_path, 'shap_explainer_males.pkl')

# Reuse the explainer cached on Drive when there is one; otherwise build it from
# the 50-row training subsample and cache it for the next run.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load a file you wrote yourself. A cached explainer also wraps the
# model it was created with, not the `bagging_model` trained in this session;
# delete the file after retraining.
if os.path.exists(explainer_file):
    explainer = joblib.load(explainer_file)
else:
    # `approximate` is not a KernelExplainer option, so it is not passed here.
    explainer = shap.KernelExplainer(bagging_model.predict, X_train_reshaped)

    # saving explainer to google drive
    joblib.dump(explainer, explainer_file)

# Calculate SHAP values (this might take some time). This runs on both paths:
# the plots below need `shap_values` whether or not the explainer was cached.
shap_values = explainer.shap_values(X_test_reshaped)

In [None]:
# Take the feature names from the feature frame so there is exactly one label
# per model input, in column order. A hand-written list of 10 names does not
# match the 11 features the model is trained on (input_shape=(11, 1)), which
# breaks or mislabels the SHAP plots.
# NOTE(review): if the CSV carries a row-identifier column it is currently one
# of the 11 features -- confirm whether it should be dropped before training.
feature_names = list(X_male.columns)
shap.summary_plot(shap_values, X_test_reshaped, feature_names=feature_names, plot_type='bar')

In [None]:
# Waterfall plot for the first row of the SHAP test subsample, starting from
# the explainer's expected value.
shap.waterfall_plot(
    shap.Explanation(
        values=shap_values[0],
        base_values=explainer.expected_value,
        data=X_test_reshaped[0],
        feature_names=feature_names,
    )
)

In [None]:
# Decision plot for the SHAP values of the first row of the test subsample.
shap.decision_plot(
    explainer.expected_value,
    shap_values[0],
    X_test_reshaped,
    feature_names=feature_names,
)

In [None]:
!pip install lime
from lime.lime_tabular import LimeTabularExplainer

In [None]:
# LIME needs an explainer built from the training features before any instance
# can be explained. It is fitted on the flattened training split, with feature
# names taken from the feature frame so they match the columns one-to-one.
lime_explainer = LimeTabularExplainer(
    training_data=X_train.reshape(X_train.shape[0], -1),
    feature_names=list(X_male.columns),
    mode='classification',
    random_state=42,  # keeps LIME's sampling repeatable between runs
)

# Explain a single prediction: the first row of the SHAP test subsample.
test_instance = X_test_reshaped[0]

# Generate explanation for the prediction
explanation = lime_explainer.explain_instance(
    data_row=test_instance,
    predict_fn=bagging_model.predict_proba  # Function to predict probabilities
)

explanation.show_in_notebook(show_table=True)

In [None]:
# LIME explanation for a second instance: the row at index 10 of the SHAP test
# subsample.
test_instance = X_test_reshaped[10]

# The ensemble's class probabilities are what LIME approximates locally.
explanation = lime_explainer.explain_instance(
    test_instance,
    bagging_model.predict_proba,
)

explanation.show_in_notebook(show_table=True)