In [1]:
# Import necessary libraries
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import keras_tuner as kt
import shap
import matplotlib.pyplot as plt

# Load and preprocess data
data = pd.read_csv('../2 - Data/develop.csv')  # Update the path to your dataset
target = 'Ins'  # Define the target variable

# Preprocess data
X = pd.get_dummies(data.drop(columns=[target]), drop_first=True)
y = data[target]

# Apply SMOTE for class balancing
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_resampled)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42)

# Define model-building function with tuning
def build_model(hp):
    model = Sequential()
    model.add(Dense(units=hp.Int('units_input', min_value=32, max_value=512, step=32), activation='relu', input_dim=X_train.shape[1]))
    for i in range(hp.Int('num_layers', 1, 4)):  # Up to 4 hidden layers
        model.add(Dense(units=hp.Int(f'units_{i}', min_value=32, max_value=512, step=32), activation='relu'))
        model.add(Dropout(rate=hp.Float(f'dropout_{i}', min_value=0.0, max_value=0.5, step=0.1)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Hyperparameter tuning with Keras Tuner
tuner = kt.BayesianOptimization(
    build_model,
    objective='val_accuracy',
    max_trials=20,
    executions_per_trial=2,
    directory='ktuner_dir',
    project_name='DNN_tuning'
)

# Perform hyperparameter search with increased epochs
tuner.search(X_train, y_train, epochs=50, validation_split=0.2, verbose=1)

# Get the best model
best_model = tuner.get_best_models(num_models=1)[0]

# Evaluate the best model on the test set
loss, accuracy = best_model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

# Save the best model
best_model.save('best_dnn_model_with_xai.h5')

# Model summary
best_model.summary()

# SHAP analysis to explain DNN predictions
explainer = shap.DeepExplainer(best_model, X_train[:100])  # Limiting to first 100 samples for performance
shap_values = explainer.shap_values(X_test[:100])

# SHAP summary plot
print("\nSHAP Summary Plot for DNN")
shap.summary_plot(shap_values[0], X_test[:100], feature_names=X.columns)

# SHAP force plot for individual predictions
print("\nSHAP Force Plot for first prediction in test set")
shap.force_plot(explainer.expected_value[0], shap_values[0][0], X_test[:100][0], feature_names=X.columns)
plt.show()

Trial 20 Complete [00h 01m 41s]
val_accuracy: 0.7924136817455292

Best val_accuracy So Far: 0.7930063605308533
Total elapsed time: 00h 30m 05s
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 379us/step - accuracy: 0.7917 - loss: 0.4310


  saveable.load_own_variables(weights_store.get(inner_path))


Test Accuracy: 0.7960





SHAP Summary Plot for DNN


AssertionError: The shape of the shap_values matrix does not match the shape of the provided data matrix.

In [2]:
model_json = best_model.to_json()
with open("best_dnn_model_architecture.json", "w") as json_file:
    json_file.write(model_json)