This notebook explores a dataset containing patient information.
The following 8 features are used to predict the mortality rate:
1. Age
2. ОССН Killip (Killip class of acute heart failure)
3. HBR (b)
4. Systolic AP(b)
5. Creatinine in blood
6. EF Percentage
7. White blood cell count
8. Glucose

1. Import required libraries and load the dataset

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.inspection import permutation_importance

import shap

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import L1L2
from tensorflow.keras.callbacks import EarlyStopping

# Read the Excel workbook, then separate the regression target
# ('mortality_rate') from the remaining columns, which serve as predictors.
data = pd.read_excel('./import/DataSet_V47.xlsx')
y = data['mortality_rate']
X = data.drop(columns='mortality_rate')

2. Split the dataset into train, fit, and test sets

In [None]:
# Two-stage split: 40% of the rows are held out first, then that hold-out is
# halved, giving 60% train / 20% fit / 20% test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)
X_fit, X_test, y_fit, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)


3. Data preprocessing pipeline

In [None]:
# Mean-impute missing values, then standardize every column.
preprocessor = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# Statistics (column means / scales) are learned from the training split only;
# the fit and test splits are transformed with those same statistics.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_fit_preprocessed, X_test_preprocessed = (
    preprocessor.transform(split) for split in (X_fit, X_test)
)

4. Linear Regression models (Univariate and Multivariate)

In [None]:
# Univariate Linear Regression: one single-feature model per column.
# enumerate() gives the positional column index directly, which also stays
# correct if a column label happens to be duplicated.
for feature_idx, feature_name in enumerate(X.columns):
    lr = LinearRegression()
    lr.fit(X_train_preprocessed[:, [feature_idx]], y_train)

    # Feature weight estimation
    print(f"Feature weight for {feature_name}: {lr.coef_[0]}")

# Multivariate Linear Regression: all features fitted jointly.
lr_multi = LinearRegression()
lr_multi.fit(X_train_preprocessed, y_train)

# Feature weight estimation
for feature_name, weight in zip(X.columns, lr_multi.coef_):
    print(f"Feature weight for {feature_name}: {weight}")

# Shapley values for Multivariate Linear Regression
explainer = shap.Explainer(lr_multi, X_train_preprocessed, feature_names=list(X.columns))
shap_values = explainer(X_fit_preprocessed)
# show=False: summary_plot otherwise renders (and finishes) the figure itself,
# so the title below would end up on a new, empty figure.
shap.summary_plot(shap_values, X_fit, plot_type='bar', show=False)
plt.title('Shapley Values for Multivariate Linear Regression')
plt.show()


5. Gradient Boosting model

In [None]:
# Fit the gradient boosting regressor on the preprocessed training split.
gb = GradientBoostingRegressor(random_state=42)
gb.fit(X_train_preprocessed, y_train)

# Feature importance via impurity
feature_importances_gb = gb.feature_importances_

# Permutation importance, measured on the held-out "fit" split
result_gb = permutation_importance(gb, X_fit_preprocessed, y_fit, n_repeats=10, random_state=42)
perm_importances_gb = result_gb.importances_mean

# Shapley values for Gradient Boosting
explainer = shap.Explainer(gb, X_train_preprocessed, feature_names=list(X.columns))
shap_values = explainer(X_fit_preprocessed)
# show=False: summary_plot otherwise renders (and finishes) the figure itself,
# so the title below would end up on a new, empty figure.
shap.summary_plot(shap_values, X_fit, plot_type='bar', show=False)
plt.title('Shapley Values for Gradient Boosting')
plt.show()

6. Random Forest model

In [None]:
# Fit the random forest regressor on the preprocessed training split.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_preprocessed, y_train)

# Feature importance via impurity
feature_importances_rf = rf.feature_importances_

# Permutation importance, measured on the held-out "fit" split
result_rf = permutation_importance(rf, X_fit_preprocessed, y_fit, n_repeats=10, random_state=42)
perm_importances_rf = result_rf.importances_mean

# Shapley values for Random Forest
explainer = shap.Explainer(rf, X_train_preprocessed, feature_names=list(X.columns))
shap_values = explainer(X_fit_preprocessed)
# show=False: summary_plot otherwise renders (and finishes) the figure itself,
# so the title below would end up on a new, empty figure.
shap.summary_plot(shap_values, X_fit, plot_type='bar', show=False)
plt.title('Shapley Values for Random Forest')
plt.show()

7. Deep Neural Network model

In [None]:
def create_dnn(input_shape, num_layers, units_per_layer, l1_l2_reg, dropout_rate):
    """Assemble and compile a fully connected network with a single linear output.

    Args:
        input_shape: Shape tuple passed to the first Dense layer.
        num_layers: Number of hidden Dense+Dropout blocks. At least one block
            is always created, even when this value is below 1.
        units_per_layer: Width of every hidden Dense layer.
        l1_l2_reg: Pair unpacked positionally into ``L1L2`` for the kernel
            regularizer of each hidden layer.
        dropout_rate: Rate of the Dropout layer that follows each hidden layer.

    Returns:
        A compiled ``Sequential`` model (Adam, learning rate 0.001,
        mean-squared-error loss).
    """
    network = Sequential()

    for depth in range(max(num_layers, 1)):
        # Only the very first layer declares the input shape.
        shape_kwargs = {'input_shape': input_shape} if depth == 0 else {}
        hidden = Dense(
            units_per_layer,
            activation='relu',
            kernel_regularizer=L1L2(*l1_l2_reg),
            **shape_kwargs,
        )
        network.add(hidden)
        network.add(Dropout(dropout_rate))

    # Single linear unit for the regression output.
    network.add(Dense(1, activation='linear'))
    network.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
    return network

def dnn_cross_val_score(X, y, num_layers, units_per_layer, l1_l2_reg, dropout_rate, n_splits=5):
    """Estimate the DNN's validation loss with shuffled K-fold cross-validation.

    For every fold a fresh copy of the module-level ``preprocessor`` is fitted
    on the fold's training part only, a new network is built with
    ``create_dnn`` and trained with early stopping, and the loss on the
    validation part is recorded.

    Args:
        X: Feature matrix; a DataFrame or any 2-D array-like.
        y: Target values; a Series or any 1-D array-like aligned with ``X``.
        num_layers, units_per_layer, l1_l2_reg, dropout_rate: Forwarded to
            ``create_dnn``.
        n_splits: Number of folds.

    Returns:
        Mean of the per-fold values returned by ``model.evaluate`` on the
        validation part.

    Note:
        Early stopping monitors the same validation fold that is scored
        afterwards, so the reported loss is an optimistic estimate.
    """
    from sklearn.base import clone  # local import keeps this cell self-contained

    # KFold yields positional row indices. ``X[idx]`` on a DataFrame is a
    # column lookup (KeyError) and ``y[idx]`` on a Series is label-based, so
    # work on plain arrays where integer-array indexing selects rows.
    X_values = np.asarray(X)
    y_values = np.asarray(y)

    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = []

    for train_index, val_index in kfold.split(X_values, y_values):
        X_fold_train, X_fold_val = X_values[train_index], X_values[val_index]
        y_fold_train, y_fold_val = y_values[train_index], y_values[val_index]

        # Fit an unfitted copy per fold so the shared ``preprocessor`` used by
        # the rest of the notebook is never refitted as a side effect.
        fold_preprocessor = clone(preprocessor)
        X_fold_train_pp = fold_preprocessor.fit_transform(X_fold_train)
        X_fold_val_pp = fold_preprocessor.transform(X_fold_val)

        dnn = create_dnn((X_fold_train_pp.shape[1],), num_layers, units_per_layer, l1_l2_reg, dropout_rate)

        early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

        dnn.fit(X_fold_train_pp,
                y_fold_train,
                validation_data=(X_fold_val_pp, y_fold_val),
                epochs=100,
                batch_size=32,
                verbose=0,
                callbacks=[early_stopping]
                )

        score = dnn.evaluate(X_fold_val_pp, y_fold_val, verbose=0)
        scores.append(score)

    return np.mean(scores)

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling are repeatable across "Restart & Run All".
tf.keras.utils.set_random_seed(42)

# Hyperparameters of the network being evaluated
num_layers = 2
units_per_layer = 64
l1_l2_reg = (1e-5, 1e-5)
dropout_rate = 0.2

# Cross-validate on the training split only, so the held-out fit/test rows do
# not influence this estimate. Plain arrays are passed because KFold yields
# positional indices, which do not select rows when applied to pandas objects
# with ``obj[idx]``.
mean_score = dnn_cross_val_score(
    X_train.to_numpy(),
    y_train.to_numpy(),
    num_layers,
    units_per_layer,
    l1_l2_reg,
    dropout_rate,
)
print(f"Mean score for {num_layers} layers, {units_per_layer} units per layer, L1/L2 reg = {l1_l2_reg}, dropout rate = {dropout_rate}: {mean_score}")

In [None]:
# Seed Python, NumPy and TensorFlow so this training run is repeatable.
tf.keras.utils.set_random_seed(42)

# Train the final network on the training split; the "fit" split drives early stopping.
dnn = create_dnn((X_train_preprocessed.shape[1],), num_layers, units_per_layer, l1_l2_reg, dropout_rate)
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
dnn.fit(X_train_preprocessed, y_train, validation_data=(X_fit_preprocessed, y_fit), epochs=100, batch_size=32, callbacks=[early_stopping])


def predict_flat(batch):
    """Return the network's predictions for ``batch`` as a 1-D array."""
    # The model emits shape (n, 1); flattening gives SHAP one scalar per row,
    # so the attributions come out as (n_samples, n_features).
    return dnn.predict(batch, verbose=0).ravel()


# Shapley values for Deep Neural Network
explainer = shap.Explainer(predict_flat, X_train_preprocessed, feature_names=list(X.columns))
shap_values = explainer(X_fit_preprocessed)
# show=False: summary_plot otherwise renders (and finishes) the figure itself,
# so the title below would end up on a new, empty figure.
shap.summary_plot(shap_values, X_fit, plot_type='bar', show=False)
plt.title('Shapley Values for Deep Neural Network')
plt.show()

8. Comparison of feature importances

In [None]:
# One row per feature, one column per importance measure.
# NOTE(review): the linear-regression column holds signed coefficients, which
# are on a different scale from the tree-based importances plotted beside them.
feature_importance_df = pd.DataFrame(
    {
        'Feature': X.columns,
        'Multivariate Linear Regression': lr_multi.coef_,
        'Gradient Boosting (impurity)': feature_importances_gb,
        'Gradient Boosting (permutation)': perm_importances_gb,
        'Random Forest (impurity)': feature_importances_rf,
        'Random Forest (permutation)': perm_importances_rf,
    }
).set_index('Feature')

# Grouped bar chart: one group of bars per feature.
feature_importance_df.plot.bar(figsize=(12, 6))
plt.title('Feature Importances for Different Models')
plt.ylabel('Importance')
plt.xticks(rotation=45)
plt.show()