Importing Libraries

In [27]:
pip install kmodes



In [28]:
import numpy as np
import pandas as pd
from sklearn import metrics, preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from kmodes.kmodes import KModes
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

In [29]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Reading the data

In [30]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/cardio_train.csv', sep=';')

df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   gender       70000 non-null  int64  
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.9 MB


Checking for missing values

In [32]:
print(df.isnull().sum())

id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64


In [33]:
#drop id
df = df.drop('id', axis=1)

# print(df)

# **Removing Outliers:**

It is important to remove outliers to improve the performance of our prediction models. We removed every record whose height, weight, ap_hi, or ap_lo falls outside the 2.5th–97.5th percentile range of that feature. The filters are applied one feature at a time, so each percentile is computed on the rows that survived the previous filter. This process decreased the number of entries in the data set from 70,000 to 60,142 records.

In [34]:
# Trim each measurement to its own 2.5th-97.5th percentile band, one column at a time.
# The quantiles are recomputed on the rows that survived the previous column's trim.
for column in ('height', 'weight', 'ap_hi', 'ap_lo'):
    upper = df[column].quantile(0.975)
    lower = df[column].quantile(0.025)
    outlier_rows = df[(df[column] > upper) | (df[column] < lower)].index
    df.drop(outlier_rows, inplace=True)
len(df)

60142

How many cases where diastolic pressure is higher than systolic?

In [35]:
df[df['ap_lo']> df['ap_hi']].shape[0]

0

In [36]:
#after removing outliers
df.describe()


Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,60142.0,60142.0,60142.0,60142.0,60142.0,60142.0,60142.0,60142.0,60142.0,60142.0,60142.0,60142.0
mean,19468.719979,1.347311,164.554854,73.426805,125.770526,81.046307,1.350953,1.220229,0.085631,0.051877,0.803648,0.488228
std,2460.510296,0.47612,6.830174,11.614806,13.761847,8.239157,0.670076,0.567607,0.27982,0.221781,0.397241,0.499866
min,10798.0,1.0,150.0,52.0,100.0,60.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,17677.25,1.0,160.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,19705.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,21321.0,2.0,169.0,80.0,135.0,90.0,1.0,1.0,0.0,0.0,1.0,1.0
max,23713.0,2.0,180.0,106.0,163.0,100.0,3.0,3.0,1.0,1.0,1.0,1.0


Transformation:
Converting age from days to years

In [37]:
df['age'] = (df['age'] / 365).round().astype('int')

print(df.head())

   age  gender  height  weight  ap_hi  ap_lo  cholesterol  gluc  smoke  alco  \
0   50       2     168    62.0    110     80            1     1      0     0   
1   55       1     156    85.0    140     90            3     1      0     0   
2   52       1     165    64.0    130     70            3     1      0     0   
3   48       2     169    82.0    150    100            1     1      0     0   
4   48       1     156    56.0    100     60            1     1      0     0   

   active  cardio  
0       1       0  
1       1       1  
2       0       1  
3       1       1  
4       0       0  


Categorizing features

In [38]:
# Explicit bin edges (in years) and the integer label assigned to each 5-year band.
age_edges = [30, 35, 40, 45, 50, 55, 60, 65]
age_labels = [0, 1, 2, 3, 4, 5, 6]

# Bin on the declared edges so every group is a fixed 5-year span: [30-35], (35-40], ... (60-65].
# include_lowest=True keeps age 30 in group 0. Passing an integer bin count instead would
# derive the edges from the observed min/max, so the groups would shift with the data.
df['age_group'] = pd.cut(df['age'], bins=age_edges, labels=age_labels, include_lowest=True, right=True)

# Ages outside the declared edges would silently become NaN; fail loudly instead.
assert df['age_group'].notna().all(), "age outside the 30-65 range covered by age_edges"
df.head()


Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_group
0,50,2,168,62.0,110,80,1,1,0,0,1,0,3
1,55,1,156,85.0,140,90,3,1,0,0,1,1,4
2,52,1,165,64.0,130,70,3,1,0,0,0,1,4
3,48,2,169,82.0,150,100,1,1,0,0,1,1,3
4,48,1,156,56.0,100,60,1,1,0,0,0,0,3


**Attribute Combination**

It is useful to combine some attributes into more meaningful ones. For example, Body Mass Index (BMI) is more informative than weight and height taken individually. Therefore, we have added Body Mass Index ($\mathrm{BMI} = \text{weight} / (\text{height}/100)^2$) and Mean Arterial Pressure ($\mathrm{MAP} = (2 \cdot \text{ap\_lo} + \text{ap\_hi}) / 3$) to the data.

In [39]:
# Body Mass Index: weight divided by the square of (height / 100).
height_scaled = df['height'] / 100
df['bmi'] = df['weight'] / (height_scaled ** 2)

# Integer range of the continuous BMI, printed for reference before binning.
bmiMin = int(df['bmi'].min())
bmiMax = int(df['bmi'].max())
print(bmiMin, bmiMax)

# Replace the continuous BMI with 6 equal-width bins labelled 0-5.
df['bmi'] = pd.cut(df['bmi'], bins=6, labels=range(6), right=True, include_lowest=True)

# Share of records falling in each BMI bin.
df["bmi"].value_counts(normalize=True)

16 46


bmi
1    0.461325
2    0.330202
3    0.133068
0    0.038193
4    0.033554
5    0.003658
Name: proportion, dtype: float64

In [40]:
# Mean arterial pressure: (ap_hi + 2 * ap_lo) / 3.
map_values = (df['ap_hi'] + 2 * df['ap_lo']) / 3
df['map'] = map_values

# Integer range of the continuous MAP, printed for reference before binning.
mapMin = int(df['map'].min())
mapMax = int(df['map'].max())
print(mapMin, mapMax)

# Replace the continuous MAP with 6 equal-width bins labelled 0-5.
df['map'] = pd.cut(df['map'], bins=6, labels=range(6), right=True, include_lowest=True)

df.head()

73 121


Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_group,bmi,map
0,50,2,168,62.0,110,80,1,1,0,0,1,0,3,1,2
1,55,1,156,85.0,140,90,3,1,0,0,1,1,4,3,4
2,52,1,165,64.0,130,70,3,1,0,0,0,1,4,1,2
3,48,2,169,82.0,150,100,1,1,0,0,1,1,3,2,5
4,48,1,156,56.0,100,60,1,1,0,0,0,0,3,1,0


Print Null rows

In [41]:
null_rows = df[df.isnull().any(axis=1)]
print("Rows with null values:")
print(null_rows)

Rows with null values:
Empty DataFrame
Columns: [age, gender, height, weight, ap_hi, ap_lo, cholesterol, gluc, smoke, alco, active, cardio, age_group, bmi, map]
Index: []


**Drop features**
We only need categorical data

In [42]:
df_og=df

df=df.drop(['height','weight','ap_hi','ap_lo','age'],axis=1)

df.head()

Unnamed: 0,gender,cholesterol,gluc,smoke,alco,active,cardio,age_group,bmi,map
0,2,1,1,0,0,1,0,3,1,2
1,1,3,1,0,0,1,1,4,3,4
2,1,3,1,0,0,0,1,4,1,2
3,2,1,1,0,0,1,1,3,2,5
4,1,1,1,0,0,0,0,3,1,0


**Label Encoder**

In [43]:
# Re-encode every column as consecutive integer codes starting at 0.
# `apply` calls fit_transform on each column in turn with the same encoder object, so
# after this line `le` only holds the classes of the last column it was fitted on.
le = preprocessing.LabelEncoder()
df = df.apply(le.fit_transform)
df.describe()

Unnamed: 0,gender,cholesterol,gluc,smoke,alco,active,cardio,age_group,bmi,map
count,60142.0,60142.0,60142.0,60142.0,60142.0,60142.0,60142.0,60142.0,60142.0,60142.0
mean,0.347311,0.350953,0.220229,0.085631,0.051877,0.803648,0.488228,4.042233,1.67344,2.359449
std,0.47612,0.670076,0.567607,0.27982,0.221781,0.397241,0.499866,1.37707,0.898707,1.186906
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,1.0,2.0
50%,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,2.0,2.0
75%,1.0,0.0,0.0,0.0,0.0,1.0,1.0,5.0,2.0,3.0
max,1.0,2.0,2.0,1.0,1.0,1.0,1.0,6.0,5.0,5.0


# **Clustering**

Clustering is used in machine learning to find similarities in data by grouping similar records together. The most common clustering technique is K-means. However, it is not effective for categorical data, because K-means uses Euclidean distance to measure the differences between data points. For our data, we have used **K-modes**, which is better suited to categorical data: it measures dissimilarity by counting mismatching categories and represents each cluster by its modes. To find the optimal number of clusters, we have utilized the **elbow curve method**.

In [None]:
# Fit K-modes for every candidate cluster count and record each fit's cost for the elbow plot.
num_clusters = range(1, 6)  # 1 to 5
cost = []
for k in num_clusters:
    kmode = KModes(n_clusters=k, init="Huang", n_init=5, verbose=0, random_state=1)
    kmode.fit_predict(df)
    cost.append(kmode.cost_)


**Clusters graph**

In [None]:
plt.plot(num_clusters, cost, 'bo-')
plt.xlabel('num_clusters')
plt.ylabel('Cost')
plt.title('Elbow Method For Optimal Number of Clusters')
plt.show()

Building KModes model

In [None]:
km = KModes(n_clusters=2, init = "Huang", n_init = 5,random_state=1)
clusters = km.fit_predict(df)
clusters

Adding clusters column in DF

In [None]:
# Put the cluster label in the first column. Remove any "clusters" column left by an
# earlier run of this cell first, so re-running never stacks duplicate columns.
if "clusters" in df.columns:
    df = df.drop(columns="clusters")
df.insert(0, "clusters", clusters)

df.head()

# **Correlation Matrix**

In [None]:
# Set up figure
plt.figure(figsize=(10, 8))

# Draw correlation matrix
sns.heatmap(df.corr(), annot=True, cmap='Spectral', fmt=".2f", linewidths=.5)

# Show the figure
plt.title('Correlation Matrix')
plt.show()

'gender' has a correlation of 0 with our target 'cardio', and 'alco' has a correlation of 0.01.
We will remove those two features to increase performance.

Distribution of cardio in clusters

In [None]:
sns.countplot(x='clusters', hue='cardio', data=df)
plt.title('Distribution of Cardiovascular Disease within Clusters')
plt.show()




# **Split Data**

Training set: 80%

Test set: 20%

According to the correlation table, gender has 0 correlation with our target. Moreover, ‘alco’ has 0.01 correlation. Therefore, we dropped those two features and saw an increase in the performance of our models.

In [None]:
x = df.drop(['cardio','gender','alco'], axis=1)
y = df['cardio']

x.head()

In [None]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.20,random_state=1)

In [None]:
x_train.info()

# **MLP**

Without CV

In [None]:
# build MLP model
mlpModel = MLPClassifier(random_state=1)

# Fit the model
mlpModel.fit(x_train, y_train)

# Make predictions
mlp_pred = mlpModel.predict(x_test)

# accuracy
mlp_accuracy = metrics.accuracy_score(y_test, mlp_pred)*100
print(f"Accuracy without CV: {mlp_accuracy:.2f}")

In [None]:
# mlp_params = {
#     'hidden_layer_sizes': [(100,), (50, 50), (100, 50, 25)],
#     'activation': ['relu', 'tanh'],
#     'solver': ['adam'],
#     'max_iter': [100, 200, 300],
#     'alpha': [0.0001, 0.001, 0.01],
# }

# Best parameters for MLP
mlp_best_params = {
    'activation': ['tanh'],
    'alpha': [0.01],
    'hidden_layer_sizes': [(50, 50)],
    'max_iter': [300],
    'solver': ['adam'],
}

# Create grid search
mlp_gridsearch = GridSearchCV(estimator=mlpModel, param_grid=mlp_best_params, cv=5, scoring='accuracy', n_jobs=-1)

# Fit grid search
mlp_gridsearch.fit(x_train, y_train)

Get Best parameters and best estimator for MLP from GridSearchCV

In [None]:
# Read the winning configuration from the fitted search itself: mlp_best_params is the
# grid that was passed in (lists of candidate values), not the search result.
mlp_best_estimator = mlp_gridsearch.best_estimator_

print(f"Best Parameters : {mlp_gridsearch.best_params_}")
print(f"Best Estimator  : {mlp_best_estimator}")

Make prediction using best estimator

In [None]:
mlp_pred_CV = mlp_best_estimator.predict(x_test)

Accuracy

In [None]:
mlp_accuracy_cv = metrics.accuracy_score(y_test, mlp_pred_CV)*100
print(f"Best Accuracy: {mlp_accuracy_cv:.2f}")

**MLP** Accuracy Scores

In [None]:
print(f"MLP accuracy without CV : {mlp_accuracy:.2f}")
print(f"MLP accuracy with CV    : {mlp_accuracy_cv:.2f}")

**Classification Report**

In [None]:
classification_report_str = classification_report(y_test, mlp_pred_CV, digits=4)

print("Classification Report for MLP with CV:\n", classification_report_str)

**Accuracy of the tuned MLP model**

In [None]:

print(f"MLP accuracy with CV   : {mlp_accuracy_cv:.2f}")

**Visualization**

In [None]:
class_names = [0, 1]  # name of classes

# Fix the label order so the matrix is always 2x2, even if a class is absent from a split.
cnf_matrix = metrics.confusion_matrix(y_test, mlp_pred_CV, labels=class_names)

fig, ax = plt.subplots()
# Draw on the explicit axes and hand the class names to the heatmap as tick labels;
# ticks set on the axes before the heatmap call would be replaced by seaborn.
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.xaxis.set_label_position("top")
ax.set_title('Confusion matrix: MLP', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
# Lay out after the title and labels exist so they are included in the computation.
fig.tight_layout()
plt.show()

In [None]:
x_train.head()

In [None]:
y_train.head()

In [None]:
x_test.head()

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical

# Convert categorical features to one-hot encoding
x_train_encoded = pd.get_dummies(x_train)

# One-hot encode the target: column k is 1 where the label equals k
y_train_categorical = to_categorical(y_train)

# Hold out 20% of the training data for validation.
# NOTE: this rebinds x_train / y_train to the smaller split (and y_train to its one-hot
# form), so re-running this cell without first re-running the train/test split would
# split and encode already-processed data.
x_train, x_val, y_train, y_val = train_test_split(x_train_encoded, y_train_categorical, test_size=0.2, random_state=42)

# Build the Sequential model
model = Sequential()
model.add(Dense(64, input_shape=(x_train.shape[1],), activation='relu'))
model.add(Dense(32, activation='relu'))
# The targets are one-hot over two mutually exclusive classes, so the two outputs must
# form a probability distribution: softmax, paired with categorical cross-entropy.
# (Two independent sigmoid units with binary cross-entropy would score each output
# separately and report element-wise rather than per-sample accuracy.)
model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_val, y_val))

# Evaluate the model on the validation set
val_loss, val_acc = model.evaluate(x_val, y_val)
print("Validation Loss:", val_loss)
print("Validation Accuracy:", val_acc)


In [None]:
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])

plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')

plt.legend(['training data', 'validation data'], loc = 'lower right')

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])

plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')

plt.legend(['training data', 'validation data'], loc = 'upper right')

In [None]:
# Evaluate the model on the validation set
val_loss, val_accuracy = model.evaluate(x_val, y_val)

# Print the validation accuracy
print("Validation Accuracy:", val_accuracy)


In [None]:
!pip install eli5
!pip install pdpbox
!pip install shap

In [None]:
import numpy as np

x_train_numpy = x_train.to_numpy()
x_val_numpy = x_val.to_numpy()


In [None]:
import shap
# Reset the index of x_val to ensure continuous integer indexing
x_val.reset_index(drop=True, inplace=True)

# Convert x_val to a numpy array
x_val_numpy = x_val.to_numpy()
# Explain model predictions using SHAP with a specified number of background samples
explainer = shap.DeepExplainer(model, x_train_numpy[:100])  # Use the first 100 samples as background
shap_values = explainer.shap_values(x_val_numpy)






In [None]:
print(shap_values)

In [None]:
import shap
import matplotlib.pyplot as plt
import pandas as pd

# Take the feature names from the validation frame itself so the plot labels cannot
# drift from the columns the model was trained on.
feature_names = x_val.columns.tolist()

# Convert x_val_numpy to a DataFrame with the appropriate feature names
x_val_df = pd.DataFrame(x_val_numpy, columns=feature_names)

# Plot the SHAP summary plot
shap.summary_plot(shap_values, features=x_val_df)

# Show the plot
plt.show()


In [None]:
#bar plot
shap.summary_plot(shap_values, x_val_numpy, plot_type="bar")


In [None]:
shap.initjs()

# Row of the validation set whose prediction is explained by the force plot.
sample_index = 0
shap.force_plot(explainer.expected_value[1], shap_values[1][sample_index], x_val.iloc[sample_index,:])


In [None]:
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.base import BaseEstimator

# Define a wrapper class for Keras model
class KerasEstimator(BaseEstimator):
    """Minimal scikit-learn style wrapper around an already-trained Keras model.

    It exposes `fit`, `predict` and `score`, so tools that rely on the estimator's
    own `score` method (such as permutation importance) get a meaningful metric.
    """

    def __init__(self, model):
        # Stored under the same name as the parameter, as BaseEstimator expects.
        self.model = model

    def fit(self, X, y):
        """Do nothing (the wrapped model is already trained) and return self."""
        return self

    def predict(self, X):
        """Return the wrapped model's raw output for X (unchanged from model.predict)."""
        return self.model.predict(X)

    def score(self, X, y):
        """Return classification accuracy of the wrapped model on (X, y).

        Handles both output layouts: one column per class (arg-max picks the class) or
        a single probability column (thresholded at 0.5). `y` may be class labels or
        one-hot rows. A constant score here would make every permutation importance 0.
        """
        outputs = np.asarray(self.predict(X))
        if outputs.ndim > 1 and outputs.shape[1] > 1:
            predicted = outputs.argmax(axis=1)
        else:
            predicted = (outputs.ravel() > 0.5).astype(int)

        expected = np.asarray(y)
        if expected.ndim > 1 and expected.shape[1] > 1:
            expected = expected.argmax(axis=1)
        else:
            expected = expected.ravel()

        return float(np.mean(predicted == expected))

# Define a custom function to calculate permutation importance for Keras model
def permutation_importance_keras(model, x, y, feature_names):
    """Compute eli5 permutation importance for `model` on (x, y).

    Fits a PermutationImportance (random_state=123) around the given estimator and
    returns eli5's weights display for the top 24 features, labelled with
    `feature_names`.
    """
    importance = PermutationImportance(model, random_state=123)
    fitted_importance = importance.fit(x, y)
    return eli5.show_weights(fitted_importance, feature_names=feature_names, top=24)

# Create a wrapper instance for the Keras model
keras_estimator = KerasEstimator(model)

# Generate permutation importance for the Keras model
perm_importance = permutation_importance_keras(keras_estimator, x_test, y_test, feature_names=x.columns.tolist())

# show_weights returns a display object: leave it as the cell's last expression so the
# notebook renders the table (print() would only show the object's repr).
perm_importance


In [None]:
from IPython.display import display

# Display permutation importance
display(perm_importance)


In [None]:
# Column names of the test features, in order; passed to pdpbox as model_features.
features = [c for c in x_test.columns]

from pdpbox import pdp, info_plots

# Partial dependence of the model's prediction on the binned 'map' feature.
# NOTE(review): rfModel is not defined in any cell visible in this notebook - confirm it
# is created (presumably a fitted random-forest classifier) before this cell runs.
pdp_resting_bp = pdp.pdp_isolate(model=rfModel, dataset=x_test, model_features=features, feature='map')

# plot it
pdp.pdp_plot(pdp_resting_bp, 'map')

plt.show()

In [None]:
def plot_pdp(model, df, feature, cluster_flag=False, nb_clusters=None, lines_flag=False):
    """Draw the partial dependence plot of `feature` for `model` over `df`.

    All columns of `df` are passed to pdpbox as the model's features. `cluster_flag`,
    `nb_clusters` and `lines_flag` are forwarded to pdp_plot as `cluster`,
    `n_cluster_centers` and `plot_lines` respectively. The figure is shown immediately.
    """
    feature_columns = df.columns.tolist()

    # Compute the isolated partial dependence for the requested feature
    isolated = pdp.pdp_isolate(
        model=model,
        dataset=df,
        model_features=feature_columns,
        feature=feature,
    )

    # Render it
    pdp.pdp_plot(
        isolated,
        feature,
        cluster=cluster_flag,
        n_cluster_centers=nb_clusters,
        plot_lines=lines_flag,
    )
    plt.show()

plot_pdp(rf_gridsearch, x_train, 'cholesterol', cluster_flag=True, nb_clusters=24, lines_flag=True)


In [None]:
# Interaction between the two binned features. The raw 'age' column was dropped from the
# feature frame; the column present in x_test is 'age_group', so that is the name to request.
inter1 = pdp.pdp_interact(model=rf_gridsearch, dataset=x_test, model_features=features, features=['age_group', 'map'])

pdp.pdp_interact_plot(pdp_interact_out=inter1, feature_names=['age_group', 'map'], plot_type='contour')
plt.show()

In [None]:
x_train

In [None]:
import lime
import lime.lime_tabular
from sklearn.model_selection import train_test_split
feature_names = ['clusters', 'cholesterol', 'gluc', 'smoke', 'active', 'age_group', 'bmi', 'map']
# Create a LIME explainer object
# NOTE: this rebinds `explainer`, replacing the SHAP explainer created in an earlier cell.
explainer = lime.lime_tabular.LimeTabularExplainer(x_train_numpy, mode="classification", feature_names=feature_names)

# Choose a specific instance from the validation set
instance_index = 0
# explain_instance expects a 1-D numpy array, so convert the selected row from a Series.
instance = x_val.iloc[instance_index].to_numpy()

# Generate explanations for the instance
explanation = explainer.explain_instance(instance, rfModel.predict_proba, num_features=8)

# Visualize the explanations
explanation.show_in_notebook()


In [None]:
explainer = shap.TreeExplainer(rfModel)
shap_values = explainer.shap_values(x_test)

shap.summary_plot(shap_values[1], x_test, plot_type="bar")

In [None]:
shap.summary_plot(shap_values[1], x_test)