In [21]:
# Import all the tools we need

# Regular EDA (exploratory data analysis) and plotting libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# import shap



# we want our plots to appear inside the notebook
%matplotlib inline

# Models from Scikit-Learn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Model Evaluations
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
# from sklearn.metrics import plot_roc_curve



In [22]:
df = pd.read_csv("heart_cleveland_upload.csv")
df.shape # (rows, columns)

(297, 14)

In [23]:
# Are there any missing values?
df.isna().sum()

age          0
sex          0
cp           0
trestbps     0
chol         0
fbs          0
restecg      0
thalach      0
exang        0
oldpeak      0
slope        0
ca           0
thal         0
condition    0
dtype: int64

In [24]:
# Split data into X and y
X = df.drop("condition", axis=1)

y = df["condition"]

In [25]:
# Split data into train and test sets
np.random.seed(42)

# Split into train & test set
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2)

In [26]:
# Put models in a dictionary
models = {"Logistic Regression": LogisticRegression()}

# Create a function to fit and score models
def fit_and_score(models, X_train, X_test, y_train, y_test):
    """
    Fits and evaluates given machine learning models.
    models: a dict of different Scikit-Learn machine learning models
    X_train: training data (no labels)
    X_test: testing data (no labels)
    y_train: training labels
    y_test: test labels
    """
    # set random seed
    # np.random.seed(42)
    # Make a dictionary to keep model scores
    model_scores = {}
    # Loop through models
    for name, model in models.items():
        # Fit the model to the data
        model.fit(X_train, y_train)
        # Evaluate the model and append its score to model_scores
        model_scores[name] = model.score(X_test, y_test)
    return model_scores

In [27]:
model_scores = fit_and_score(models=models,
                            X_train=X_train,
                            X_test=X_test,
                            y_train=y_train,
                            y_test=y_test)

model_scores


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



{'Logistic Regression': 0.75}

In [28]:
# Create a hyperparameter grid for LogisticRegression
log_reg_grid = {"C": np.logspace(-4, 4, 20),
                "solver": ["liblinear"]}

In [29]:
# Tune LogisticRegression

np.random.seed(42)

# Setup random hypterparameter search for LogisticRegression
rs_log_reg = RandomizedSearchCV(LogisticRegression(),
                                param_distributions=log_reg_grid,
                                cv=5,
                                n_iter=20,
                                verbose=True)

# Fit random hyperparamter search model for LogisticRegression
rs_log_reg.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [30]:
# Make predictions with tuned model
y_preds = rs_log_reg.predict(X_test)
y_preds

array([1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0], dtype=int64)

In [31]:
# Create a new classifier with best parameters
clf = LogisticRegression(C=0.08858667904100823,
                         solver="liblinear")

In [32]:
# Cross-validated accuracy
cv_acc = cross_val_score(clf,
                         X,
                         y,
                         cv=5,
                         scoring="accuracy")

cv_acc

array([0.7       , 0.88333333, 0.83050847, 0.94915254, 0.84745763])

In [33]:
import dalex as dx

In [34]:
dir(dx)

['Arena',
 'Aspect',
 'Explainer',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 'datasets',
 'fairness']

In [35]:

# Explainer Instance
# Model
exp = dx.Explainer(rs_log_reg,X_train,y_train)

Preparation of a new explainer is initiated

  -> data              : 237 rows 13 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 237 values
  -> model_class       : sklearn.model_selection._search.RandomizedSearchCV (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_proba_default at 0x00000209F94EA320> will be used (default)
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.
  -> predicted values  : min = 0.0178, mean = 0.46, max = 0.996
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.925, mean = -0.000569, max = 0.898
  -> model_info        : package sklearn

A new explainer has been created!



X does not have valid feature names, but LogisticRegression was fitted with feature names



In [36]:

# Model Performance
exp.model_performance()

Unnamed: 0,recall,precision,f1,accuracy,auc
RandomizedSearchCV,0.825688,0.927835,0.873786,0.890295,0.933271


In [37]:
# Model Performance
exp.model_performance().result

Unnamed: 0,recall,precision,f1,accuracy,auc
RandomizedSearchCV,0.825688,0.927835,0.873786,0.890295,0.933271


In [38]:

# Variable/Feature Importance : Features contributions to the model
exp.model_parts()

Unnamed: 0,variable,dropout_loss,label
0,slope,0.065847,RandomizedSearchCV
1,_full_model_,0.066729,RandomizedSearchCV
2,fbs,0.0668,RandomizedSearchCV
3,age,0.068083,RandomizedSearchCV
4,chol,0.068112,RandomizedSearchCV
5,cp,0.068263,RandomizedSearchCV
6,exang,0.068263,RandomizedSearchCV
7,restecg,0.07075,RandomizedSearchCV
8,trestbps,0.075043,RandomizedSearchCV
9,oldpeak,0.076899,RandomizedSearchCV


In [39]:

# Model Variable Importance via Plot
exp.model_parts().plot()

In [40]:

# Model Profile
exp.model_profile()


Calculating ceteris paribus: 100%|██████████| 13/13 [00:00<00:00, 92.07it/s]


Unnamed: 0,_vname_,_label_,_x_,_yhat_,_ids_
0,age,RandomizedSearchCV,29.00,0.514161,0
1,age,RandomizedSearchCV,29.48,0.513145,0
2,age,RandomizedSearchCV,29.96,0.512130,0
3,age,RandomizedSearchCV,30.44,0.511116,0
4,age,RandomizedSearchCV,30.92,0.510102,0
...,...,...,...,...,...
1308,thal,RandomizedSearchCV,1.92,0.580641,0
1309,thal,RandomizedSearchCV,1.94,0.582824,0
1310,thal,RandomizedSearchCV,1.96,0.585006,0
1311,thal,RandomizedSearchCV,1.98,0.587188,0


In [41]:

# Plot Model Profile
exp.model_profile().plot()

Calculating ceteris paribus: 100%|██████████| 13/13 [00:00<00:00, 92.38it/s]


In [44]:

### Make A Prediction
ex1 = X_test.iloc[7]
ex1

age          70.0
sex           1.0
cp            3.0
trestbps    130.0
chol        322.0
fbs           0.0
restecg       2.0
thalach     109.0
exang         0.0
oldpeak       2.4
slope         1.0
ca            3.0
thal          0.0
Name: 158, dtype: float64

In [45]:

# Expected Prediction
y_test.iloc[7]

1

In [46]:

# Model Prediction
print("LR:",rs_log_reg.predict([ex1]))
print("DT:",rs_log_reg.predict([ex1]))

LR: [1]
DT: [1]



X does not have valid feature names, but LogisticRegression was fitted with feature names


X does not have valid feature names, but LogisticRegression was fitted with feature names



In [49]:

sample = pd.DataFrame(ex1).T
sample
     

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
158,70.0,1.0,3.0,130.0,322.0,0.0,2.0,109.0,0.0,2.4,1.0,3.0,0.0


In [50]:
# Prediction
exp.predict(sample)

array([0.97425284])

In [52]:

# Predict Parts
explanation = exp.predict_parts(sample)
explanation.result
     

Unnamed: 0,variable_name,variable_value,variable,cumulative,contribution,sign,position,label
0,intercept,,intercept,0.460484,0.460484,1.0,14,RandomizedSearchCV
1,ca,3.0,ca = 3.0,0.71394,0.253456,1.0,13,RandomizedSearchCV
2,thalach,109.0,thalach = 109.0,0.91443,0.20049,1.0,12,RandomizedSearchCV
3,oldpeak,2.4,oldpeak = 2.4,0.955152,0.040722,1.0,11,RandomizedSearchCV
4,thal,0.0,thal = 0.0,0.9424,-0.012752,-1.0,10,RandomizedSearchCV
5,chol,322.0,chol = 322.0,0.956906,0.014506,1.0,9,RandomizedSearchCV
6,cp,3.0,cp = 3.0,0.967057,0.010151,1.0,8,RandomizedSearchCV
7,restecg,2.0,restecg = 2.0,0.974466,0.00741,1.0,7,RandomizedSearchCV
8,age,70.0,age = 70.0,0.967377,-0.007089,-1.0,6,RandomizedSearchCV
9,sex,1.0,sex = 1.0,0.973539,0.006162,1.0,5,RandomizedSearchCV


In [53]:

# Plot Explanation of Prediction
explanation.plot()
     

In [54]:

# Prediction Profile
pred_profile = exp.predict_profile(sample)
     

Calculating ceteris paribus: 100%|██████████| 13/13 [00:00<00:00, 290.40it/s]


In [55]:

pred_profile.plot()