In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [4]:
# Load the heart-disease CSV into a DataFrame; the path is relative to the
# notebook's working directory.
dataset = pd.read_csv("heart_disease_uci.csv")

In [5]:
# Summarise column dtypes and non-null counts to see which columns have
# missing values before any cleaning is applied.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


Handling the trestbps outlier (zero readings are treated as missing)


In [6]:
# Show the rows whose trestbps reading is exactly zero.
dataset.loc[dataset['trestbps'].eq(0)]

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
753,754,55,Male,VA Long Beach,non-anginal,0.0,0.0,False,normal,155.0,False,1.5,flat,,,3


In [7]:
# Blank out zero trestbps readings so they are handled as missing values
# (and imputed later) instead of being treated as real measurements.
dataset['trestbps'] = dataset['trestbps'].mask(dataset['trestbps'].eq(0))

In [8]:
# Smallest remaining trestbps value; missing entries are skipped by default.
dataset['trestbps'].min()

80.0

Handling oldpeak outliers (values are clipped to the range 0–6)

In [9]:
# Count the rows with a negative oldpeak (missing values compare as False).
int((dataset['oldpeak'] < 0).sum())

12

In [10]:
# Cap oldpeak to the closed interval [0, 6]: negatives become 0 and anything
# above 6 becomes 6. Missing values are left as they are.
dataset['oldpeak'] = dataset['oldpeak'].clip(0, 6)

In [11]:
# Re-count negative oldpeak rows to confirm the clip removed all of them.
int(dataset['oldpeak'].lt(0).sum())

0

Splitting the dataset

In [12]:
# Separate predictors from the target by column name rather than by position,
# so the split does not silently pick the wrong columns if the CSV's column
# order ever changes. On the current schema this selects exactly the same
# columns as slicing off the first ('id') and last ('num') column.
X = dataset.drop(columns=['id', 'num'])
# Target as a NumPy array.
y = dataset['num'].to_numpy()

In [13]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Preprocessing: imputation, scaling, and one-hot encoding

In [14]:
# List the numeric columns of the raw frame as a guide for choosing which
# features to scale.
list(dataset.select_dtypes(include='number').columns)

['id', 'age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca', 'num']

In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Column groups routed to the two preprocessing pipelines below.
num_features = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak']
# 'ca' is stored as a float in the raw data but is one-hot encoded here as a
# discrete level alongside the string/boolean columns.
cat_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# Numeric columns: fill gaps with the column mean learned at fit time, then
# standardise.
num_pipeline = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy = 'mean')),
        ('scaler', StandardScaler())
    ]
)

# Categorical columns: fill gaps with the most frequent level learned at fit
# time, then one-hot encode with the first level of each column dropped.
# handle_unknown='ignore' stops transform() from raising when the held-out
# split contains a level that never appeared in the training split; such a
# level is encoded as all zeros, i.e. the same as the dropped reference level
# (scikit-learn emits a warning when this happens). Combining drop with
# handle_unknown='ignore' needs scikit-learn >= 1.0.
cat_pipeline = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy = 'most_frequent')),
        ('encoder', OneHotEncoder(drop = 'first', handle_unknown = 'ignore'))
    ]
)

# Apply each pipeline to its own column group. Columns listed in neither group
# are dropped, because ColumnTransformer's default is remainder='drop'.
preprocessor = ColumnTransformer(
    transformers = [
        ('num', num_pipeline, num_features),
        ('cat', cat_pipeline, cat_features)
    ]
)


In [16]:
# Learn the imputation values, scaling statistics and category levels from the
# training split only, then apply the fitted transformers to both splits.
# NOTE(review): X_train / X_test are overwritten with the transformed arrays,
# so re-running this cell requires re-running the split cell first.
preprocessor.fit(X_train)
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)

In [17]:
# Preview the first five rows of the transformed training matrix.
print(X_train[:5])

[[ 9.02412547e-01  1.57774920e-15 -2.54935123e-01 -6.88473319e-01
   2.00217983e+00  1.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  1.00000000e+00
   1.00000000e+00  1.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00]
 [ 3.47761857e-02  1.00702194e+00  0.00000000e+00 -6.09262363e-01
  -8.55320196e-01  1.00000000e+00  0.00000000e+00  1.00000000e+00
   0.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
   0.00000000e+00  1.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00]
 [-2.90587450e-01  1.57774920e-15  1.29687589e+00  0.00000000e+00
  -1.05748744e-16  1.00000000e+00  0.00000000e+00  1.00000000e+00
   0.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
   0.00000000e+00  1.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00]
 [-3.99

In [18]:
# Dimensions of the transformed training matrix: (rows, encoded feature columns).
X_train.shape

(736, 20)

In [35]:
# Rebuild the output column names in the order the ColumnTransformer emits
# them: the numeric features first, then the one-hot encoded level names.
encoded_cat_names = preprocessor.named_transformers_['cat'].get_feature_names_out(cat_features)
feature_names = [*num_features, *encoded_cat_names]

In [36]:
# Wrap the transformed training matrix in a labelled DataFrame so later steps
# can refer to features by name.
X_df = pd.DataFrame(data=X_train, columns=feature_names)

In [37]:
# Check the dtype and non-null count of every column after preprocessing.
X_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 736 entries, 0 to 735
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       736 non-null    float64
 1   trestbps                  736 non-null    float64
 2   chol                      736 non-null    float64
 3   thalch                    736 non-null    float64
 4   oldpeak                   736 non-null    float64
 5   sex_Male                  736 non-null    float64
 6   cp_atypical angina        736 non-null    float64
 7   cp_non-anginal            736 non-null    float64
 8   cp_typical angina         736 non-null    float64
 9   fbs_True                  736 non-null    float64
 10  restecg_normal            736 non-null    float64
 11  restecg_st-t abnormality  736 non-null    float64
 12  exang_True                736 non-null    float64
 13  slope_flat                736 non-null    float64
 14  slope_upsl

Multicollinearity Check

In [25]:
!pip install statsmodels



In [28]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

In [38]:
# Add the 'const' intercept column that the VIF computation below is run with.
X_with_const  = add_constant(X_df)

In [40]:
# Compute one variance inflation factor per column of the design matrix
# (intercept included) and collect them in a two-column table.
design_matrix = X_with_const.values
vif_data = pd.DataFrame(
    {
        'Features': X_with_const.columns,
        'VIF': [
            variance_inflation_factor(design_matrix, col_idx)
            for col_idx in range(design_matrix.shape[1])
        ],
    }
)

In [41]:
# Show the VIF table. The 'const' row belongs to the added intercept column
# rather than to a feature.
print(vif_data)

                    Features        VIF
0                      const  47.175159
1                        age   1.484098
2                   trestbps   1.141740
3                       chol   1.246632
4                     thalch   1.671888
5                    oldpeak   1.461274
6                   sex_Male   1.144915
7         cp_atypical angina   1.464758
8             cp_non-anginal   1.286866
9          cp_typical angina   1.126897
10                  fbs_True   1.125277
11            restecg_normal   1.846149
12  restecg_st-t abnormality   1.860680
13                exang_True   1.478657
14                slope_flat   3.594590
15           slope_upsloping   3.746399
16                    ca_1.0   1.184070
17                    ca_2.0   1.142164
18                    ca_3.0   1.143961
19               thal_normal   4.144987
20    thal_reversable defect   4.102337
