https://www.kaggle.com/ronitf/heart-disease-uci?select=heart.csv

In [126]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce
import statistics

from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score
from sklearn import preprocessing
from sklearn.linear_model import LassoCV

# Heart Dataset

In [36]:
heart_df = pd.read_csv("data/heart.csv")
heart_target = 'target'
heart_df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


# Analysing the Data

In [37]:
# No missing values

heart_df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [38]:
features = ['trestbps','chol','thalach','oldpeak','sex','cp','fbs','restecg','exang','slope','ca','thal','age','target']

for feat in features:
    print("___________________________________________________________")
    print(heart_df[feat].value_counts())

___________________________________________________________
120    37
130    36
140    32
110    19
150    17
138    13
128    12
125    11
160    11
112     9
132     8
118     7
135     6
108     6
124     6
145     5
134     5
152     5
122     4
170     4
100     4
142     3
115     3
136     3
105     3
180     3
126     3
102     2
94      2
144     2
178     2
146     2
148     2
129     1
165     1
101     1
174     1
104     1
172     1
106     1
156     1
164     1
192     1
114     1
155     1
117     1
154     1
123     1
200     1
Name: trestbps, dtype: int64
___________________________________________________________
234    6
204    6
197    6
269    5
212    5
      ..
278    1
281    1
284    1
290    1
564    1
Name: chol, Length: 152, dtype: int64
___________________________________________________________
162    11
160     9
163     9
173     8
152     8
       ..
129     1
128     1
127     1
124     1
71      1
Name: thalach, Length: 91, dtype: int64
______________

# Cleaning the Data

Not much clearning needed to be done. The dataset seems to be well cleaned.

In [21]:
clean_df = heart_df.copy()

In [22]:
clean_df[heart_target].value_counts()

1    165
0    138
Name: target, dtype: int64

# Feature Selection

## Filter Based Method

In [163]:
# Continuous features
heart_continuous_features = ['testbps','chol','thalach','oldpeak', 'age']

# Categorical features (not including response)
heart_categorical_features = ['sex', 'cp','fbs','restecg','exang','slope','ca','thal']

heart_target = 'target'

print(len(heart_continuous_features)+len(heart_categorical_features)+1)

14


In [164]:
featureselection_df = clean_df.copy()

In [165]:
featureselection_df = clean_df.copy()

cor = featureselection_df.corr()

#Correlation with output variable
cor_target = abs(cor[heart_target])

#Selecting highly correlated features
relevant_features = cor_target[cor_target>0.1]
relevant_features

age         0.225439
sex         0.280937
cp          0.433798
trestbps    0.144931
restecg     0.137230
thalach     0.421741
exang       0.436757
oldpeak     0.430696
slope       0.345877
ca          0.391724
thal        0.344029
target      1.000000
Name: target, dtype: float64

In [166]:
X = featureselection_df.drop(columns=[heart_target])
y = featureselection_df[heart_target].values

for i in range (14):
    print("_______________________________________________________________________________________Best "+str(i)+":")
    # Create and fit selector
    selector = SelectKBest(chi2, k=i)
    selector.fit(X, y)
    # Get columns to keep and create new dataframe with those only
    cols = selector.get_support(indices=True)
    features_df_new = X.iloc[:,cols]
    print(features_df_new.columns)

_______________________________________________________________________________________Best 0:
Index([], dtype='object')
_______________________________________________________________________________________Best 1:
Index(['thalach'], dtype='object')
_______________________________________________________________________________________Best 2:
Index(['thalach', 'oldpeak'], dtype='object')
_______________________________________________________________________________________Best 3:
Index(['thalach', 'oldpeak', 'ca'], dtype='object')
_______________________________________________________________________________________Best 4:
Index(['cp', 'thalach', 'oldpeak', 'ca'], dtype='object')
_______________________________________________________________________________________Best 5:
Index(['cp', 'thalach', 'exang', 'oldpeak', 'ca'], dtype='object')
_______________________________________________________________________________________Best 6:
Index(['cp', 'chol', 'thalach', 'exang', 'oldpeak',

In [167]:
best_features = ['thalach','oldpeak','ca','cp','exang','chol','age','trestbps','slope','sex','thal','restecg','fbs']

best_categorical_features = np.intersect1d(best_features, heart_categorical_features)

In [62]:
featureselection_df[best_categorical_features] = ordEnc.fit(featureselection_df[best_categorical_features]).transform(featureselection_df[best_categorical_features])
featureselection_df[heart_categorical_features] = featureselection_df[heart_categorical_features].astype('category') 

X = featureselection_df[best_features]
y = featureselection_df[heart_target].values

temp = []

for feature in best_features:
    temp.append(feature)
    print("________________________________________________")
    print(temp)
    all_ac = []
    for i in range(1000):
        X_train, X_test, y_train, y_test = train_test_split(X[temp], y, test_size=0.3, stratify=y)
        model = DecisionTreeClassifier(criterion='entropy', max_depth= 9, min_samples_leaf= 4, min_samples_split = 8)
        model.fit(X_train, y_train)
        y_preds_rf = model.predict(X_test)
        ac = accuracy_score(y_test, y_preds_rf)
        all_ac.append(ac)
    print('Overall Accuracy Over 1000 Repetitions: %.9f'%statistics.mean(all_ac))

________________________________________________
['thalach']
Overall Accuracy Over 1000 Repetitions: 0.621109890
________________________________________________
['thalach', 'oldpeak']
Overall Accuracy Over 1000 Repetitions: 0.648439560
________________________________________________
['thalach', 'oldpeak', 'ca']
Overall Accuracy Over 1000 Repetitions: 0.688472527
________________________________________________
['thalach', 'oldpeak', 'ca', 'cp']
Overall Accuracy Over 1000 Repetitions: 0.742780220
________________________________________________
['thalach', 'oldpeak', 'ca', 'cp', 'exang']
Overall Accuracy Over 1000 Repetitions: 0.743076923
________________________________________________
['thalach', 'oldpeak', 'ca', 'cp', 'exang', 'chol']
Overall Accuracy Over 1000 Repetitions: 0.736692308
________________________________________________
['thalach', 'oldpeak', 'ca', 'cp', 'exang', 'chol', 'age']
Overall Accuracy Over 1000 Repetitions: 0.737769231
_______________________________________

In [169]:
final_best_features = ['thalach', 'oldpeak', 'ca', 'cp', 'exang', 'chol', 'age', 'trestbps', 'slope', 'sex', 'thal']

featureselection_df = clean_df.copy()
featureselection_df = featureselection_df[final_best_features]
featureselection_df[heart_target] = clean_df[heart_target]

featureselection_df.to_csv("data/filterFeatureSelection.csv")

## Lasso Method

In [172]:
# Feature Selection Using Lasso Method

lasso_df = clean_df.copy()
X = lasso_df[best_features]
y = lasso_df[heart_target].values

reg = LassoCV(cv=5, random_state=0)
reg.fit(X, y)
print("Best alpha using built-in LassoCV: %f" % reg.alpha_)
print("Best score using built-in LassoCV: %f" %reg.score(X,y))
coef = pd.Series(reg.coef_, index = X.columns)
print("Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " +  str(sum(coef == 0)) + " variables")
final_best_features = ['thalach', 'oldpeak', 'ca', 'cp', 'exang', 'chol', 'age', 'trestbps', 'slope', 'sex', 'thal', 'restecg', 'fbs']

featureselection_Lasso_df = clean_df.copy()
featureselection_Lasso_df = featureselection_Lasso_df[final_best_features]
featureselection_Lasso_df[heart_target] = clean_df[heart_target]

featureselection_Lasso_df.to_csv("data/lassoFeatureSelection.csv")

coef

Best alpha using built-in LassoCV: 0.004803
Best score using built-in LassoCV: 0.516022
Lasso picked 12 variables and eliminated the other 1 variables


thalach     0.003369
oldpeak    -0.062537
ca         -0.096794
cp          0.112551
exang      -0.124604
chol       -0.000368
age        -0.000628
trestbps   -0.002029
slope       0.063575
sex        -0.178827
thal       -0.112373
restecg     0.034729
fbs         0.000000
dtype: float64

# Oversampling

In [176]:
#Oversampling to Balance the Dataset

def overSampling(df, name):
    
    oversampling_df = df.copy()

    heart_target = 'target'

    # Balancing (oversampling)
    count_class_1, count_class_0 = oversampling_df[heart_target].value_counts()
    df_class_0 = oversampling_df[oversampling_df[heart_target] == 0]
    df_class_1 = oversampling_df[oversampling_df[heart_target] == 1]
    # Oversample
    df_class_0_over = df_class_0.sample(count_class_1, replace=True)
    final_df = pd.concat([df_class_0_over, df_class_1], axis=0)
    # Split back into X_train and y_train    
    #print('Random over-sampling:\n'+ str(final_df[heart_target].value_counts()))

    final_df.to_csv("data/overSampling_"+name+".csv")
    return final_df

overSampling_df = overSampling(clean_df, 'heart')
overSampling_filter_df = overSampling(featureselection_df, 'filterFeatureSelection')
overSampling_lasso_df = overSampling(featureselection_Lasso_df, 'lassoFeatureSelection')

list_of_datasets = [heart_df, overSampling_df, featureselection_df, overSampling_filter_df, featureselection_Lasso_df, overSampling_lasso_df]
names = ['heart', 'overSampling', 'filter', 'overSampling_filter', 'lasso', 'overSampling_lasso']

In [177]:

for j in range(len(list_of_datasets)):
    
    data = list_of_datasets[j]
    name = names[j]
    
    y = data[heart_target].values
    X = data.drop(columns=[heart_target])
    print("__________________________________________________________"+name)

    all_ac = []
    all_f_macro = []
    all_f_weighted = []
    
    for i in range(1000):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, stratify=y)
        model = DecisionTreeClassifier(criterion='entropy', max_depth= 9, min_samples_leaf= 4, min_samples_split = 8)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        all_f_macro.append(f1_score(y_test, y_pred, average='macro'))
        all_f_weighted.append(f1_score(y_test, y_pred, average='weighted'))
        all_ac.append(accuracy_score(y_test, y_pred))
        
    print('Accuracy Over 1000 Repetitions: %.5f'%statistics.mean(all_ac))
    print('F1-Score Macro Over 1000 Repetitions: %.5f'%statistics.mean(all_f_macro))
    print('F1-Score Weighted Over 1000 Repetitions: %.5f'%statistics.mean(all_f_weighted))

__________________________________________________________heart
Accuracy Over 1000 Repetitions: 0.75138
F1-Score Macro Over 1000 Repetitions: 0.74956
F1-Score Weighted Over 1000 Repetitions: 0.75102
__________________________________________________________overSampling
Accuracy Over 1000 Repetitions: 0.80798
F1-Score Macro Over 1000 Repetitions: 0.80723
F1-Score Weighted Over 1000 Repetitions: 0.80723
__________________________________________________________filter
Accuracy Over 1000 Repetitions: 0.75162
F1-Score Macro Over 1000 Repetitions: 0.74975
F1-Score Weighted Over 1000 Repetitions: 0.75127
__________________________________________________________overSampling_filter
Accuracy Over 1000 Repetitions: 0.79869
F1-Score Macro Over 1000 Repetitions: 0.79781
F1-Score Weighted Over 1000 Repetitions: 0.79781
__________________________________________________________lasso
Accuracy Over 1000 Repetitions: 0.75003
F1-Score Macro Over 1000 Repetitions: 0.74806
F1-Score Weighted Over 1000 Repe

In [178]:
overSampling_df.to_csv("data/BEST_heart.csv")