In [1]:
import pandas as pd 
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression, RidgeClassifier, LinearRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import classification_report, accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline

In [2]:
df= pd.read_csv('heart_ds.csv') 
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,Target
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [3]:
df['ChestPainType'].value_counts()

ASY    496
NAP    203
ATA    173
TA      46
Name: ChestPainType, dtype: int64

In [4]:
df['ExerciseAngina'].value_counts()

N    547
Y    371
Name: ExerciseAngina, dtype: int64

In [5]:
df['Sex'].value_counts()

M    725
F    193
Name: Sex, dtype: int64

In [6]:
df['RestingECG'].value_counts()

Normal    552
LVH       188
ST        178
Name: RestingECG, dtype: int64

In [7]:
df['ST_Slope'].value_counts()

Flat    460
Up      395
Down     63
Name: ST_Slope, dtype: int64

Sex: sex of the patient [M: Male - 1, F: Female - 0]
ChestPainType: chest pain type 
[TA: Typical Angina - 3, 
ATA: Atypical Angina - 1, 
NAP: Non-Anginal Pain - 2, 
ASY: Asymptomatic - 0 ]
RestingECG: resting electrocardiogram results [
Normal: Normal - 1, 
ST: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV) - 2, 
LVH: showing probable or definite left ventricular hypertrophy by Estes' criteria - 0]
ExerciseAngina: exercise-induced angina [Y: Yes - 1, N: No - 0]
ST_Slope: the slope of the peak exercise ST segment [
Up: upsloping - 2, 
Flat: flat - 1, 
Down: downsloping - 0]

In [8]:
Encoder = LabelEncoder()
for column in ['Sex', 'ChestPainType', 'RestingECG','ExerciseAngina', 'ST_Slope']:
    df[column] = Encoder.fit_transform(df[column])

In [9]:
df.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
Target            0
dtype: int64

In [10]:
scalar_min_max = MinMaxScaler()
df[['MaxHR', 'Cholesterol', 'Age', 'RestingBP']] = scalar_min_max.fit_transform(df[['MaxHR', 'Cholesterol', 'Age', 'RestingBP']])

In [11]:
df.head(50)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,Target
0,0.244898,1,1,0.7,0.47927,0,1,0.788732,0,0.0,2,0
1,0.428571,0,2,0.8,0.298507,0,1,0.676056,0,1.0,1,1
2,0.183673,1,1,0.65,0.46932,0,2,0.267606,0,0.0,2,0
3,0.408163,0,0,0.69,0.354892,0,1,0.338028,1,1.5,1,1
4,0.530612,1,2,0.75,0.323383,0,1,0.43662,0,0.0,2,0
5,0.22449,1,2,0.6,0.562189,0,1,0.774648,0,0.0,2,0
6,0.346939,0,1,0.65,0.393035,0,1,0.774648,0,0.0,2,0
7,0.530612,1,1,0.55,0.344942,0,1,0.577465,0,0.0,2,0
8,0.183673,1,0,0.7,0.343284,0,1,0.492958,1,1.5,1,1
9,0.408163,0,1,0.6,0.470978,0,1,0.422535,0,0.0,2,0


In [12]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= .2, random_state=42)

In [13]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
RandomForestClassifier = RandomForestClassifier(n_estimators=6)
RandomForestClassifier.fit(X_train, y_train)

acc_random_forest = round(RandomForestClassifier.score(X_train, y_train), 3) 
random_forest = round(RandomForestClassifier.score(X_test, y_test), 3) 

print("RandomForestClassifier - Train: ", acc_random_forest)
print("RandomForestClassifier - Test: ", random_forest)

RandomForestClassifier - Train:  0.981
RandomForestClassifier - Test:  0.821


In [15]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

acc_log = round(logreg.score(X_train, y_train), 3)
log = round(logreg.score(X_test, y_test), 3)

print("LogisticRegression - Train: ", acc_log)
print("LogisticRegression - Test: ", log)

LogisticRegression - Train:  0.861
LogisticRegression - Test:  0.848


In [16]:
knn_model = KNeighborsClassifier(n_neighbors=1) 
knn_model.fit(X_train_scaled, y_train)

y_pred = knn_model.predict(X_test_scaled)

acc_k_neigh = accuracy_score(y_test, y_pred)
print(f'KNeighborsClassifier - Accuracy: {acc_k_neigh:.3f}')

KNeighborsClassifier - Accuracy: 0.804


In [17]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.86      0.79        77
           1       0.88      0.77      0.82       107

    accuracy                           0.80       184
   macro avg       0.80      0.81      0.80       184
weighted avg       0.82      0.80      0.81       184


In [18]:
MN = GaussianNB()
MN.fit(X_train,y_train)

y_pred = MN.predict(X_test)

acc_MN = round(accuracy_score(y_pred,y_test), 3)
print("GaussianNB - Accuracy: ", acc_MN)

GaussianNB - Accuracy:  0.842


In [19]:
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

y_pred = dt_model.predict(X_test)

acc_des_tree = accuracy_score(y_test, y_pred)
print(f'DecisionTreeClassifier - Accuracy: {acc_des_tree:.3f}')

DecisionTreeClassifier - Accuracy: 0.788


In [20]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.83      0.77        77
           1       0.86      0.76      0.81       107

    accuracy                           0.79       184
   macro avg       0.79      0.79      0.79       184
weighted avg       0.80      0.79      0.79       184


In [21]:
svc_model = SVC()

kf = KFold(n_splits=5, shuffle=True, random_state=42)

X_scaled = scaler.fit_transform(X)

svc_model = SVC()

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

accuracy_scores_kfold = cross_val_score(svc_model, X_scaled, y, cv=kfold, scoring='accuracy')
svc_model.fit(X_scaled, y)

print(f'Accuracy Scores: {accuracy_scores_kfold}')
print(f'Mean Accuracy: {accuracy_scores_kfold.mean():.3f}')

Accuracy Scores: [0.86413043 0.86956522 0.89130435 0.83060109 0.85245902]
Mean Accuracy: 0.862


In [22]:
ridge_classifier = RidgeClassifier(alpha=1.0) 
ridge_classifier.fit(X_train_scaled, y_train)

y_pred = ridge_classifier.predict(X_test_scaled)
acc_ridge = accuracy_score(y_test, y_pred)
print(f'RidgeClassifier - Accuracy: {acc_ridge:.3f}')

RidgeClassifier - Accuracy: 0.837


In [23]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.87      0.82        77
           1       0.90      0.81      0.85       107

    accuracy                           0.84       184
   macro avg       0.83      0.84      0.84       184
weighted avg       0.84      0.84      0.84       184


In [24]:
gradient_boosting_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gradient_boosting_classifier.fit(X_train_scaled, y_train)

y_pred = gradient_boosting_classifier.predict(X_test_scaled)
acc_grad_boost = accuracy_score(y_test, y_pred)
print(f'GradientBoostingClassifier - Accuracy: {acc_grad_boost:.3f}')

GradientBoostingClassifier - Accuracy: 0.875


In [25]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.90      0.86        77
           1       0.92      0.86      0.89       107

    accuracy                           0.88       184
   macro avg       0.87      0.88      0.87       184
weighted avg       0.88      0.88      0.88       184


In [26]:
extra_trees_classifier = ExtraTreesClassifier(n_estimators=100, max_depth=None, min_samples_split=2, random_state=42)
extra_trees_classifier.fit(X_train_scaled, y_train)

y_pred = extra_trees_classifier.predict(X_test_scaled)
acc_extra_tree = accuracy_score(y_test, y_pred)
print(f'ExtraTreesClassifier - Accuracy: {acc_extra_tree:.3f}')

ExtraTreesClassifier - Accuracy: 0.886


In [27]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.90      0.87        77
           1       0.92      0.88      0.90       107

    accuracy                           0.89       184
   macro avg       0.88      0.89      0.88       184
weighted avg       0.89      0.89      0.89       184


In [28]:
dummy_classifier = DummyClassifier(strategy='uniform')  
dummy_classifier.fit(X_train_scaled, y_train)

y_pred = dummy_classifier.predict(X_test_scaled)

acc_dummy = accuracy_score(y_test, y_pred)
print(f'DummyClassifier - Accuracy: {acc_dummy:.2f}')

DummyClassifier - Accuracy: 0.56


In [29]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.48      0.53      0.50        77
           1       0.63      0.58      0.60       107

    accuracy                           0.56       184
   macro avg       0.55      0.56      0.55       184
weighted avg       0.57      0.56      0.56       184


In [30]:
k_neighbors_regressor = KNeighborsRegressor(n_neighbors=5) 
k_neighbors_regressor.fit(X_train_scaled, y_train)


y_pred = k_neighbors_regressor.predict(X_test_scaled)

mse_kreg = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse_kreg:.2f}')
print(f'R2 Score: {r2:.2f}')

Mean Squared Error: 0.11
R2 Score: 0.54


In [31]:
linear_regression_model = LinearRegression()
linear_regression_model.fit(X_train_scaled, y_train)

y_pred = linear_regression_model.predict(X_test_scaled)

mse_lin = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse_lin:.3f}')
print(f'R2 Score: {r2:.2f}')

Mean Squared Error: 0.137
R2 Score: 0.44


In [32]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', SVC())
])
param_grid = {
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf'],
    'classifier__gamma': [0.01, 0.1, 1]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train_scaled, y_train)


best_params = grid_search.best_params_
print(f'Best Parameters: {best_params}')

y_pred = grid_search.predict(X_test_scaled)


acc_grid = accuracy_score(y_test, y_pred)
print(f'GridSearchCV - Accuracy: {acc_grid:.2f}')

Best Parameters: {'classifier__C': 1, 'classifier__gamma': 0.1, 'classifier__kernel': 'rbf'}
GridSearchCV - Accuracy: 0.88


In [33]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.87      0.85        77
           1       0.90      0.88      0.89       107

    accuracy                           0.88       184
   macro avg       0.87      0.87      0.87       184
weighted avg       0.88      0.88      0.88       184


In [34]:
models = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest', 'GridSearchCV', 'Ridge Classifier', 'Linear Regression',
              'Gradient Boosting Classifier', 'Extra Trees Classifier', 'KNeighbors Classifier', 'KNeighbors Regressor',
              'Gaussian NB', 'Decision Tree Classifier', 'DummyClassifier'],
    
    'Testing_score' : [log, random_forest, acc_grid, acc_ridge, mse_lin, acc_grad_boost, acc_extra_tree, acc_k_neigh,
                       mse_kreg, acc_MN, acc_des_tree, acc_dummy]})

models.sort_values(by='Testing_score', ascending=False)

Unnamed: 0,Model,Testing_score
6,Extra Trees Classifier,0.88587
2,GridSearchCV,0.875
5,Gradient Boosting Classifier,0.875
0,Logistic Regression,0.848
9,Gaussian NB,0.842
3,Ridge Classifier,0.836957
1,Random Forest,0.821
7,KNeighbors Classifier,0.804348
10,Decision Tree Classifier,0.788043
11,DummyClassifier,0.559783
