# Heart-stroke prediction

1. #3 (age): age in years 
2. #4 (sex): sex (1 = male; 0 = female) 
3. #9 (cp): chest pain type (göğüs ağrısı) | Value 0: typical angina | Value 1: atypical angina | Value 2: non-anginal pain | Value 3: asymptomatic 
4. #10 (trestbps): resting blood pressure (in mm Hg on admission to the hospital) 
5. #12 (chol): serum cholesterol in mg/dl 
6. #16 (fbs): (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false) 
7. #19 (restecg): resting electrocardiographic results | Value 0: normal | Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV) | Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria 
8. #32 (thalach): maximum heart rate achieved 
9. #38 (exang): exercise induced angina (1 = yes; 0 = no) 
10. #40 (oldpeak): ST depression induced by exercise relative to rest (Dinlenmeye göre egzersizin neden olduğu ST depresyonu)
11. #41 (slope): the slope of the peak exercise ST segment | Value 1: upsloping | Value 2: flat | Value 3: downsloping (maxium egzersizde ST segmentinin eğimi)
12. #44 (ca): number of major vessels (0-3) colored by fluoroscopy 
13. #51 (thal): 3 = normal; 6 = fixed defect; 7 = reversible defect 
14. #58 (num) (the predicted attribute): Value 0: < 50% diameter narrowing | Value 1: > 50% diameter narrowing 

    https://archive.ics.uci.edu/ml/datasets/heart+disease

# Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# import pandas_profiling as pp
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, log_loss, recall_score,accuracy_score,precision_score,f1_score
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.cluster import KElbowVisualizer
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
warnings.warn("this will not show")

## Exploratory Data Analysis and Visualization

In [2]:
df = pd.read_csv('heart.csv')

In [3]:
df.sample(10) 
# cp (chest pain type), restecg (resting electrocardiography results), slope (slope of the ST segment at peak exercise), thal

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
113,43,1,0,110,211,0,1,161,0,0.0,2,0,3,1
270,46,1,0,120,249,0,0,144,0,0.8,2,0,3,0
92,52,1,2,138,223,0,1,169,0,0.0,2,4,2,1
50,51,0,2,130,256,0,0,149,0,0.5,2,0,2,1
261,52,1,0,112,230,0,1,160,0,0.0,2,1,2,0
245,48,1,0,124,274,0,0,166,0,0.5,1,0,3,0
290,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
275,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
267,49,1,2,118,149,0,0,126,0,0.8,2,3,2,0
105,68,0,2,120,211,0,0,115,0,1.5,1,0,2,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [8]:
df.duplicated().sum()

1

In [5]:
df.shape

(303, 14)

In [9]:
df = df.drop_duplicates()

In [10]:
df.shape

(302, 14)

In [12]:
df.thal.value_counts()
# UCI attribute list for thal: 3 = normal; 6 = fixed defect; 7 = reversible defect.
# NOTE(review): the counts below show codes 0-3 in this file, so the UCI codes do not apply as-is — confirm the encoding.

2    165
3    117
1     18
0      2
Name: thal, dtype: int64

In [None]:
df[df['thal']==0]

In [None]:
# Remove the rows whose thal code is 0 (the label mapping applied later has no entry for 0).
thal_zero_rows = df.index[df['thal'] == 0]
df.drop(thal_zero_rows, inplace=True)

In [None]:
df.thal.value_counts(), df.cp.value_counts(), df.slope.value_counts(), df.restecg.value_counts()

In [None]:
df.columns

In [None]:
# Decode the chest-pain type codes into readable labels.
cp_labels = {
    0: 'typical angina',
    1: 'atypical angina',
    2: 'non-anginal pain',
    3: 'asymptomatic',
}
df['cp'] = df['cp'].replace(cp_labels)

In [None]:
# Decode the resting-ECG result codes into readable labels.
restecg_labels = {
    0: 'normal',
    1: 'having ST-T wave abnormality',
    2: 'left ventricular hypertrophy',
}
df['restecg'] = df['restecg'].replace(restecg_labels)

In [None]:
# Decode the ST-segment slope codes into readable labels.
# NOTE(review): the attribute list at the top of the notebook numbers these 1-3, while this
# mapping uses 0-2 — confirm the encoding against the data source.
slope_labels = {
    0: 'upsloping',
    1: 'flat',
    2: 'downsloping',
}
df['slope'] = df['slope'].replace(slope_labels)

In [None]:
# Decode the thal codes into readable labels.
# NOTE(review): the attribute list at the top of the notebook uses codes 3/6/7; the 1/2/3
# label order used here should be confirmed against the data source.
thal_labels = {
    1: 'normal',
    2: 'fixed defect',
    3: 'reversable defect',
}
df['thal'] = df['thal'].replace(thal_labels)

In [None]:
df

In [None]:
df.describe().T

In [None]:
df.shape

In [None]:
df.target.value_counts(normalize=True)

In [None]:
# Class balance of the target. The series is passed as `x=` because seaborn 0.12+
# treats the first positional argument as `data`, not `x`.
sns.countplot(x=df.target);

In [None]:
# Age distribution. `x=` is explicit because seaborn 0.12+ treats the first
# positional argument as `data`, not `x`.
sns.boxplot(x=df.age);

In [None]:
# Resting blood pressure. `x=` is explicit because seaborn 0.12+ treats the first
# positional argument as `data`, not `x`.
sns.boxplot(x=df.trestbps);

In [None]:
#df=df[df.trestbps<170]

In [None]:
# Serum cholesterol. `x=` is explicit because seaborn 0.12+ treats the first
# positional argument as `data`, not `x`.
sns.boxplot(x=df.chol);

In [None]:
#df=df[df.chol<370]

In [None]:
# Maximum heart rate achieved. `x=` is explicit because seaborn 0.12+ treats the
# first positional argument as `data`, not `x`.
sns.boxplot(x=df.thalach);

In [None]:
#df=df[df.thalach>80]  

In [None]:
# Exercise-induced ST depression. `x=` is explicit because seaborn 0.12+ treats the
# first positional argument as `data`, not `x`.
sns.boxplot(x=df.oldpeak);

In [None]:
#df=df[df.oldpeak<4]

In [None]:
# Number of major vessels (0-3) colored by fluoroscopy. `x=` is explicit because
# seaborn 0.12+ treats the first positional argument as `data`, not `x`.
sns.boxplot(x=df.ca);

In [None]:
#df=df[df.ca<2.5]

In [None]:
# Pairwise scatter plots of the numeric columns, colored by the target class.
numeric_cols = df.select_dtypes(include='number')
sns.pairplot(numeric_cols, hue="target")

In [None]:
# Annotated correlation matrix of the numeric columns.
plt.figure(figsize=(10, 8))
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True)

## Train | Test Split and Scaling

In [None]:
# One-hot encode the string-valued columns (cp, restecg, slope, thal were decoded to labels above);
# drop_first=True drops the first level of each to avoid a redundant column.
df=pd.get_dummies(df,drop_first=True)
df

In [None]:
df.info()

In [None]:
# Separate the label vector from the feature matrix.
y = df["target"]
X = df.drop(columns=["target"])

In [None]:
# Hold out 20% of the rows for testing; stratify on y and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=5,
)

In [None]:
scaler =MinMaxScaler() 

In [None]:
# Fit the scaler on the training split only, then scale that split.
X_train_scaled = scaler.fit_transform(X_train)

In [None]:
# Apply the training-fitted scaler to the test split (transform only, no refit).
X_test_scaled = scaler.transform(X_test)

# Logistic Regression

In [None]:
log_model = LogisticRegression()

In [None]:
log_model.fit(X_train_scaled, y_train)

In [None]:
y_pred=log_model.predict(X_test_scaled)

In [None]:
y_pred_proba = log_model.predict_proba(X_test_scaled)

## Model Performance

In [None]:
def eval_metric(model, X_train, y_train, X_test, y_test):
    """Print the confusion matrix and classification report for the test set, then the train set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_train, y_train : features and labels used for the train-set report.
    X_test, y_test : features and labels used for the test-set report.

    Returns
    -------
    None. Everything is written to stdout.
    """
    # Predictions are computed up front (train first), but the test report is printed first.
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    sections = (
        ("Test_Set", y_test, test_pred),
        ("Train_Set", y_train, train_pred),
    )
    for position, (title, y_true, y_hat) in enumerate(sections):
        if position:
            # Blank line between the two reports.
            print()
        print(title)
        print(confusion_matrix(y_true, y_hat))
        print(classification_report(y_true, y_hat))

In [None]:
eval_metric(log_model, X_train_scaled, y_train, X_test_scaled, y_test)

### Cross Validate

In [None]:
# 10-fold cross-validation of a default logistic regression on the scaled training split.
cv_metrics = ['precision', 'recall', 'f1', 'accuracy']
model = LogisticRegression()
scores = cross_validate(
    model,
    X_train_scaled,
    y_train,
    scoring=cv_metrics,
    cv=10,
)

# One row per fold, numbered 1..10.
df_scores = pd.DataFrame(scores, index=range(1, 11))
df_scores

In [None]:
# Mean of each column across folds, skipping the first two entries (positional slice).
fold_means = df_scores.mean()
fold_means.iloc[2:]

In [None]:
eval_metric(log_model, X_train_scaled, y_train, X_test_scaled, y_test)  #   accuracy:0.82   

### GridSearchCV

In [None]:
# Preview of a log-spaced C grid (100 points from 1e-1 to 1e5).
# NOTE: C is reassigned with a 20-point grid in the grid-search cell before it is used.
C = np.logspace(-1, 5, 100)
C

In [None]:
# Hyper-parameter search for logistic regression, scored by 10-fold CV accuracy.
model = LogisticRegression()

C = np.logspace(-1, 5, 20)
# "C" has to be a flat sequence of scalars. The earlier `[C, 1]` nested the whole array as a
# single candidate, which the estimator rejects, so only C=1 was effectively searched.
C_candidates = [*C, 1]
class_weight = ["balanced", None]

# lbfgs and sag support only the l2 penalty, whereas liblinear and saga support both l1 and l2;
# splitting the grid keeps every candidate fittable instead of silently scoring NaN.
param_grid = [
    {"solver": ["lbfgs", "sag"],
     "penalty": ["l2"],
     "C": C_candidates,
     "class_weight": class_weight},
    {"solver": ["liblinear", "saga"],
     "penalty": ["l1", "l2"],
     "C": C_candidates,
     "class_weight": class_weight},
]


grid_model = GridSearchCV(estimator=model,
                          param_grid=param_grid,
                          cv=10,
                          scoring='accuracy',
                          n_jobs=-1)

In [None]:
grid_model.fit(X_train_scaled,y_train)

In [None]:
grid_model.best_params_

In [None]:
eval_metric(grid_model, X_train_scaled, y_train, X_test_scaled, y_test)  

# Train_set accuracy : 0.85------->0.86

# Test_set accuracy : 0.89 (0.82 cross validation)------->0.89

# log_model final  Accuracy Score: 0.89 ------------- > 7 wrong prediction

## ROC (Receiver Operating Characteristic) Curve and AUC (Area Under the Curve)

In [None]:
from sklearn.metrics import roc_auc_score, auc, roc_curve, average_precision_score, precision_recall_curve

# plot_roc_curve / plot_precision_recall_curve were deprecated in scikit-learn 1.0 and removed
# in 1.2. Fall back to the Display classmethods, which take the same (estimator, X, y)
# arguments, so the plotting cells below keep working on either version.
try:
    from sklearn.metrics import plot_roc_curve, plot_precision_recall_curve
except ImportError:
    from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay

    plot_roc_curve = RocCurveDisplay.from_estimator
    plot_precision_recall_curve = PrecisionRecallDisplay.from_estimator

In [None]:
plot_roc_curve(grid_model, X_test_scaled, y_test)

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn = KNeighborsClassifier()

In [None]:
knn.fit(X_train_scaled, y_train)

In [None]:
knn_pred = knn.predict(X_test_scaled)

In [None]:
eval_metric(knn, X_train_scaled, y_train, X_test_scaled, y_test)

### Elbow Method for Choosing Reasonable K Values

In [None]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

In [None]:
# Test-set error rate (1 - accuracy) of KNN for k = 1..29.
# NOTE(review): every k is scored on the held-out test split, so choosing k from this curve
# tunes the model on test data.
test_error_rates = []

for k in range(1, 30):
    knn_model = KNeighborsClassifier(n_neighbors=k).fit(X_train_scaled, y_train)
    y_pred_test = knn_model.predict(X_test_scaled)
    test_error_rates.append(1 - accuracy_score(y_test, y_pred_test))

In [None]:
# Error rate against k, with two horizontal reference lines at fixed error levels.
fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(range(1, 30), test_error_rates, color='blue', linestyle='--', marker='o',
        markerfacecolor='red', markersize=10)
ax.set_title('Error Rate vs. K Value')
ax.set_xlabel('K_values')
ax.set_ylabel('Error Rate')
ax.hlines(y=0.197, xmin=0, xmax=30, colors='r', linestyles="--")
ax.hlines(y=0.13, xmin=0, xmax=30, colors='r', linestyles="--")

### Scores by Various K Values

In [None]:
# Refit KNN with k=6 on the scaled training split and print train/test reports.
knn = KNeighborsClassifier(n_neighbors=6)  # accuracy:0.87 -------> 8 wrong prediction
knn.fit(X_train_scaled,y_train)
print('WITH K=6\n')
eval_metric(knn, X_train_scaled, y_train, X_test_scaled, y_test)  

In [None]:
# Refit KNN with k=5 on the scaled training split and print train/test reports.
knn = KNeighborsClassifier(n_neighbors=5)  # accuracy:0.84 -------> 10 wrong prediction

knn.fit(X_train_scaled,y_train)
print('WITH K=5\n')
eval_metric(knn, X_train_scaled, y_train, X_test_scaled, y_test)  

In [None]:
# Refit KNN with k=7 on the scaled training split and print train/test reports.
knn = KNeighborsClassifier(n_neighbors=7)   # accuracy:0.85 -------> 9 wrong prediction

knn.fit(X_train_scaled,y_train)
print('WITH K=7\n')
eval_metric(knn, X_train_scaled, y_train, X_test_scaled, y_test)

In [None]:
# Refit KNN with k=14 on the scaled training split and print train/test reports.
knn = KNeighborsClassifier(n_neighbors=14)   # accuracy:0.85 -------> 9 wrong prediction

knn.fit(X_train_scaled,y_train)
# Label fixed from the malformed 'WITH 14K=' so it matches the other K reports.
print('WITH K=14\n')
eval_metric(knn, X_train_scaled, y_train, X_test_scaled, y_test)

In [None]:
# Refit KNN with k=10 on the scaled training split and print train/test reports.
knn = KNeighborsClassifier(n_neighbors=10)   # accuracy:0.82 -------> 11 wrong prediction

knn.fit(X_train_scaled,y_train)
# Label fixed from the malformed 'WITH 10K=' so it matches the other K reports.
print('WITH K=10\n')
eval_metric(knn, X_train_scaled, y_train, X_test_scaled, y_test)

### Cross Validate

In [None]:
# 10-fold cross-validation of KNN (k=6) on the scaled training split.
cv_metrics = ['precision', 'recall', 'f1', 'accuracy']
model = KNeighborsClassifier(n_neighbors=6)
scores = cross_validate(
    model,
    X_train_scaled,
    y_train,
    scoring=cv_metrics,
    cv=10,
)

# One row per fold, numbered 1..10.
df_scores = pd.DataFrame(scores, index=range(1, 11))
df_scores

In [None]:
# Mean of each column across folds, skipping the first two entries (positional slice).
fold_means = df_scores.mean()
fold_means.iloc[2:]

In [None]:
knn = KNeighborsClassifier(n_neighbors=6)  # test_accuracy:    0.87 , train_accuracy :  0.86
                                              
                                           # test_accuracy     0.82  (cross validation)
                                            
                                           # knn final test_accuracy :   0.87     (k=6 with elbow) with 8 wrong prediction
knn.fit(X_train_scaled,y_train)
print('WITH K=6\n')
eval_metric(knn, X_train_scaled, y_train, X_test_scaled, y_test)  


### Gridsearch Method for Choosing Reasonable K Values

In [None]:
knn_grid = KNeighborsClassifier()

In [None]:
k_values= range(1,30)

In [None]:
# Search space for KNN: neighbor count, Minkowski power p, and neighbor weighting.
param_grid = {"n_neighbors":k_values, "p": [1,2], "weights": ['uniform', "distance"]}
# p=1 (Manhattan distance), p=2 (Euclidean distance)
# uniform (all neighbors weighted equally), distance (neighbors weighted by inverse distance)

In [None]:
knn_grid_model = GridSearchCV(knn_grid, param_grid, cv=10, scoring= 'accuracy')

In [None]:
knn_grid_model.fit(X_train_scaled, y_train)

In [None]:
knn_grid_model.best_params_

In [None]:
# Label the report with the k that GridSearchCV actually selected instead of a hard-coded 10,
# so the heading cannot drift from the fitted model.
best_k = knn_grid_model.best_params_["n_neighbors"]
print(f'WITH K={best_k}\n')

# Recorded results:
#  knn      test_accuracy :   0.82  (k=10 with gridsearch) with 11 wrong prediction
#  knn      test_accuracy :   0.87  (k=6 with elbow)  with 8 wrong prediction
eval_metric(knn_grid_model, X_train_scaled, y_train, X_test_scaled, y_test)

### Evaluating ROC Curves and AUC

In [None]:
from sklearn.metrics import precision_recall_curve, roc_auc_score, roc_curve

# plot_roc_curve / plot_precision_recall_curve were deprecated in scikit-learn 1.0 and removed
# in 1.2. Fall back to the Display classmethods, which take the same (estimator, X, y)
# arguments, so the plotting cells below keep working on either version.
try:
    from sklearn.metrics import plot_precision_recall_curve, plot_roc_curve
except ImportError:
    from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay

    plot_roc_curve = RocCurveDisplay.from_estimator
    plot_precision_recall_curve = PrecisionRecallDisplay.from_estimator

In [None]:
# Refit KNN with k=10 for the ROC plot.
# NOTE(review): only n_neighbors is set here; the p / weights values chosen by the grid search
# are not carried over — consider knn_grid_model.best_estimator_ if that is the intent.
knn_model = KNeighborsClassifier(n_neighbors=10).fit(X_train_scaled, y_train)

In [None]:
plot_roc_curve(knn_model, X_test_scaled, y_test) #  knn test_accuracy :   0.82  (k=10 with gridsearch) with 11 wrong prediction

In [None]:
# AUC on the test split, computed from the second predict_proba column
# (presumably the probability of target == 1 — confirm via knn_model.classes_).
y_pred_proba = knn_model.predict_proba(X_test_scaled)
roc_auc_score(y_test, y_pred_proba[:,1])

In [None]:
knn_model = KNeighborsClassifier(n_neighbors=6).fit(X_train_scaled, y_train)

In [None]:
plot_roc_curve(knn_model, X_test_scaled, y_test)  #  knn test_accuracy :   0.87  (k=6 with elbow)  with 8 wrong prediction

# Final Model

In [None]:
# log_model  Accuracy Score: 0.89 ------------- > 7 wrong prediction