Importing Dependencies


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score 
from sklearn.preprocessing import StandardScaler

importing models

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

Data collection and preprocessing

In [3]:
heart_df=pd.read_csv('/content/drive/MyDrive/heart.csv')

In [4]:
#printing the 5 rows of the datasets
heart_df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [5]:
#checking the number of rows and columns
heart_df.shape

(303, 14)

In [6]:
#checking null values 
heart_df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [7]:
#checking the values count of dependent column
heart_df['target'].value_counts()

1    165
0    138
Name: target, dtype: int64

1--> Defective heart\
0--> Healthy heart

Standraization of datasets

In [8]:
scale=StandardScaler()
std_heart_df=pd.DataFrame(scale.fit_transform(heart_df),columns=heart_df.columns)


In [9]:
std_heart_df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,0.952197,0.681005,1.973123,0.763956,-0.256334,2.394438,-1.005832,0.015443,-0.696631,1.087338,-2.274579,-0.714429,-2.148873,0.914529
1,-1.915313,0.681005,1.002577,-0.092738,0.072199,-0.417635,0.898962,1.633471,-0.696631,2.122573,-2.274579,-0.714429,-0.512922,0.914529
2,-1.474158,-1.468418,0.032031,-0.092738,-0.816773,-0.417635,-1.005832,0.977514,-0.696631,0.310912,0.976352,-0.714429,-0.512922,0.914529
3,0.180175,0.681005,0.032031,-0.663867,-0.198357,-0.417635,0.898962,1.239897,-0.696631,-0.206705,0.976352,-0.714429,-0.512922,0.914529
4,0.290464,-1.468418,-0.938515,-0.663867,2.08205,-0.417635,0.898962,0.583939,1.435481,-0.379244,0.976352,-0.714429,-0.512922,0.914529


In [10]:
#convert all float value to int
std_heart_df=std_heart_df.astype(int)

In [11]:
#checking datatype of columns
std_heart_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   age       303 non-null    int64
 1   sex       303 non-null    int64
 2   cp        303 non-null    int64
 3   trestbps  303 non-null    int64
 4   chol      303 non-null    int64
 5   fbs       303 non-null    int64
 6   restecg   303 non-null    int64
 7   thalach   303 non-null    int64
 8   exang     303 non-null    int64
 9   oldpeak   303 non-null    int64
 10  slope     303 non-null    int64
 11  ca        303 non-null    int64
 12  thal      303 non-null    int64
 13  target    303 non-null    int64
dtypes: int64(14)
memory usage: 33.3 KB


spilting the feature and target

In [12]:
X=std_heart_df.iloc[:,:13]
Y=std_heart_df.iloc[:,13]

In [13]:
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,0,0,1,0,0,2,-1,0,0,1,-2,0,-2
1,-1,0,1,0,0,0,0,1,0,2,-2,0,0
2,-1,-1,0,0,0,0,-1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,0,0,0
4,0,-1,0,0,2,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,0,-1,0,0,0,0,0,-1,1,0,0,0,1
299,-1,0,1,-1,0,0,0,0,0,0,0,0,1
300,1,0,0,0,-1,2,0,0,0,2,0,1,1
301,0,0,0,0,-2,0,0,-1,1,0,0,0,1


In [14]:
Y

0      0
1      0
2      0
3      0
4      0
      ..
298   -1
299   -1
300   -1
301   -1
302   -1
Name: target, Length: 303, dtype: int64

Train Test Split

In [15]:
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=3)

In [16]:
print(X.shape,x_train.shape,x_test.shape)

(303, 13) (242, 13) (61, 13)


Comparing the performance of the model

In [17]:
#list od model
models=[LogisticRegression(max_iter=1000),SVC(kernel='linear'),KNeighborsClassifier(),RandomForestClassifier()]

In [18]:
def compare_model_train_test():
  for model in models:
    #training the model
    model.fit(x_train,y_train)

    #evaluting the model
    test_data_prediction=model.predict(x_test)

    accuracy=accuracy_score(y_test,test_data_prediction)
    print("accuracy score of model " , model, ':', accuracy)
    

In [19]:
compare_model_train_test()

accuracy score of model  LogisticRegression(max_iter=1000) : 0.7704918032786885
accuracy score of model  SVC(kernel='linear') : 0.7540983606557377
accuracy score of model  KNeighborsClassifier() : 0.7868852459016393
accuracy score of model  RandomForestClassifier() : 0.7377049180327869


Cross Validation

Logistic Regression

In [20]:
cv_score_LR=cross_val_score(LogisticRegression(max_iter=1000),X,Y,cv=5)
print(cv_score_LR)
mean_accuracy_LR=sum(cv_score_LR)/len(cv_score_LR)
mean_accuracy_LR=mean_accuracy_LR*100
mean_accuracy_LR=round(mean_accuracy_LR,2)
print(mean_accuracy_LR)


[0.7704918  0.81967213 0.73770492 0.86666667 0.68333333]
77.56


Support Vector Classifier

In [21]:
cv_score_SVC=cross_val_score(SVC(kernel='linear'),X,Y,cv=5)
mean_accuracy_SVC=sum(cv_score_SVC)/len(cv_score_SVC)
print(cv_score_SVC)
mean_accuracy_SVC=mean_accuracy_SVC*100
mean_accuracy_SVC=round(mean_accuracy_SVC,2)
print(mean_accuracy_SVC)

[0.75409836 0.85245902 0.68852459 0.8        0.66666667]
75.23


Creating the function to comparing the models

In [22]:
#list of models
models=[LogisticRegression(max_iter=1000),SVC(kernel='linear'),KNeighborsClassifier(),RandomForestClassifier()]

In [27]:
def compare_models_cross_validation():
  for model in models:
    cv_score=cross_val_score(model,X,Y,cv=5)
    mean_accuracy=sum(cv_score)/len(cv_score)
    mean_acuracy=mean_accuracy*100
    mean_accuracy=round(mean_accuracy,2)
    print('cross validation  accuracy for model',model ,':',cv_score)
    print("accuracy of the model",model,':',mean_accuracy)
    print("***********")

In [28]:
compare_models_cross_validation()

cross validation  accuracy for model LogisticRegression(max_iter=1000) : [0.7704918  0.81967213 0.73770492 0.86666667 0.68333333]
accuracy of the model LogisticRegression(max_iter=1000) : 0.78
***********
cross validation  accuracy for model SVC(kernel='linear') : [0.75409836 0.85245902 0.68852459 0.8        0.66666667]
accuracy of the model SVC(kernel='linear') : 0.75
***********
cross validation  accuracy for model KNeighborsClassifier() : [0.80327869 0.67213115 0.7704918  0.8        0.68333333]
accuracy of the model KNeighborsClassifier() : 0.75
***********
cross validation  accuracy for model RandomForestClassifier() : [0.7704918  0.75409836 0.75409836 0.78333333 0.7       ]
accuracy of the model RandomForestClassifier() : 0.75
***********
