In [2]:
#Lets import libraries
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [5]:
#importing models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [11]:
# Loading the dataset.
# The CSV location can be overridden with the HEART_CSV environment variable so the
# notebook is not tied to a single machine; when the variable is not set, the
# original hard-coded location is used as the fallback.
heart_csv_path = os.environ.get("HEART_CSV", r"C:\Users\bharathi.v04\Downloads\DS\heart2.csv")
heart = pd.read_csv(heart_csv_path)

In [12]:
# Preview the first 5 rows to check that the file loaded as expected.
heart.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [13]:
# Size of the dataset as (number of rows, number of columns).
heart.shape

(300, 14)

In [14]:
# Summary of the columns: names, non-null counts and dtypes.
heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       300 non-null    int64  
 1   sex       300 non-null    int64  
 2   cp        300 non-null    int64  
 3   trestbps  300 non-null    int64  
 4   chol      300 non-null    int64  
 5   fbs       300 non-null    int64  
 6   restecg   300 non-null    int64  
 7   thalach   300 non-null    int64  
 8   exang     300 non-null    int64  
 9   oldpeak   300 non-null    float64
 10  slope     300 non-null    int64  
 11  ca        300 non-null    int64  
 12  thal      300 non-null    int64  
 13  target    300 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 32.9 KB


In [15]:
# Count the missing (null) values in each column.
heart.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [16]:
# Class balance: number of rows for each value of 'target'.
heart['target'].value_counts()

1    159
0    141
Name: target, dtype: int64

### Splitting the features and target

In [17]:
# Features are every column except 'target'; the label is the 'target' column itself.
Y = heart.loc[:, 'target']
X = heart.drop(columns=['target'])

In [18]:
# Inspect the feature table (every column of `heart` except 'target').
print(X)

     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0     52    1   0       125   212    0        1      168      0      1.0   
1     53    1   0       140   203    1        0      155      1      3.1   
2     70    1   0       145   174    0        1      125      1      2.6   
3     61    1   0       148   203    0        1      161      0      0.0   
4     62    0   0       138   294    1        1      106      0      1.9   
..   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
295   67    1   0       100   299    0        0      125      1      0.9   
296   67    1   0       120   237    0        1       71      0      1.0   
297   58    1   0       150   270    0        0      111      1      0.8   
298   35    1   1       122   192    0        1      174      0      0.0   
299   52    1   1       120   325    0        1      172      0      0.2   

     slope  ca  thal  
0        2   2     3  
1        0   0     3  
2        0   0    

In [19]:
# Inspect the label column ('target').
print(Y)

0      0
1      0
2      0
3      0
4      0
      ..
295    0
296    0
297    0
298    1
299    1
Name: target, Length: 300, dtype: int64


In [20]:
# Convert both the features and the labels to NumPy arrays before modelling.
X, Y = np.asarray(X), np.asarray(Y)

#### Model Selection

Comparing the models with their default hyperparameter values using cross-validation

In [23]:
# Candidate classifiers for the baseline comparison; each one uses the library
# defaults except for the arguments given explicitly here.
models = [
    LogisticRegression(max_iter=1000),
    SVC(kernel='linear'),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=0),
]

In [26]:
def compare_models_cross_validation(model_list=None, features=None, labels=None, cv=5):
    """Print the cross-validated accuracy of each model.

    For every model the per-fold scores returned by ``cross_val_score`` are
    printed, followed by their mean expressed as a percentage rounded to two
    decimals.

    Parameters
    ----------
    model_list : iterable of estimators, optional
        Models to evaluate. Defaults to the notebook-level ``models`` list.
    features : array-like, optional
        Feature matrix. Defaults to the notebook-level ``X``.
    labels : array-like, optional
        Target values. Defaults to the notebook-level ``Y``.
    cv : int, default 5
        Value passed as ``cv`` to ``cross_val_score``.

    Returns
    -------
    None
        Results are only printed.
    """
    # Fall back to the notebook globals so the existing zero-argument call keeps
    # working; passing arguments explicitly avoids relying on hidden kernel state.
    if model_list is None:
        model_list = models
    if features is None:
        features = X
    if labels is None:
        labels = Y

    for model in model_list:
        cv_score = cross_val_score(model, features, labels, cv=cv)

        # Mean of the fold scores as a percentage with two decimals.
        mean_accuracy = round(np.mean(cv_score) * 100, 2)

        print("Cross Validation Accuracy for the ", model, ' = ', cv_score)
        print("Accuracy score of the model ", model, ' = ', mean_accuracy, '%')
        print("----------------------------------------------------------------------------------------------------------")

In [27]:
# Run the baseline comparison; the per-fold and mean accuracies are printed per model.
compare_models_cross_validation()

Cross Validataion Accuracy for the  LogisticRegression(max_iter=1000)  =  [0.85       0.86666667 0.91666667 0.9        0.88333333]
Accuracy score of the model  LogisticRegression(max_iter=1000)  =  88.33 %
----------------------------------------------------------------------------------------------------------
Cross Validataion Accuracy for the  SVC(kernel='linear')  =  [0.85       0.85       0.9        0.9        0.86666667]
Accuracy score of the model  SVC(kernel='linear')  =  87.33 %
----------------------------------------------------------------------------------------------------------
Cross Validataion Accuracy for the  KNeighborsClassifier()  =  [0.63333333 0.7        0.66666667 0.8        0.73333333]
Accuracy score of the model  KNeighborsClassifier()  =  70.67 %
----------------------------------------------------------------------------------------------------------
Cross Validataion Accuracy for the  RandomForestClassifier(random_state=0)  =  [0.86666667 0.9        0.91666

Inference:
    
For the heart disease dataset, RandomForestClassifier has the highest mean cross-validation accuracy when the models are used with their default hyperparameter values.

Comparing the models with different hyperparameter values using GridSearchCV

In [28]:
# Estimators to tune with GridSearchCV; the values being searched over are
# supplied separately through the hyperparameter dictionary.
models_list = [
    LogisticRegression(max_iter=10000),
    SVC(),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=0),
]

In [30]:
# Hyperparameter search grids, one entry per model. The entries are listed in the
# same order as the estimators in `models_list` because grids are matched to
# models by position.
model_hyperparameters = {
    'log_reg_hyperparameters': {'C': [1, 5, 10, 20]},
    'svc_hyperparameters': {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'C': [1, 5, 10, 20],
    },
    'KNN_hyperparameters': {'n_neighbors': [3, 5, 10]},
    'randomforest_hyperparameters': {'n_estimators': [10, 20, 50, 100]},
}

In [31]:
# Sanity check: the search grids are stored in a plain dict.
type(model_hyperparameters)

dict

In [32]:
# Names of the per-model grid entries, in insertion order.
model_hyperparameters.keys()

dict_keys(['log_reg_hyperparameters', 'svc_hyperparameters', 'KNN_hyperparameters', 'randomforest_hyperparameters'])

In [33]:
# The search grids themselves, in the same order as the keys.
model_hyperparameters.values()

dict_values([{'C': [1, 5, 10, 20]}, {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [1, 5, 10, 20]}, {'n_neighbors': [3, 5, 10]}, {'n_estimators': [10, 20, 50, 100]}])

In [34]:
# Ordered list of the grid names (iterating a dict yields its keys).
model_keys = list(model_hyperparameters)

In [35]:
# Show the ordered grid names.
print(model_keys)

['log_reg_hyperparameters', 'svc_hyperparameters', 'KNN_hyperparameters', 'randomforest_hyperparameters']


In [36]:
# Look up a single grid by its key.
model_hyperparameters['log_reg_hyperparameters']

{'C': [1, 5, 10, 20]}

In [39]:
# Grid names can also be addressed by position; index 3 is the fourth entry.
model_keys[3]

'randomforest_hyperparameters'

In [40]:
# Position -> key -> grid lookup chained in one expression.
model_hyperparameters[model_keys[0]]

{'C': [1, 5, 10, 20]}

Applying GridSearchCV

In [46]:
def ModelSelection(list_of_models, hyperparameter_dictionary, features=None, labels=None, cv=5):
    """Tune each model with GridSearchCV and tabulate the best result per model.

    Models and search grids are paired by position: the i-th model in
    ``list_of_models`` is tuned with the i-th value of
    ``hyperparameter_dictionary`` (dictionaries keep insertion order). The
    pairing is derived from the dictionary that is passed in, not from any
    notebook-level list of keys.

    Parameters
    ----------
    list_of_models : iterable of estimators
        Models to tune.
    hyperparameter_dictionary : dict
        Maps a descriptive name to the parameter grid for the model at the
        same position in ``list_of_models``.
    features : array-like, optional
        Feature matrix. Defaults to the notebook-level ``X``.
    labels : array-like, optional
        Target values. Defaults to the notebook-level ``Y``.
    cv : int, default 5
        Value passed as ``cv`` to ``GridSearchCV``.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``'model_used'``,
        ``'highest score'`` (``best_score_``) and ``'best hyperparameter'``
        (``best_params_``).

    Raises
    ------
    ValueError
        If the number of models differs from the number of grids.
    """
    list_of_models = list(list_of_models)
    param_grids = list(hyperparameter_dictionary.values())

    # Positional pairing only makes sense when both sides have the same length;
    # fail loudly instead of silently mispairing or truncating.
    if len(list_of_models) != len(param_grids):
        raise ValueError(
            "Expected one hyperparameter grid per model, got "
            f"{len(list_of_models)} model(s) and {len(param_grids)} grid(s)."
        )

    # Fall back to the notebook globals so existing two-argument calls keep working.
    if features is None:
        features = X
    if labels is None:
        labels = Y

    result = []

    for model, params in zip(list_of_models, param_grids):

        print(model)
        print(params)

        classifier = GridSearchCV(model, params, cv=cv)

        # Fit the grid search on the data.
        classifier.fit(features, labels)

        result.append({
            'model_used': model,
            'highest score': classifier.best_score_,
            'best hyperparameter': classifier.best_params_,
        })

    result_dataframe = pd.DataFrame(result, columns=['model_used', 'highest score', 'best hyperparameter'])

    return result_dataframe
    
    

In [47]:
# Tune every candidate model and display the best score and parameters per model.
ModelSelection(models_list, model_hyperparameters)

LogisticRegression(max_iter=10000)
{'C': [1, 5, 10, 20]}
SVC()
{'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [1, 5, 10, 20]}
KNeighborsClassifier()
{'n_neighbors': [3, 5, 10]}
RandomForestClassifier(random_state=0)
{'n_estimators': [10, 20, 50, 100]}


Unnamed: 0,model_used,highest score,best hyperparameter
0,LogisticRegression(max_iter=10000),0.883333,{'C': 1}
1,SVC(),0.873333,"{'C': 1, 'kernel': 'linear'}"
2,KNeighborsClassifier(),0.716667,{'n_neighbors': 10}
3,RandomForestClassifier(random_state=0),0.91,{'n_estimators': 20}
