# Important Libraries

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
import torch
import torch.nn as nn # neural network
import torch.nn.functional as F
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

from sklearn.feature_selection import SelectKBest, chi2,f_classif,RFE
from sklearn.metrics import roc_auc_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns


# Loading Transformed Dataset

In [2]:
df=pd.read_csv("transformed_CVD_data.csv")
df.head(10)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,50.391781,2,168,62.0,110,80,1,1,0,0,1,0
1,55.419178,1,156,85.0,140,90,3,1,0,0,1,1
2,51.663014,1,165,64.0,130,70,3,1,0,0,0,1
3,48.282192,2,169,82.0,150,100,1,1,0,0,1,1
4,47.873973,1,156,56.0,100,60,1,1,0,0,0,0
5,60.038356,1,151,67.0,120,80,2,2,0,0,0,0
6,60.583562,1,157,93.0,130,80,3,1,0,0,1,0
7,61.873973,2,178,95.0,130,90,3,3,0,0,1,1
8,48.405479,1,158,71.0,110,70,1,1,0,0,1,0
9,54.339726,1,164,68.0,110,60,1,1,0,0,0,0


# Spliting Data into Train & Test Set

In [3]:
y=df['cardio'] 
x=df.drop(['cardio'],axis=1)

In [6]:
# xtr : X Input Training Data
# xts : X Input Testing Data
# ytr : Y Traget Variable Input
# yts : Y Target Variable Testing
xtr,xts,ytr,yts=train_test_split(x,y,test_size=0.25,random_state=23,stratify=y)

# Creating Pipeline

In [9]:
LogisticRegressionPipeline=Pipeline([('scaler', StandardScaler()),
                                     ('logistic_classifier', LogisticRegression())])

In [19]:
RandomForestClassifierPipeline=Pipeline([('scaler', StandardScaler()),
                                     ('rf', RandomForestClassifier(n_estimators=100,max_depth=5,min_samples_leaf=5,min_samples_split=3))])

params= {
'rf__n_estimators': [120, 140],
'rf__max_depth': [30, 50],
'rf__min_samples_split': [2, 3],
'rf__min_samples_leaf': [3, 5],
'rf__class_weight': [{0: 1, 1: 1}, {0: 1, 1:5},{0:1,1:3}, 'balanced']}

rf_cv=GridSearchCV(RandomForestClassifierPipeline,param_grid=params ,scoring = 'roc_auc', cv = 3)
rf_cv.fit(xtr, ytr)

# Get the best hyperparameters
best_params = rf_cv.best_params_

# Print the best hyperparameters
print(best_params)

{'rf__class_weight': 'balanced', 'rf__max_depth': 50, 'rf__min_samples_leaf': 5, 'rf__min_samples_split': 2, 'rf__n_estimators': 140}


In [21]:
RandomForestClassifier_Pipeline=Pipeline([('scaler', StandardScaler()),
                                     ('rf', RandomForestClassifier(class_weight= 'balanced', max_depth=50, min_samples_leaf= 5, min_samples_split=2, n_estimators=140))])


In [14]:
SVMPipeline=Pipeline([('scaler', StandardScaler()),
                      ('svm', SVC(gamma="scale",C=5))])

In [15]:
XGBClassifierPipeline=Pipeline([('scaler', StandardScaler()),
                      ('model', XGBClassifier( gamma= 4.198875359789924, max_depth= 17, min_child_weight= 1, reg_alpha= 57))])

In [16]:
GradientBoostingClassifierPipeline=Pipeline([('scaler', StandardScaler()),
                                             ('gbc', GradientBoostingClassifier(learning_rate=0.1, min_samples_split=500,min_samples_leaf=50,max_depth=8,max_features='sqrt',subsample=0.8,random_state=10))])

In [22]:
mypipeline=[LogisticRegressionPipeline, RandomForestClassifier_Pipeline, SVMPipeline, XGBClassifierPipeline, GradientBoostingClassifierPipeline]

In [23]:
mypipeline=[LogisticRegressionPipeline, RandomForestClassifierPipeline, SVMPipeline, XGBClassifierPipeline, GradientBoostingClassifierPipeline]
for mypipe in mypipeline:
    mypipe.fit(xtr,ytr)
    cv=ShuffleSplit(n_splits=5,test_size=0.3,random_state=0)
    scores = cross_val_score(mypipe, xtr, ytr, cv=cv)
    print("{} Test Acurracy: {}".format(mypipe[1], max(scores)))

LogisticRegression() Test Acurracy: 0.7279973649538867
RandomForestClassifier(max_depth=5, min_samples_leaf=5, min_samples_split=3) Test Acurracy: 0.7304347826086957
SVC(C=5) Test Acurracy: 0.7333333333333333
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=4.198875359789924, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=17,
              max_leaves=0, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=57, reg_lambda=1, ...) Test Acurracy: 0.733399209486166
GradientBoostingClassifier(