In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the CVD dataset from a CSV file located in the working directory.
cvd_data = pd.read_csv(filepath_or_buffer="CVD_cleaned.csv")

In [None]:

# Show the distinct values of each categorical column so that the category
# orderings declared for the encoders can be checked against the data.
for _column in (
    "General_Health",
    "Checkup",
    "Age_Category",
    "Exercise",
    "Heart_Disease",
    "Skin_Cancer",
    "Other_Cancer",
    "Depression",
    "Diabetes",
    "Arthritis",
    "Smoking_History",
):
    print(f"Unique Values of {_column} Column: ", cvd_data[_column].unique())

In [None]:
# Single source of truth: each ordinally-encoded column mapped to its category
# labels, listed in the order the encoder should use. Keeping names and
# orderings side by side means the two derived lists below can never drift out
# of alignment.
_ordinal_spec = {
    "General_Health": ['Poor', 'Fair', 'Good', 'Very Good', 'Excellent'],
    "Checkup": [
        'Never',
        '5 or more years ago',
        'Within the past 5 years',
        'Within the past 2 years',
        'Within the past year',
    ],
    "Age_Category": [
        '18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54',
        '55-59', '60-64', '65-69', '70-74', '75-79', '80+',
    ],
    "Diabetes": [
        "No",
        "No, pre-diabetes or borderline diabetes",
        "Yes, but female told only during pregnancy",
        "Yes",
    ],
    # The remaining columns are binary; each one gets its own ["No", "Yes"] list.
    **{
        binary_col: ["No", "Yes"]
        for binary_col in (
            "Exercise",
            "Heart_Disease",
            "Skin_Cancer",
            "Other_Cancer",
            "Depression",
            "Arthritis",
            "Smoking_History",
        )
    },
}

# Parallel lists: ordinal_categories[i] is the ordering for ordinal_cols[i].
ordinal_cols = list(_ordinal_spec.keys())
ordinal_categories = list(_ordinal_spec.values())

# Columns that are one-hot encoded instead of ordinally encoded.
onehot_cols = ["Sex"]

In [None]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

# Ordinal encoder: uses the explicit per-column orderings in ordinal_categories.
or_enc = OrdinalEncoder(categories=ordinal_categories)
# One-hot encoder: dense output; categories unseen during fit are ignored.
onehot_enc = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

# Route each column group to its encoder; every other column is passed through
# unchanged, and output column names are not prefixed with the transformer name.
_categorical_encoder = ColumnTransformer(
    transformers=[
        ("ordinal", or_enc, ordinal_cols),
        ("one_hot", onehot_enc, onehot_cols),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
)

transformer_pipeline = make_pipeline(_categorical_encoder)
# Ask the pipeline to return pandas DataFrames so columns keep their names.
transformer_pipeline.set_output(transform="pandas")

# Fit the encoders on the full dataset and encode it in a single step.
transformed_cvd_train = transformer_pipeline.fit_transform(cvd_data)

In [None]:
# Separate the encoded frame into the feature matrix and the target column.
_target_column = "Heart_Disease"
cvd_X = transformed_cvd_train.drop(columns=[_target_column])
cvd_Y = transformed_cvd_train[_target_column]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for final evaluation; the remaining 70% is used for
# training and cross-validated model selection. The seed is fixed so the split
# is reproducible.
# NOTE(review): the split is not stratified on the target — confirm that is intended.
(
    cvd_train_vali_X,
    cvd_holdout_X,
    cvd_train_vali_Y,
    cvd_holdout_Y,
) = train_test_split(cvd_X, cvd_Y, test_size=0.3, random_state=0)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Logistic regression with standardised inputs. The scaler lives inside the
# pipeline that is handed to GridSearchCV, so it is fitted as part of each
# candidate fit rather than once on the whole training set.
scaler = StandardScaler()
scale_cols = list(cvd_train_vali_X.columns)
cols_trfrm = ColumnTransformer(transformers=[("scaler", scaler, scale_cols)],
                               remainder="passthrough",
                               verbose_feature_names_out=False)
model_lr=LogisticRegression()
model_lr_pipeline = Pipeline(steps=[("cols_transform", cols_trfrm), ("logistic", model_lr)])

# Search the inverse regularisation strength C over 0.1, 0.2, ..., 1.9.
param_grid_lr = {'logistic__C':np.arange(0.1,2,0.1)}
clf_lr = GridSearchCV(model_lr_pipeline, param_grid_lr, return_train_score=True, scoring="accuracy", verbose=3)
clf_lr.fit(cvd_train_vali_X, cvd_train_vali_Y)

# Mean train / validation accuracy per candidate, in candidate order.
train_scores_lr = clf_lr.cv_results_['mean_train_score']
validation_scores_lr = clf_lr.cv_results_['mean_test_score']

# Accuracy is a higher-is-better score, so the best candidate is the one the
# search itself ranked first (best_index_), not the minimum. Indexing both
# arrays with best_index_ keeps the reported scores consistent with best_params_.
best_validation_score_lr = validation_scores_lr[clf_lr.best_index_]
best_train_scores_lr = train_scores_lr[clf_lr.best_index_]
best_params_lr = clf_lr.best_params_

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: grid-search the number of trees, max_features and max_depth.
model_rf=RandomForestClassifier(random_state=0)
param_grid_rf = {'n_estimators':np.array([5,10,20]),
                 'max_features':np.arange(1,11,2),
                 'max_depth':np.arange(1,11,3)}
clf_rf = GridSearchCV(model_rf, param_grid_rf, return_train_score=True, scoring="accuracy", verbose=3)
clf_rf.fit(cvd_train_vali_X, cvd_train_vali_Y)

# Mean train / validation accuracy per candidate, in candidate order.
train_scores_rf = clf_rf.cv_results_['mean_train_score']
validation_scores_rf = clf_rf.cv_results_['mean_test_score']

# Accuracy is a higher-is-better score, so the best candidate is the one the
# search itself ranked first (best_index_), not the minimum. Indexing both
# arrays with best_index_ keeps the reported scores consistent with best_params_.
best_validation_score_rf = validation_scores_rf[clf_rf.best_index_]
best_train_scores_rf = train_scores_rf[clf_rf.best_index_]
best_params_rf = clf_rf.best_params_


In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees: grid-search the number of trees, max_features and max_depth.
model_erf=ExtraTreesClassifier(random_state=0)
param_grid_erf = {'n_estimators':np.array([5,10,20]),
                 'max_features':np.arange(1,11,2),
                 'max_depth':np.arange(1,11,3)}
clf_erf = GridSearchCV(model_erf, param_grid_erf, return_train_score=True, scoring="accuracy", verbose=3)
clf_erf.fit(cvd_train_vali_X, cvd_train_vali_Y)

# Mean train / validation accuracy per candidate, in candidate order.
train_scores_erf = clf_erf.cv_results_['mean_train_score']
validation_scores_erf = clf_erf.cv_results_['mean_test_score']

# Accuracy is a higher-is-better score, so the best candidate is the one the
# search itself ranked first (best_index_), not the minimum. Indexing both
# arrays with best_index_ keeps the reported scores consistent with best_params_.
best_validation_score_erf = validation_scores_erf[clf_erf.best_index_]
best_train_scores_erf = train_scores_erf[clf_erf.best_index_]
# Backward-compatible alias for the original misspelled variable name.
best_train_scorees_erf = best_train_scores_erf
best_params_erf = clf_erf.best_params_

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost: grid-search the number of boosting rounds and the learning rate.
model_ab=AdaBoostClassifier(random_state=0)
param_grid_ab = {'n_estimators':np.array([10,30,50,80,100]),
                 'learning_rate':np.arange(0.1,2,0.2)}
clf_ab = GridSearchCV(model_ab, param_grid_ab, return_train_score=True, scoring="accuracy", verbose=3)
clf_ab.fit(cvd_train_vali_X, cvd_train_vali_Y)

# Mean train / validation accuracy per candidate, in candidate order.
train_scores_ab = clf_ab.cv_results_['mean_train_score']
validation_scores_ab = clf_ab.cv_results_['mean_test_score']

# Accuracy is a higher-is-better score, so the best candidate is the one the
# search itself ranked first (best_index_), not the minimum. Indexing both
# arrays with best_index_ keeps the reported scores consistent with best_params_.
best_validation_score_ab = validation_scores_ab[clf_ab.best_index_]
best_train_scores_ab = train_scores_ab[clf_ab.best_index_]
# Backward-compatible alias for the original misspelled variable name.
best_train_scorees_ab = best_train_scores_ab
best_params_ab = clf_ab.best_params_

In [None]:
import xgboost as xgb

# XGBoost: grid-search the number of boosting rounds, learning rate and tree depth.
model_xg=xgb.XGBClassifier()
param_grid_xg = {'n_estimators':np.array([5,10,20]),
                 'learning_rate':np.arange(0.1,2,0.2),
                 'max_depth':np.arange(1,11,3)}
clf_xg = GridSearchCV(model_xg, param_grid_xg, return_train_score=True, scoring="accuracy", verbose=3)
clf_xg.fit(cvd_train_vali_X, cvd_train_vali_Y)

# Mean train / validation accuracy per candidate, in candidate order.
train_scores_xg = clf_xg.cv_results_['mean_train_score']
validation_scores_xg = clf_xg.cv_results_['mean_test_score']

# Accuracy is a higher-is-better score, so the best candidate is the one the
# search itself ranked first (best_index_), not the minimum. Indexing both
# arrays with best_index_ keeps the reported scores consistent with best_params_.
best_validation_score_xg = validation_scores_xg[clf_xg.best_index_]
best_train_scores_xg = train_scores_xg[clf_xg.best_index_]
# Backward-compatible alias for the original misspelled variable name.
best_train_scorees_xg = best_train_scores_xg
best_params_xg = clf_xg.best_params_