In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

In [2]:
df = pd.read_csv('../data/heart_disease3.csv')
df.drop(df.columns[[0]], inplace=True, axis=1)
X = df.iloc[:,0:5]
y = df.iloc[:,5]

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

param_grid_lr = [{
    'tol' : [0.0001,0.001,0.01,0.1],
    'C' : [1.0, 2.0, 3.0],
    'max_iter' : [100, 1000, 10000]
}]

grid_search_lr = GridSearchCV(lr, param_grid_lr, cv=2, scoring='accuracy', return_train_score=True, n_jobs=-1)
grid_search_lr.fit(X_train, y_train)

In [5]:
with open('../results/evaluation_metrics.txt','a') as f:
    f.write('\t\t\tGrid Search CV : Logistic Regression Best Score\n\n')
    f.write(str(grid_search_lr.best_score_) + '\n')

grid_search_lr.best_score_

0.8100697906281156

In [6]:
with open('../results/evaluation_metrics.txt','a') as f:
    f.write('\t\t\tGrid Search CV : Logistic Regression Best Paremeters\n\n')
    f.write(str(grid_search_lr.best_params_) + '\n')

grid_search_lr.best_params_

{'C': 2.0, 'max_iter': 100, 'tol': 0.01}

In [7]:
random_grid_search_lr = RandomizedSearchCV(lr, param_grid_lr, cv=5, scoring='accuracy', return_train_score=True, n_jobs=-1, random_state=42, n_iter=10)
random_grid_search_lr.fit(X_train, y_train)

In [8]:
with open('../results/evaluation_metrics.txt','a') as f:
    f.write('\t\t\tRandomized Search CV : Logistic Regression Best Score\n\n')
    f.write(str(random_grid_search_lr.best_score_) + '\n')

random_grid_search_lr.best_score_

0.8101950354609929

In [9]:
with open('../results/evaluation_metrics.txt','a') as f:
    f.write('\t\t\tRandomized Search CV : Logistic Regression Best Paremeters\n\n')
    f.write(str(random_grid_search_lr.best_params_) + '\n')

random_grid_search_lr.best_params_

{'tol': 0.001, 'max_iter': 100, 'C': 2.0}

In [10]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()

param_grid_dtc = [{
    'criterion' : ['gini', 'entropy'],
    'splitter' : ['best', 'random'],
    'max_depth' : [10, 20, 30],
    'min_samples_split' : [2, 4, 6],
    'min_samples_leaf' : [1, 2, 4]
}]

grid_search_dtc = GridSearchCV(dtc, param_grid_dtc, cv=2, scoring='accuracy', return_train_score=True, n_jobs=-1)
grid_search_dtc.fit(X_train, y_train)

In [11]:
with open('../results/evaluation_metrics.txt','a') as f:
    f.write('\t\t\tGrid Search CV : Decision Tree Best Score\n\n')
    f.write(str(grid_search_dtc.best_score_) + '\n')

grid_search_dtc.best_score_

0.8143426862270332

In [12]:
with open('../results/evaluation_metrics.txt','a') as f:
    f.write('\t\t\tGrid Search CV : Decision Tree Best Parameters\n\n')
    f.write(str(grid_search_dtc.best_params_) + '\n')

grid_search_dtc.best_params_

{'criterion': 'entropy',
 'max_depth': 10,
 'min_samples_leaf': 4,
 'min_samples_split': 4,
 'splitter': 'random'}

In [13]:
random_grid_search_dtc = RandomizedSearchCV(dtc, param_grid_dtc, cv=5, scoring='accuracy', return_train_score=True, n_jobs=-1, random_state=42, n_iter=10)
random_grid_search_dtc.fit(X_train, y_train)

In [14]:
with open('../results/evaluation_metrics.txt','a') as f:
    f.write('\t\t\tRandomized Search CV : Decision Tree Best Score\n\n')
    f.write(str(random_grid_search_dtc.best_score_) + '\n')

random_grid_search_lr.best_score_

0.8101950354609929

In [15]:
with open('../results/evaluation_metrics.txt','a') as f:
    f.write('\t\t\tRandomized Search CV : Decision Tree Best Parameters\n\n')
    f.write(str(random_grid_search_dtc.best_params_) + '\n')

random_grid_search_dtc.best_params_

{'splitter': 'random',
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_depth': 10,
 'criterion': 'entropy'}

In [16]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()

param_grid_rfc = [{
    'n_estimators': [500, 1000, 1500],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [1, 2, 4]
}]

grid_search_rfc = GridSearchCV(rfc, param_grid_rfc, cv=2, scoring='accuracy', return_train_score=True, n_jobs=-1)
grid_search_rfc.fit(X_train, y_train)

In [17]:
with open('../results/evaluation_metrics.txt','a') as f:
    f.write('\t\t\tGrid Search CV : Random Forest Best Score\n\n')
    f.write(str(grid_search_rfc.best_score_) + '\n')

grid_search_rfc.best_score_

0.8101410055547643

In [18]:
with open('../results/evaluation_metrics.txt','a') as f:
    f.write('\t\t\tGrid Search CV : Random Forest Best Parameters\n\n')
    f.write(str(grid_search_rfc.best_params_) + '\n')

grid_search_rfc.best_params_

{'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 500}

In [19]:
random_param_grid_rfc = [{
    'n_estimators': [500, 1000, 1500],
    'criterion': ['entropy', 'gini'],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [1, 2, 4]
}]

random_grid_search_rfc = RandomizedSearchCV(rfc, random_param_grid_rfc, cv=5, scoring='accuracy', return_train_score=True, n_jobs=-1, random_state=42, n_iter=10)
random_grid_search_rfc.fit(X_train, y_train)

In [20]:
with open('../results/evaluation_metrics.txt','a') as f:
    f.write('\t\t\tRandomized Search CV : Random Forest Best Score\n\n')
    f.write(str(random_grid_search_rfc.best_score_) + '\n')

random_grid_search_rfc.best_score_

0.81427304964539

In [21]:
with open('../results/evaluation_metrics.txt','a') as f:
    f.write('\t\t\tRandomized Search CV : Random Forest Best Parameters\n\n')
    f.write(str(random_grid_search_rfc.best_params_) + '\n')

random_grid_search_rfc.best_params_

{'n_estimators': 1000,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'criterion': 'gini'}

In [22]:
from sklearn.svm import SVC

svm = SVC()

param_grid_svm = [{
    'C' : [1.0, 2.0, 3.0],
    'kernel' : ['rbf', 'sigmoid'],
    'degree' : [3, 6, 9],
    'gamma' : ['scale', 'auto'],
    'coef0' : [0.0,10.0,100.0],
    'tol' : [0.001,0.01,0.1],
    'cache_size' : [200,400,800]
}]

grid_search_svm = GridSearchCV(svm,param_grid_svm,cv=2,scoring='accuracy', return_train_score=True, n_jobs=-1)
grid_search_svm.fit(X_train, y_train)

In [23]:
with open('../results/evaluation_metrics.txt','a') as f:
    f.write('\t\t\tGrid Search CV : Support Vector Machines Best Score\n\n')
    f.write(str(grid_search_svm.best_score_) + '\n')

grid_search_svm.best_score_

0.8101053980914399

In [24]:
with open('../results/evaluation_metrics.txt','a') as f:
    f.write('\t\t\tGrid Search CV : Support Vector Machines Best Parameters\n\n')
    f.write(str(grid_search_svm.best_params_) + '\n')

grid_search_svm.best_params_

{'C': 2.0,
 'cache_size': 200,
 'coef0': 0.0,
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'tol': 0.001}

In [25]:
random_param_grid_svm = [{
    'C' : [1.0, 2.0, 3.0],
    'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree' : [3, 6, 9],
    'gamma' : ['scale', 'auto'],
    'coef0' : [0.0,10.0,100.0],
    'tol' : [0.001,0.01,0.1],
    'cache_size' : [200,400,800]
}]

random_grid_search_svm = RandomizedSearchCV(svm,random_param_grid_svm,cv=5,scoring='accuracy', return_train_score=True, n_jobs=-1,random_state=42,n_iter=10)
random_grid_search_svm.fit(X_train, y_train)

In [26]:
with open('../results/evaluation_metrics.txt','a') as f:
    f.write('\t\t\tRandomized Search CV : Support Vector Machines Best Score\n\n')
    f.write(str(random_grid_search_svm.best_score_) + '\n')

random_grid_search_svm.best_score_

0.8187056737588654

In [27]:
with open('../results/evaluation_metrics.txt','a') as f:
    f.write('\t\t\tRandomized Search CV : Support Vector Machines Best Parameters\n\n')
    f.write(str(random_grid_search_svm.best_params_) + '\n')

random_grid_search_svm.best_params_

{'tol': 0.001,
 'kernel': 'linear',
 'gamma': 'auto',
 'degree': 6,
 'coef0': 100.0,
 'cache_size': 400,
 'C': 2.0}

In [28]:
df_2 = pd.read_csv('../data/heart_disease.csv')
X_2 = df_2.iloc[:,0:13]
y_2 = df_2.iloc[:,13]

X_2_train, X_2_test, y_2_train, y_2_test = train_test_split(X_2, y_2, test_size=0.2, random_state=42)

In [29]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
from sklearn.feature_selection import RFE

cols = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal']

def drop_nulls(df) :
    df.dropna(inplace=True)

pipeline1 = Pipeline(steps=[
  #  ('Drop Nulls', FunctionTransformer(drop_nulls)),
    ('Scaling', MinMaxScaler()),
    ('Feature Selection', RFE(estimator=RandomForestClassifier(),n_features_to_select=5, step=1)),
])

col_transformer = ColumnTransformer(transformers=[('Pipeline',pipeline1,cols)], remainder='drop', n_jobs=-1)

pipeline_lr = make_pipeline(col_transformer, random_grid_search_lr.best_estimator_)
pipeline_dtc = make_pipeline(col_transformer, grid_search_dtc.best_estimator_)
pipeline_rfc = make_pipeline(col_transformer, random_grid_search_rfc.best_estimator_)
pipeline_svm = make_pipeline(col_transformer, random_grid_search_svm.best_estimator_)

In [30]:
pipeline_lr.fit(X_2_train, y_2_train)

In [31]:
pipeline_dtc.fit(X_2_train, y_2_train)

In [32]:
pipeline_rfc.fit(X_2_train, y_2_train)

In [33]:
pipeline_svm.fit(X_2_train, y_2_train)

In [34]:
import joblib

models = {
    'logistec_randomized' : pipeline_lr,
    'decision_tree_grid' : pipeline_dtc,
    'random_forest_randomized' : pipeline_rfc,
    'support_vectors_randomized' : pipeline_svm
}

with open('../models/final_model.joblib', 'wb') as joblib_file : 
    joblib.dump(models, joblib_file)