In [90]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import joblib

In [91]:
df = pd.read_csv('../data/heart_disease.csv')
X = df.drop('target', axis=1)
y = df['target']

In [92]:
lr = joblib.load('../models/logistic_regression_model.pkl')
rf = joblib.load('../models/random_forest_classifier_model.pkl')
svm = joblib.load('../models/svm_model.pkl')
heirarchical = joblib.load('../models/heir_clustering_model.pkl')
kmeans = joblib.load('../models/kmeans_model.pkl')
dt = joblib.load('../models/decision_tree_classifier_model.pkl')

In [93]:
param_grid = {
    'RandomForestClassifier': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    },
    'LogisticRegression': {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear', 'saga'],
        'penalty': ['l1', 'l2']
    },
    'HeirarchicalClustering': {
        'n_clusters': [2, 3, 4, 5],
        'linkage': ['ward', 'complete', 'average', 'single']
    },
    'Kmeans': {
        'n_clusters': [2, 3, 4, 5],
        'init': ['k-means++', 'random'],
        'n_init': [10, 20, 30]
    },
    'DecisionTreeClassifier': {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'criterion': ['gini', 'entropy']
    },
    'SVC': {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'gamma': ['scale', 'auto']
    }
}

In [94]:
search_lr = GridSearchCV(lr, param_grid['LogisticRegression'], cv=5, n_jobs=-1, scoring='accuracy')
search_lr.fit(X, y)
print('Logistic Regression:')
print(f'Best parameters: {search_lr.best_params_}')
print(f'Best score: {search_lr.best_score_}')

Logistic Regression:
Best parameters: {'C': 10, 'penalty': 'l1', 'solver': 'saga'}
Best score: 0.6037704918032787


In [95]:
search_dt = GridSearchCV(dt, param_grid['DecisionTreeClassifier'], cv=5, n_jobs=-1, scoring='accuracy')
search_dt.fit(X, y)
print('Decision Tree Classifier:')
print(f'Best parameters: {search_dt.best_params_}')
print(f'Best score: {search_dt.best_score_}')

Decision Tree Classifier:
Best parameters: {'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 10}
Best score: 0.5213114754098361


In [96]:
search_rf = GridSearchCV(rf, param_grid['RandomForestClassifier'], cv=5, n_jobs=-1, scoring='accuracy')
search_rf.fit(X, y)
print('Random Forest Classifier:')
print(f'Best parameters: {search_rf.best_params_}')
print(f'Best score: {search_rf.best_score_}')

Random Forest Classifier:
Best parameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}
Best score: 0.6004371584699453


In [97]:
search_svm = GridSearchCV(svm, param_grid['SVC'], cv=5, n_jobs=-1, scoring='accuracy')
search_svm.fit(X, y)
print('SVM:')
print(f'Best parameters: {search_svm.best_params_}')
print(f'Best score: {search_svm.best_score_}')

SVM:
Best parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'poly'}
Best score: 0.6006557377049181


In [98]:
search_hc = GridSearchCV(heirarchical, param_grid['HeirarchicalClustering'], cv=5, n_jobs=-1, scoring='accuracy')
search_hc.fit(X)
print('Heirarchical Clustering:')
print(f'Best parameters: {search_hc.best_params_}')

Heirarchical Clustering:
Best parameters: {'linkage': 'ward', 'n_clusters': 2}




In [99]:
search_kmeans = GridSearchCV(kmeans, param_grid['Kmeans'], cv=5, n_jobs=-1, scoring='accuracy')
search_kmeans.fit(X)
print('KMeans:')
print(f'Best parameters: {search_kmeans.best_params_}')

KMeans:
Best parameters: {'init': 'k-means++', 'n_clusters': 2, 'n_init': 10}


 nan nan nan nan nan nan]


In [100]:
best_model = search_svm.best_estimator_
joblib.dump(best_model, '../models/best_model.pkl')

['../models/best_model.pkl']