In [1]:
import random
import warnings

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectPercentile, f_classif, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from utils import *

warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [2]:
def ParameterTuning(x, y, clf, params, fold=5, repeat=100, random_state=None):
    """Grid-search a ``SelectPercentile`` + classifier pipeline with stratified k-fold CV.

    The classifier grid in ``params`` is combined with a feature-selection grid
    (percentile 10..100 and the univariate scoring function). Accuracy,
    precision, recall and F1 are recorded for every candidate; the candidate
    with the best mean accuracy is refitted (``refit='accuracy'``).

    Args:
        x: Training feature matrix.
        y: Training labels. NOTE(review): the 'precision'/'recall'/'f1'
            scorers presumably require a binary target -- confirm before
            reusing this on a multi-class dataset.
        clf: Unfitted classifier, placed in the pipeline under the step name
            ``'clf'`` (so its grid keys must be prefixed with ``'clf__'``).
        params: Classifier parameter grid. It is copied, never modified.
        fold: Number of stratified folds.
        repeat: Currently unused; kept so existing callers keep working (the
            repeated k-fold variant is commented out below).
        random_state: Seed for shuffling the folds. ``None`` draws a random
            seed, so the folds differ from run to run.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    pipeline = Pipeline([('selector', SelectPercentile()), ('clf', clf)])

    # Extend a copy: writing the selector keys into ``params`` itself would
    # silently change the dict owned by the caller.
    grid = dict(params)
    grid['selector__percentile'] = list(range(10, 101, 10))

    # chi2 rejects negative feature values. On standardised data every chi2
    # candidate would therefore fail and only waste fits, so it is offered
    # only when ``x`` contains no negative entries.
    try:
        has_negative = bool((np.asarray(x, dtype=float) < 0).any())
    except (TypeError, ValueError):
        has_negative = False  # x cannot be inspected; keep the full grid
    grid['selector__score_func'] = [f_classif] if has_negative else [f_classif, chi2]

    if random_state is None:
        random_state = random.randint(1, 100000)
    #rskf = RepeatedStratifiedKFold(n_splits=fold, n_repeats=repeat, random_state=random.randint(1, 1000000))
    skf = StratifiedKFold(n_splits=fold, shuffle=True, random_state=random_state)
    scoring = ['accuracy', 'precision', 'recall', 'f1']

    results = GridSearchCV(pipeline, grid, scoring=scoring, refit='accuracy', cv=skf)
    results.fit(x, y)

    return results

In [3]:
def Logistic_Regression(X, y, test_size=0.2, random_state=None):
    """Tune and evaluate an L1/L2-regularised logistic regression on a hold-out split.

    The data is split first and the ``MinMaxScaler`` is fitted on the training
    rows only. Fitting the scaler on the full dataset (as before) lets the
    hold-out rows influence the scaling and inflates the reported test score.

    Args:
        X: Feature matrix accepted by ``train_test_split``.
        y: Labels aligned with the rows of ``X``.
        test_size: Hold-out size forwarded to ``train_test_split``.
        random_state: Seed for the split. ``None`` draws a random seed, so the
            split (and the reported accuracies) vary between runs.

    Returns:
        ``(res, acc_train, acc_test)``: the object returned by
        ``ParameterTuning`` plus its accuracy on the training and hold-out rows.
    """
    if random_state is None:
        random_state = random.randint(1, 100000)

    clf = LogisticRegression(solver='saga', max_iter=10000)
    params = {'clf__penalty': ['l2', 'l1'], 'clf__C': [0.01, 0.1, 1, 10]}

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # Scaling statistics come from the training rows only.
    scaler = MinMaxScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    res = ParameterTuning(x_train, y_train, clf, params)
    acc_test = accuracy_score(y_test, res.predict(x_test))
    acc_train = accuracy_score(y_train, res.predict(x_train))

    return res, acc_train, acc_test

In [4]:
def NaiveBayes(X, y, test_size=0.2, random_state=None):
    """Tune (feature selection only) and evaluate Gaussian naive Bayes on a hold-out split.

    The data is split first and the ``MinMaxScaler`` is fitted on the training
    rows only. Fitting the scaler on the full dataset (as before) lets the
    hold-out rows influence the scaling and inflates the reported test score.

    Args:
        X: Feature matrix accepted by ``train_test_split``.
        y: Labels aligned with the rows of ``X``.
        test_size: Hold-out size forwarded to ``train_test_split``.
        random_state: Seed for the split. ``None`` draws a random seed, so the
            split (and the reported accuracies) vary between runs.

    Returns:
        ``(res, acc_train, acc_test)``: the object returned by
        ``ParameterTuning`` plus its accuracy on the training and hold-out rows.
    """
    if random_state is None:
        random_state = random.randint(1, 100000)

    clf = GaussianNB()
    # No classifier hyper-parameters are searched; an empty grid is passed on.
    params = {}

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # Scaling statistics come from the training rows only.
    scaler = MinMaxScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    res = ParameterTuning(x_train, y_train, clf, params)
    acc_test = accuracy_score(y_test, res.predict(x_test))
    acc_train = accuracy_score(y_train, res.predict(x_train))

    return res, acc_train, acc_test

In [5]:
def KNN(X, y, test_size=0.2, random_state=None):
    """Tune and evaluate a k-nearest-neighbours classifier on a hold-out split.

    The data is split first and the ``MinMaxScaler`` is fitted on the training
    rows only. Fitting the scaler on the full dataset (as before) lets the
    hold-out rows influence the scaling and inflates the reported test score.

    Args:
        X: Feature matrix accepted by ``train_test_split``.
        y: Labels aligned with the rows of ``X``.
        test_size: Hold-out size forwarded to ``train_test_split``.
        random_state: Seed for the split. ``None`` draws a random seed, so the
            split (and the reported accuracies) vary between runs.

    Returns:
        ``(res, acc_train, acc_test)``: the object returned by
        ``ParameterTuning`` plus its accuracy on the training and hold-out rows.
    """
    if random_state is None:
        random_state = random.randint(1, 100000)

    clf = KNeighborsClassifier()
    params = {'clf__n_neighbors': [5, 10, 13, 20], 'clf__weights': ['uniform', 'distance'], 'clf__p': [1, 2]}

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # Scaling statistics come from the training rows only.
    scaler = MinMaxScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    res = ParameterTuning(x_train, y_train, clf, params)
    acc_test = accuracy_score(y_test, res.predict(x_test))
    acc_train = accuracy_score(y_train, res.predict(x_train))

    return res, acc_train, acc_test

In [6]:
def SVM(X, y, test_size=0.2, random_state=None):
    """Tune and evaluate a support vector classifier on a hold-out split.

    The data is split first and the ``StandardScaler`` is fitted on the
    training rows only. Fitting the scaler on the full dataset (as before) lets
    the hold-out rows influence the scaling and inflates the reported test score.

    Args:
        X: Feature matrix accepted by ``train_test_split``.
        y: Labels aligned with the rows of ``X``.
        test_size: Hold-out size forwarded to ``train_test_split``.
        random_state: Seed for the split. ``None`` draws a random seed, so the
            split (and the reported accuracies) vary between runs.

    Returns:
        ``(res, acc_train, acc_test)``: the object returned by
        ``ParameterTuning`` plus its accuracy on the training and hold-out rows.
    """
    if random_state is None:
        random_state = random.randint(1, 100000)

    clf = SVC()
    # NOTE(review): 'degree' only matters for the 'poly' kernel, so the
    # 'linear' and 'rbf' candidates are presumably re-evaluated once per degree
    # value with identical settings. Kept as-is to preserve the reported grid.
    params = {'clf__C': [0.1, 1, 5, 10], 'clf__kernel': ['linear', 'poly', 'rbf'],
              'clf__degree': [3, 4, 5, 6], 'clf__gamma': ['scale', 'auto']}

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # Scaling statistics come from the training rows only.
    scaler = StandardScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    res = ParameterTuning(x_train, y_train, clf, params)
    acc_test = accuracy_score(y_test, res.predict(x_test))
    acc_train = accuracy_score(y_train, res.predict(x_train))

    return res, acc_train, acc_test

In [7]:
def DTree(X, y, test_size=0.2, random_state=None):
    """Tune and evaluate a decision tree classifier on a hold-out split.

    The data is split first and the ``StandardScaler`` is fitted on the
    training rows only. Fitting the scaler on the full dataset (as before) lets
    the hold-out rows influence the scaling and inflates the reported test score.

    Args:
        X: Feature matrix accepted by ``train_test_split``.
        y: Labels aligned with the rows of ``X``.
        test_size: Hold-out size forwarded to ``train_test_split``.
        random_state: Seed for the split. ``None`` draws a random seed, so the
            split (and the reported accuracies) vary between runs.

    Returns:
        ``(res, acc_train, acc_test)``: the object returned by
        ``ParameterTuning`` plus its accuracy on the training and hold-out rows.
    """
    if random_state is None:
        random_state = random.randint(1, 100000)

    clf = DecisionTreeClassifier()
    # 'sqrt' replaces the former 'auto': for a classification tree both mean
    # sqrt(n_features), but 'auto' is rejected by newer scikit-learn releases,
    # which would make every such candidate fail.
    # NOTE(review): the integer max_features values presumably exceed the
    # number of features left after low-percentile selection, in which case
    # those candidates fail to fit -- confirm against the dataset width.
    params = {'clf__criterion': ['gini', 'entropy'], 'clf__max_depth': [None, 9, 8, 7, 6, 5],
              'clf__min_samples_leaf': [1, 5, 10, 20], 'clf__max_features': [None, 'sqrt', 9, 8, 7, 6, 5],
              'clf__ccp_alpha': [0, 0.01, 0.05, 0.1, 0.15, 0.2, 0.5]}

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # Scaling statistics come from the training rows only.
    scaler = StandardScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    res = ParameterTuning(x_train, y_train, clf, params)
    acc_test = accuracy_score(y_test, res.predict(x_test))
    acc_train = accuracy_score(y_train, res.predict(x_train))

    return res, acc_train, acc_test

In [8]:
# def RForest(X, y):
    
#     clf = RandomForestClassifier()
#     params = {'clf__n_estimators': [10, 25, 50, 100], 'clf__criterion': ['gini', 'entropy'], 
#               'clf__max_depth': [None, 9, 8, 7, 6, 5], 'clf__min_samples_leaf': [1, 5, 10, 20], 
#               'clf__max_features': ['auto', 9, 8, 7, 6, 5]}
#     x = StandardScaler().fit_transform(X)
#     x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=random.randint(1, 100000))
#     res = ParameterTuning(x_train, y_train, clf, params)
#     y_pred_test = res.predict(x_test)
#     y_pred_train = res.predict(x_train)
#     acc_test = accuracy_score(y_test, y_pred_test)
#     acc_train = accuracy_score(y_train, y_pred_train)
    
#     return res, acc_train, acc_test

In [9]:
# def ANN(X, y):
    
#     clf = MLPClassifier(max_iter=10000)
#     params = {'clf__hidden_layer_sizes': [(100,), (10, 10), (10, 5, 2)], 
#               'clf__activation': ['identity', 'logistic', 'tanh', 'relu'], 'clf__solver': ['lbfgs', 'sgd', 'adam'], 
#               'clf__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1], 
#               'clf__learning_rate': ['constant', 'invscaling', 'adaptive'], 'clf__early_stopping': [True, False]}
#     x = MinMaxScaler().fit_transform(X)
#     x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=random.randint(1, 100000))
#     res = ParameterTuning(x_train, y_train, clf, params)
#     y_pred_test = res.predict(x_test)
#     y_pred_train = res.predict(x_train)
#     acc_test = accuracy_score(y_test, y_pred_test)
#     acc_train = accuracy_score(y_train, y_pred_train)
    
#     return res, acc_train, acc_test

In [10]:
def RForest(X, y, test_size=0.2, random_state=None):
    """Tune and evaluate a random forest classifier on a hold-out split.

    The data is split first and the ``StandardScaler`` is fitted on the
    training rows only. Fitting the scaler on the full dataset (as before) lets
    the hold-out rows influence the scaling and inflates the reported test score.

    Args:
        X: Feature matrix accepted by ``train_test_split``.
        y: Labels aligned with the rows of ``X``.
        test_size: Hold-out size forwarded to ``train_test_split``.
        random_state: Seed for the split. ``None`` draws a random seed, so the
            split (and the reported accuracies) vary between runs.

    Returns:
        ``(res, acc_train, acc_test)``: the object returned by
        ``ParameterTuning`` plus its accuracy on the training and hold-out rows.
    """
    if random_state is None:
        random_state = random.randint(1, 100000)

    clf = RandomForestClassifier()
    params = {'clf__n_estimators': [10, 25, 50, 100], 'clf__criterion': ['gini', 'entropy'],
              'clf__max_depth': [None, 9, 8, 7, 6, 5]}

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # Scaling statistics come from the training rows only.
    scaler = StandardScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    res = ParameterTuning(x_train, y_train, clf, params)
    acc_test = accuracy_score(y_test, res.predict(x_test))
    acc_train = accuracy_score(y_train, res.predict(x_train))

    return res, acc_train, acc_test

In [11]:
def ANN(X, y, test_size=0.2, random_state=None):
    """Tune and evaluate a multi-layer perceptron classifier on a hold-out split.

    The data is split first and the ``MinMaxScaler`` is fitted on the training
    rows only. Fitting the scaler on the full dataset (as before) lets the
    hold-out rows influence the scaling and inflates the reported test score.

    Args:
        X: Feature matrix accepted by ``train_test_split``.
        y: Labels aligned with the rows of ``X``.
        test_size: Hold-out size forwarded to ``train_test_split``.
        random_state: Seed for the split. ``None`` draws a random seed, so the
            split (and the reported accuracies) vary between runs.

    Returns:
        ``(res, acc_train, acc_test)``: the object returned by
        ``ParameterTuning`` plus its accuracy on the training and hold-out rows.
    """
    if random_state is None:
        random_state = random.randint(1, 100000)

    clf = MLPClassifier(max_iter=10000)
    # Activation and solver are pinned to one value each; only the layer
    # layout, the alpha penalty and early stopping are actually searched.
    params = {'clf__hidden_layer_sizes': [(100,), (10, 10), (10, 5, 2)],
              'clf__activation': ['relu'], 'clf__solver': ['adam'],
              'clf__alpha': [0.0001, 0.001, 0.01], 'clf__early_stopping': [True, False]}

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # Scaling statistics come from the training rows only.
    scaler = MinMaxScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    res = ParameterTuning(x_train, y_train, clf, params)
    acc_test = accuracy_score(y_test, res.predict(x_test))
    acc_train = accuracy_score(y_train, res.predict(x_train))

    return res, acc_train, acc_test

In [12]:
# Load features and labels. load_dataset is not defined in this notebook;
# presumably it comes from `from utils import *` -- verify.
x, y = load_dataset("heart_processed_noC.csv")

# Tune and evaluate each classifier in turn. Every call draws its own random
# train/test split and returns (fitted search, train accuracy, test accuracy),
# so the models are not compared on the same hold-out rows.
lr_res, lr_acc_tr, lr_acc_te = Logistic_Regression(x, y)
print('Logistic Regression Done!')

nb_res, nb_acc_tr, nb_acc_te = NaiveBayes(x, y)
print('Naive Bayes Done!')

knn_res, knn_acc_tr, knn_acc_te = KNN(x, y)
print('KNN Done!')

svm_res, svm_acc_tr, svm_acc_te = SVM(x, y)
print('SVM Done!')

dtree_res, dtree_acc_tr, dtree_acc_te = DTree(x, y)
print('Decision Tree Done!')

Logistic Regression Done!
Naive Bayes Done!
KNN Done!
SVM Done!
Decision Tree Done!


In [13]:
# Train / hold-out accuracy of each tuned model, as percentages rounded to
# two decimals. One table row per model, printed in a fixed order.
accuracy_report = [
    ('Logistic Regression:', lr_acc_tr, lr_acc_te),
    ('Naive Bayes:', nb_acc_tr, nb_acc_te),
    ('k-Nearest Neighbours:', knn_acc_tr, knn_acc_te),
    ('Support Vector Machines:', svm_acc_tr, svm_acc_te),
    ('Decision Tree:', dtree_acc_tr, dtree_acc_te),
]
for heading, train_acc, test_acc in accuracy_report:
    print(heading)
    print('Train: {}%    Test: {}%'.format(round(train_acc*100, 2), round(test_acc*100, 2)))
    print(' ')

Logistic Regression:
Train: 84.47%    Test: 87.5%
 
Naive Bayes:
Train: 85.01%    Test: 84.24%
 
k-Nearest Neighbours:
Train: 89.1%    Test: 85.33%
 
Support Vector Machines:
Train: 89.37%    Test: 88.04%
 
Decision Tree:
Train: 85.69%    Test: 84.78%
 


In [21]:
# Winning grid-search configuration and its best cross-validation score
# (as a percentage) for each model, printed in a fixed order.
search_report = [
    ('Logistic Regression:', lr_res),
    ('Naive Bayes:', nb_res),
    ('k-Nearest Neighbours:', knn_res),
    ('Support Vector Machines:', svm_res),
    ('Decision Tree:', dtree_res),
]
for heading, search in search_report:
    print(heading)
    print('Best Params: {}'.format(search.best_params_))
    print('Best Score: {}'.format(round(search.best_score_*100, 2)))
    print(' ')

Logistic Regression:
Best Params: {'clf__C': 10, 'clf__penalty': 'l2', 'selector__percentile': 60, 'selector__score_func': <function f_classif at 0x7fe19a8d38c0>}
Best Score: 84.46
 
Naive Bayes:
Best Params: {'selector__percentile': 70, 'selector__score_func': <function f_classif at 0x7fe19a8d38c0>}
Best Score: 84.74
 
k-Nearest Neighbours:
Best Params: {'clf__n_neighbors': 5, 'clf__p': 1, 'clf__weights': 'uniform', 'selector__percentile': 100, 'selector__score_func': <function f_classif at 0x7fe19a8d38c0>}
Best Score: 85.97
 
Support Vector Machines:
Best Params: {'clf__C': 1, 'clf__degree': 3, 'clf__gamma': 'scale', 'clf__kernel': 'rbf', 'selector__percentile': 90, 'selector__score_func': <function f_classif at 0x7fe19a8d38c0>}
Best Score: 86.37
 
Decision Tree:
Best Params: {'clf__ccp_alpha': 0.01, 'clf__criterion': 'gini', 'clf__max_depth': 5, 'clf__max_features': 5, 'clf__min_samples_leaf': 1, 'selector__percentile': 90, 'selector__score_func': <function f_classif at 0x7fe19a8d38

In [15]:
# Reload the dataset so this cell can run without the earlier model cells.
# load_dataset is not defined in this notebook; presumably it comes from
# `from utils import *` -- verify.
x, y = load_dataset("heart_processed_noC.csv")

# Tune and evaluate the MLP; ANN draws its own random train/test split.
ann_res, ann_acc_tr, ann_acc_te = ANN(x, y)
print('ANN Done!')

ANN Done!


In [16]:
# Train / hold-out accuracy of the tuned MLP, as percentages rounded to two
# decimals. The heading previously misspelled "Artificial".
print('Artificial Neural Network:')
print('Train: {}%    Test: {}%'.format(round(ann_acc_tr*100, 2), round(ann_acc_te*100, 2)))
print(' ')

Artifical Neural Network:
Train: 87.19%    Test: 82.07%
 


In [22]:
# Winning grid-search configuration of the MLP and its best cross-validation
# score as a percentage. The heading previously misspelled "Artificial".
print('Artificial Neural Network:')
print('Best Params: {}'.format(ann_res.best_params_))
print('Best Score: {}'.format(round(ann_res.best_score_*100, 2)))
print(' ')

Artifical Neural Network:
Best Params: {'clf__activation': 'relu', 'clf__alpha': 0.001, 'clf__early_stopping': False, 'clf__hidden_layer_sizes': (10, 5, 2), 'clf__solver': 'adam', 'selector__percentile': 80, 'selector__score_func': <function f_classif at 0x7fe19a8d38c0>}
Best Score: 87.06
 


In [18]:
# Tune and evaluate the random forest. This cell does not load data itself:
# it relies on x, y assigned by an earlier cell.
rf_res, rf_acc_tr, rf_acc_te = RForest(x, y)
print('Random Forest Done!')

Random Forest Done!


In [19]:
# Train / hold-out accuracy of the tuned random forest, as percentages
# rounded to two decimals; one print call emits the three lines.
print(
    'Random Forest:',
    'Train: {}%    Test: {}%'.format(round(rf_acc_tr*100, 2), round(rf_acc_te*100, 2)),
    ' ',
    sep='\n',
)

Random Forest:
Train: 98.09%    Test: 84.78%
 


In [23]:
# Winning grid-search configuration of the random forest and its best
# cross-validation score as a percentage; one print call emits the four lines.
print(
    'Random Forest:',
    'Best Params: {}'.format(rf_res.best_params_),
    'Best Score: {}'.format(round(rf_res.best_score_*100, 2)),
    ' ',
    sep='\n',
)

Random Forest:
Best Params: {'clf__criterion': 'gini', 'clf__max_depth': 9, 'clf__n_estimators': 50, 'selector__percentile': 100, 'selector__score_func': <function f_classif at 0x7fe19a8d38c0>}
Best Score: 87.46
 
