In [315]:
# Regular Imports
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Get Data and get it ready for training

In [316]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
dataset = pd.read_csv(url, delimiter=",", header=None)
dataset.columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
dataset.where(dataset != '?', np.nan, inplace=True)
dataset['target'] = dataset['target'].map({0: 0, 1: 1, 2: 1, 3: 1, 4: 1})
null_columns = dataset.isna().sum()
for col_name, num_null_values in null_columns.iteritems():
  if (num_null_values != 0):
    dataset[col_name].fillna(dataset[col_name].mode()[0], inplace=True)
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

MLP Classifier with 10 hidden layers, learning rate 0.1 and 10000 iteration

In [317]:
from sklearn.neural_network import MLPClassifier
mlpc = MLPClassifier(hidden_layer_sizes=(10,), max_iter=10000, learning_rate_init=0.1)
pred_mlpc = mlpc.fit(X_train, y_train).predict(X_test)

Random Forest Classifier

In [318]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
pred_rfc = rfc.fit(X_train, y_train).predict(X_test)

Naive Bayes

In [319]:
from sklearn.naive_bayes import GaussianNB
nbc = GaussianNB()
pred_nbc = nbc.fit(X_train, y_train).predict(X_test)

Decison Tree

In [320]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
pred_dtc = dtc.fit(X_train, y_train).predict(X_test)

SVM Linear Kernel

In [321]:
from sklearn.svm import SVC
svc_l = SVC(kernel="linear", C=10, gamma=0.1)
pred_svc_l = svc_l.fit(X_train, y_train).predict(X_test)

SVM RBF Kernel

In [322]:
from sklearn.svm import SVC
svc_r = SVC(kernel="rbf", C=10, gamma=0.1)
pred_svc_r = svc_r.fit(X_train, y_train).predict(X_test)

Fuzzy kNN

In [323]:
import operator
from sklearn.base import BaseEstimator, ClassifierMixin
class FuzzyKNN(BaseEstimator, ClassifierMixin):
  def __init__(self, k=3, plot=False):
    self.k = k
    self.plot = plot
  def fit(self, X, y=None):
    self._check_params(X,y)
    self.X = X
    self.y = y
    self.xdim = len(self.X[0])
    self.n = len(y)
    classes = list(set(y))
    classes.sort()
    self.classes = classes
    self.df = pd.DataFrame(self.X)
    self.df['y'] = self.y
    self.memberships = self._compute_memberships()
    self.df['membership'] = self.memberships
    self.fitted_ = True
    return self
  def predict(self, X):
    if self.fitted_ == None:
      raise Exception('predict() called before fit()')
    else:
      m = 2
      y_pred = []
      for x in X:
        neighbors = self._find_k_nearest_neighbors(pd.DataFrame.copy(self.df), x)
        votes = {}
        for c in self.classes:
          den = 0
          for n in range(self.k):
            dist = np.linalg.norm(x - neighbors.iloc[n,0:self.xdim])
            den += 1 / (dist ** (2 / (m-1)))
          neighbors_votes = []
          for n in range(self.k):
            dist = np.linalg.norm(x - neighbors.iloc[n,0:self.xdim])
            num = (neighbors.iloc[n].membership[c]) / (dist ** (2 / (m-1)))
            vote = num/den
            neighbors_votes.append(vote)
          votes[c] = np.sum(neighbors_votes)
        pred = max(votes.items(), key=operator.itemgetter(1))[0]
        # y_pred.append((pred, votes))
        y_pred.append(pred)
      return y_pred
  def score(self, X, y):
    if self.fitted_ == None:
      raise Exception('score() called before fit()')
    else:
      predictions = self.predict(X)
      y_pred = [t[0] for t in predictions]
      confidences = [t[1] for t in predictions]
      return accuracy_score(y_pred=y_pred, y_true=y)
  def _find_k_nearest_neighbors(self, df, x):
    X = df.iloc[:,0:self.xdim].values
    df['distances'] = [np.linalg.norm(X[i] - x) for i in range(self.n)]
    df.sort_values(by='distances', ascending=True, inplace=True)
    neighbors = df.iloc[0:self.k]
    return neighbors
  def _get_counts(self, neighbors):
    groups = neighbors.groupby('y')
    counts = {group[1]['y'].iloc[0]:group[1].count()[0] for group in groups}
    return counts
  def _compute_memberships(self):
    memberships = []
    # for i in range(self.n):
    for i, j in zip(self.X, self.y):
      # x = self.X[i]
      # Y = self.y[i]
      x = i
      Y = j
      neighbors = self._find_k_nearest_neighbors(pd.DataFrame.copy(self.df), x)
      counts = self._get_counts(neighbors)
      membership = dict()
      for c in self.classes:
        try:
          uci = 0.49 * (counts[c] / self.k)
          if c == Y:
            uci += 0.51
          membership[c] = uci
        except:
          membership[c] = 0
      memberships.append(membership)
    return memberships
  def _check_params(self, X, y):
    if type(self.k) != int:
      raise Exception('"k" should have type int')
    if self.k >= len(y):
      raise Exception('"k" should be less than no of feature sets')
    if self.k % 2 == 0:
      raise Exception('"k" should be odd')
    if type(self.plot) != bool:
      raise Exception('"plot" should have type bool')

fKNN = FuzzyKNN(k=19)
pred_fKNN = fKNN.fit(X_train, y_train).predict(X_test)

Gradint Boosting Decision Tree Logistic Regression

In [324]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(
    n_estimators=10,
    max_depth=5,
    min_samples_leaf=2,
).fit(X_train, y_train)

X_Coded = OneHotEncoder().fit_transform(gbc.apply(X)[:, :, 0])
X_train_hot_coded, X_test_hot_coded, y_train, y_test = train_test_split(X_Coded, y, test_size=0.2, random_state=42)
gbclr = LogisticRegression(penalty='l2', random_state=42).fit(X_train_hot_coded, y_train)
pred_gbclr = gbclr.predict(X_test_hot_coded)

gbclr_alt = LogisticRegression(penalty='l2', random_state=42).fit(gbc.apply(X_train)[:, :, 0], y_train)
pred_gbclr_alt = gbclr_alt.predict(gbc.apply(X_test)[:, :, 0])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


HRFLM

In [325]:
from copy import deepcopy
from sklearn.metrics import mean_squared_error
class ModelTree(object):
    def __init__(self, model, max_depth=5, min_samples_leaf=10):
        self.model = model
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.tree = None
    def fit(self, X, y, verbose=False):
        # Settings
        model = self.model
        min_samples_leaf = self.min_samples_leaf
        max_depth = self.max_depth
        if verbose:
            print(" max_depth={}, min_samples_leaf={}...".format(max_depth, min_samples_leaf))
        def _build_tree(X, y):
            global index_node_global
            def _create_node(X, y, depth, container):
                loss_node, model_node = _fit_model(X, y, model)
                node = {"name": "node",
                        "index": container["index_node_global"],
                        "loss": loss_node,
                        "model": model_node,
                        "data": (X, y),
                        "n_samples": len(X),
                        "j_feature": None,
                        "threshold": None,
                        "children": {"left": None, "right": None},
                        "depth": depth}
                container["index_node_global"] += 1
                return node
            # Recursively split node + traverse node until a terminal node is reached
            def _split_traverse_node(node, container):
                # Perform split and collect result
                result = _splitter(node, model, max_depth=max_depth,min_samples_leaf=min_samples_leaf)
                # Return terminal node if split is not advised
                if not result["did_split"]:
                    if verbose:
                        depth_spacing_str = " ".join([" "] * node["depth"])
                        print(" {}*leaf {} @ depth {}: loss={:.6f}, N={}".format(depth_spacing_str, node["index"], node["depth"], node["loss"], result["N"]))
                    return

                # Update node information based on splitting result
                node["j_feature"] = result["j_feature"]
                node["threshold"] = result["threshold"]
                del node["data"]  # delete node stored data

                # Extract splitting results
                (X_left, y_left), (X_right, y_right) = result["data"]
                model_left, model_right = result["models"]

                # Report created node to user
                if verbose:
                    depth_spacing_str = " ".join([" "] * node["depth"])
                    print(" {}node {} @ depth {}: loss={:.6f}, j_feature={}, threshold={:.6f}, N=({},{})".format(depth_spacing_str, node["index"], node["depth"], node["loss"], node["j_feature"], node["threshold"], len(X_left), len(X_right)))

                # Create children nodes
                node["children"]["left"] = _create_node(X_left, y_left, node["depth"]+1, container)
                node["children"]["right"] = _create_node(X_right, y_right, node["depth"]+1, container)
                node["children"]["left"]["model"] = model_left
                node["children"]["right"]["model"] = model_right

                # Split nodes
                _split_traverse_node(node["children"]["left"], container)
                _split_traverse_node(node["children"]["right"], container)

            container = {"index_node_global": 0}  # mutatable container
            root = _create_node(X, y, 0, container)  # depth 0 root node
            _split_traverse_node(root, container)  # split and traverse root node

            return root

        # Construct tree
        self.tree = _build_tree(X, y)
        return self.tree
    # ======================
    # Predict
    # ======================
    def predict(self, X):
        assert self.tree is not None
        def _predict(node, x):
            no_children = node["children"]["left"] is None and \
                          node["children"]["right"] is None
            if no_children:
                y_pred_x = node["model"].predict([x])[0]
                return y_pred_x
            else:
                if x[node["j_feature"]] <= node["threshold"]:  # x[j] < threshold
                    return _predict(node["children"]["left"], x)
                else:  # x[j] > threshold
                    return _predict(node["children"]["right"], x)
        y_pred = np.array([_predict(self.tree, x) for x in X])
        return y_pred
    # ======================
    # Loss
    # ======================
    def loss(self, X, y, y_pred):
        loss = self.model.loss(X, y, y_pred)
        return loss
def _splitter(node, model,max_depth=5, min_samples_leaf=10):
    # Extract data
    X, y = node["data"]
    depth = node["depth"]
    N, d = X.shape

    # Find feature splits that might improve loss
    did_split = False
    loss_best = node["loss"]
    data_best = None
    models_best = None
    j_feature_best = None
    threshold_best = None

    # Perform threshold split search only if node has not hit max depth
    if (depth >= 0) and (depth < max_depth):
        for j_feature in range(d):
            threshold_search = []
            for i in range(N):
                threshold_search.append(X[i, j_feature])
            # Perform threshold split search on j_feature
            for threshold in threshold_search:
                # Split data based on threshold
                (X_left, y_left), (X_right, y_right) = _split_data(j_feature, threshold, X, y)
                N_left, N_right = len(X_left), len(X_right)

                # Splitting conditions
                split_conditions = [N_left >= min_samples_leaf,
                                    N_right >= min_samples_leaf]

                # Do not attempt to split if split conditions not satisfied
                if not all(split_conditions):
                    continue

                # Compute weight loss function
                loss_left, model_left = _fit_model(X_left, y_left, model)
                loss_right, model_right = _fit_model(X_right, y_right, model)
                loss_split = (N_left*loss_left + N_right*loss_right) / N

                # Update best parameters if loss is lower
                if loss_split < loss_best:
                    did_split = True
                    loss_best = loss_split
                    models_best = [model_left, model_right]
                    data_best = [(X_left, y_left), (X_right, y_right)]
                    j_feature_best = j_feature
                    threshold_best = threshold

    # Return the best result
    result = {"did_split": did_split,
              "loss": loss_best,
              "models": models_best,
              "data": data_best,
              "j_feature": j_feature_best,
              "threshold": threshold_best,
              "N": N}

    return result

def _fit_model(X, y, model):
    model_copy = deepcopy(model)  # must deepcopy the model!
    model_copy.fit(X,y)
    y_pred = model_copy.predict(X)
    loss = model_copy.loss(X, y, y_pred)
    assert loss >= 0.0
    return loss, model_copy

def _split_data(j_feature, threshold, X, y):
    idx_left = np.where(X[:, j_feature] <= threshold)[0]
    idx_right = np.delete(np.arange(0, len(X)), idx_left)
    assert len(idx_left) + len(idx_right) == len(X)
    idx_left_bool = [False for i in X]
    idx_right_bool = [False for i in X]
    for i in idx_left:
      idx_left_bool[i] = True
    for i in idx_right:
      idx_right_bool[i] = True
    return (X[idx_left_bool], y[idx_left_bool]), (X[idx_right_bool], y[idx_right_bool])

class logistic_regr:

    def __init__(self):
        from sklearn.linear_model import LogisticRegression
        self.model = LogisticRegression(penalty="l2",solver='liblinear')
        self.flag = False
        self.flag_y_pred = None

    def fit(self, X, y):
        y_unique = list(set(y))
        if len(y_unique) == 1:
            self.flag = True
            self.flag_y_pred = y_unique[0]
        else:
            self.model.fit(X, y)

    def predict(self, X):
        if self.flag:
            return self.flag_y_pred * np.ones((len(X),), dtype=int)
        else:
            return self.model.predict(X)

    def loss(self, X, y, y_pred):
        return mean_squared_error(y, y_pred)

    def predict_proba(self,X):
        return self.model.predict_proba(X)


#No of decisioin Trees
esitmators=5
pred_HRFLM=[]
n_train_split=int(len(X_train)/esitmators)
inital_train=0
final_train=0
yy_pred=[]
classifier=None
for i in range(1,esitmators+1):
    classifier =logistic_regr()
    final_train=i*n_train_split
    temp_X_train=X_train[inital_train:final_train]
    temp_y_train=y_train[inital_train:final_train]
    L=ModelTree(classifier,max_depth=20, min_samples_leaf=10)
    node=L.fit(temp_X_train,temp_y_train,verbose=False)
    classifier=node["model"]
    y_pred_temp=L.predict(X_test)
    yy_pred.append(y_pred_temp)
for j in range(len(yy_pred[0])):
    curr=[]
    for i in range(len(yy_pred)):
        curr.append(yy_pred[i][j])
    a=curr.count(0)
    b=curr.count(1)
    if a>b:
        pred_HRFLM.append(0)
    else:
        pred_HRFLM.append(1)


SVM + Random Forest

In [326]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
sfm = SelectFromModel(estimator=RandomForestClassifier()).fit(X_train, y_train)
X_train_new = sfm.transform(X_train)
X_test_new = sfm.transform(X_test)
svcrf_r = SVC(kernel="rbf", C=10, gamma=0.1)
pred_svcrf_r = svcrf_r.fit(X_train_new, y_train).predict(X_test_new)

svcrf_l = SVC(kernel="linear")
pred_svcrf_l = svcrf_l.fit(X_train_new, y_train).predict(X_test_new)

CNN

In [327]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Input, Conv1D, MaxPooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
X_train_reshaped = X_train.reshape(len(X_train), len(X_train[0]), 1)
X_test_reshaped = X_test.reshape(len(X_test), len(X_test[0]), 1)
model = Sequential()
model.add(Input(shape=(13, 1)))
model.add(Dense(units=128, activation='relu'))
model.add(Conv1D(filters=64, kernel_size=4, activation='relu'))
model.add(MaxPooling1D(pool_size=(2)))
model.add(Conv1D(filters=64, kernel_size=2, activation='relu'))
model.add(MaxPooling1D(pool_size=(1)))
model.add(Dense(units=1, activation='sigmoid'))
model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])
model.fit(X_train_reshaped, y_train, batch_size=32, epochs=100, verbose=0)
loss, acc_CNN = model.evaluate(X_test_reshaped, y_test)
# pred_CNN = model.predict(X_test_reshaped).flatten()
# pred_CNN = (pred_CNN > 0.5)



LR with Anomaly Detection

In [328]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import LocalOutlierFactor
# from sklearn.cluster import KMeans
# from copy import deepcopy
# km = KMeans(n_clusters=2)
# km.fit(X)
# distances = km.transform(X)
# d1 = distances[:, 0]
# d2 = distances[:, 1]
# # sorted_idx = np.argsort(distances.ravel())[::-1][:5]
# s1 = np.argsort(d1)[::-1][:5]
# s2 = np.argsort(d2)[::-1][:5]
# s = list(set(np.concatenate([s1, s2])))
# t = deepcopy(s)
# for i in s:
#   print (i, y[i])
# new_X = np.delete(X, s, axis=0)
# new_y = np.delete(y, t, axis=0)
iso = LocalOutlierFactor()
yhat = iso.fit_predict(X_train)
mask = yhat != -1
X_train, y_train = X_train[mask, :], y_train[mask]
lr = LogisticRegression()
pred_lr = lr.fit(X_train, y_train).predict(X_test)

Accuracies of different methods

In [329]:
print ("MLP:                  ", accuracy_score(y_test, pred_mlpc))
print ("Random Forest:        ", accuracy_score(y_test, pred_rfc))
print ("Naive Bayes:          ", accuracy_score(y_test, pred_nbc))
print ("HRFLM:                ", accuracy_score(y_test, pred_HRFLM))
print ("GBCLR:                ", accuracy_score(y_test, pred_gbclr))
print ("GBCLR ALT:            ", accuracy_score(y_test, pred_gbclr_alt))
print ("Decision Tree:        ", accuracy_score(y_test, pred_dtc))
print ("SVM RBF:              ", accuracy_score(y_test, pred_svc_r))
print ("Fuzzy kNN:            ", accuracy_score(y_test, pred_fKNN))
print ("SVMRF (Our Method):   ", accuracy_score(y_test, pred_svcrf_r))
print ("LR Anomaly Detection: ", accuracy_score(y_test, pred_lr))
print ("CNN:                  ", acc_CNN)

MLP:                   0.7868852459016393
Random Forest:         0.8852459016393442
Naive Bayes:           0.8360655737704918
HRFLM:                 0.8852459016393442
GBCLR:                 0.8360655737704918
GBCLR ALT:             0.8360655737704918
Decision Tree:         0.7213114754098361
SVM Linear:            0.8688524590163934
SVM RBF:               0.8688524590163934
Fuzzy kNN:             0.9344262295081968
SVMRF Linear:          0.8688524590163934
SVMRF RBF:             0.9016393442622951
LR Anomaly Detection:  0.8852459016393442
CNN:                   0.7172130942344666
