# Da best heart disease classifier in town
- 13516083 / Abram Perdanaputra
- 13516090 / Timothy Thamrin Andrew Hamonangan Sihombing
- 13516093 / Muhammad Farhan
- 13516153 / Dimas Aditia Pratikto
- 13516155 / Restu Wahyu Kartiko

In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split,GridSearchCV
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

import matplotlib.pyplot as plt
import numpy as np
import copy
%matplotlib inline

In [None]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=5, n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Draw and show a learning curve for ``estimator``.

    ``learning_curve`` is run on ``(X, y)`` with the given ``cv``, ``n_jobs``
    and ``train_sizes``. For both the training and the cross-validation
    scores, the mean across folds is plotted against the training-set size,
    with a shaded band of one standard deviation on either side.

    ``title`` is used as the figure title; ``ylim``, when not ``None``, is
    unpacked into ``plt.ylim``. Returns ``None``; the figure is shown via
    ``plt.show()``.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # (mean, std, colour, legend label) for each of the two curves.
    curves = [
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1),
         "r", "Training score"),
        (np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1),
         "g", "Cross-validation score"),
    ]

    plt.grid()

    # Shaded +-1 std bands first, then the mean lines on top.
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, caption in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=caption)

    plt.legend(loc="best")
    plt.show()

## Read dataset

Now we're going to read the `tubes2_HeartDisease_train` and `tubes2_HeartDisease_test` datasets.

In [None]:
def fix_data(data):
    """Relabel coded categorical columns and coerce numeric columns.

    The frame is modified in place and also returned.

    * Column3 (integer codes 1-4) and Column7 / Column11 / Column13 (string
      codes) have their codes replaced by descriptive labels; any other value
      (e.g. ``'?'``) is left as it is.
    * Column4, 5, 6, 8, 9, 10 and 12 are passed through ``pd.to_numeric``
      with ``errors='coerce'``, so unparseable entries become NaN.
    """
    code_labels = {
        'Column3': {
            1: 'typical_agina',
            2: 'atypical_agina',
            3: 'non_aginal_pain',
            4: 'asymtotic',
        },
        'Column7': {
            '0': 'normal',
            '1': 'having ST-T wave abnormality',
            '2': 'left ventricular hyperthrophy',
        },
        'Column11': {
            '1': 'upsloping',
            '2': 'flat',
            '3': 'downsloping',
        },
        'Column13': {
            '3': 'normal',
            '6': 'fixed_defect',
            '7': 'reversable_defect',
        },
    }
    for column, labels in code_labels.items():
        for code, label in labels.items():
            data.loc[data[column] == code, column] = label

    numeric_columns = (
        'Column4', 'Column5', 'Column6', 'Column8',
        'Column9', 'Column10', 'Column12',
    )
    for column in numeric_columns:
        data[column] = pd.to_numeric(data[column], errors='coerce')
    return data

In [None]:
# Shared container for the notebook: human-readable names for the 14 CSV
# columns, plus the train and test frames as read from disk.
heart_disease = {
    'columns_detail': [
        'Age', 'Sex', 'Pain type', 'Blood pressure', 'Serum cholesterol',
        'Fasting blood sugar > 120mg/dl', 'Resting ECG',
        'Max heart rate achieved', 'exercise induced agina',
        'ST depression induced by exercise relative to rest',
        'Peak exercise ST segment',
        'Number of major vessels colored by flourosopy',
        'Thal', 'Diagnosis',
    ],
}
heart_disease['train'] = pd.read_csv('../data/tubes2_HeartDisease_train.csv')
heart_disease['test'] = pd.read_csv('../data/tubes2_HeartDisease_test.csv')

In [None]:
# Preview the first rows of the training frame as read from the CSV.
heart_disease['train'].head()

## Data preprocessing

In [None]:
# Helpers
def preprocess_data(train_data):
    """Return a cleaned deep copy of a heart-disease frame.

    The input frame is not modified. On the copy:

    * Column4, 5, 6, 8, 9 and 10 are coerced to numeric (unparseable entries
      become NaN) and the NaNs are then filled: Column4 and Column5 with
      their own column mean, Column6 and Column9 with 0, Column8 and
      Column10 with fixed constants.
    * Column3 integer codes and Column7 / Column11 string codes are replaced
      by descriptive labels. A ``'?'`` becomes ``'normal'`` in Column7 and
      ``'flat'`` in Column11.

    Column12 and Column13 are left untouched.

    The previous version also iterated over every row to randomly impute
    ``'?'`` in Column11, but by that point every ``'?'`` had already been
    replaced by ``'flat'``, so the loop could never fire; it has been removed
    and the output is deterministic, as before.
    """
    data = train_data.copy(deep=True)

    numeric_columns = (
        'Column4', 'Column5', 'Column6', 'Column8', 'Column9', 'Column10',
    )
    for column in numeric_columns:
        data[column] = pd.to_numeric(data[column], errors='coerce')

    # NOTE(review): the Column8 / Column10 constants are hard-coded;
    # presumably they are column means from an earlier run over the training
    # set -- confirm before reusing on other data.
    fill_values = {
        'Column4': data['Column4'].mean(),
        'Column5': data['Column5'].mean(),
        'Column6': 0,
        'Column8': 138.348299,
        'Column9': 0.0,
        'Column10': 3.937397,
    }
    for column, value in fill_values.items():
        data[column] = data[column].fillna(value)

    code_labels = {
        'Column3': {
            1: 'typical_agina',
            2: 'atypical_agina',
            3: 'non_aginal_pain',
            4: 'asymtotic',
        },
        'Column7': {
            '0': 'normal',
            '1': 'having ST-T wave abnormality',
            '2': 'left ventricular hyperthrophy',
            '?': 'normal',
        },
        'Column11': {
            '1': 'upsloping',
            '2': 'flat',
            '3': 'downsloping',
            '?': 'flat',
        },
    }
    for column, labels in code_labels.items():
        for code, label in labels.items():
            data.loc[data[column] == code, column] = label

    return data

In [None]:
# DataFrame.replace returns a new frame rather than modifying in place, so
# the result has to be assigned back for the inf -> NaN replacement to stick.
heart_disease['train'] = heart_disease['train'].replace([np.inf, -np.inf], np.nan)
heart_disease['train'] = preprocess_data(heart_disease['train'])

In [None]:
# Bar chart of how many training rows carry each distinct Column14 value.
heart_disease['train']['Column14'].value_counts().plot.bar()

In [None]:
# One-hot encode the training frame with pd.get_dummies; the cell output is
# the number of columns after encoding.
heart_disease['train'] = pd.get_dummies(heart_disease['train'])
len(heart_disease['train'].columns)

## Experiment

In [None]:
# Value passed as cv= to the cross_val_score calls in the experiments below.
cv = 5

### K-Nearest Neighbor

Hyperparameters

In [None]:
# n_neighbors for the KNeighborsClassifier built in the next code cell.
num_neighbor = 5

Preprocess

F1 Score

In [None]:
# Cross-validate k-NN on the one-hot encoded training frame.
# The original cell read `df`, which is not assigned until the Naive Bayes
# section further down, so a top-to-bottom run raised NameError here. The
# frame prepared just above (heart_disease['train']) is used instead.
KNN = KNeighborsClassifier(n_neighbors=num_neighbor, algorithm='ball_tree')
knn_X = heart_disease['train'].loc[:, heart_disease['train'].columns != 'Column14']
knn_y = heart_disease['train']['Column14']
f1_scores = cross_val_score(KNN, knn_X, knn_y, cv=cv, scoring='f1_micro')
accuracy_scores = cross_val_score(KNN, knn_X, knn_y, cv=cv, scoring='accuracy')
print("F1 Score: {:.4f} +- {:.4f}".format(f1_scores.mean(), f1_scores.std()))
print("Accuracy Score: {:.4f} +- {:.4f}".format(accuracy_scores.mean(), accuracy_scores.std()))

### Naive Bayes

In [None]:
# Re-read the training CSV and run it through fix_data for the Naive Bayes
# experiment (independent of heart_disease['train']).
nb_df = fix_data(pd.read_csv('../data/tubes2_HeartDisease_train.csv'))

Hyperparameters

Preprocess

In [None]:
import numbers

def check_null(row):
    """Count the missing entries in ``row``.

    An entry counts as missing when it is a number for which ``np.isnan`` is
    true, or when it is a non-number equal to the string ``'?'``.

    ``row`` may be any iterable of values (e.g. a pandas row Series).
    Returns the count as an ``int``.
    """
    # Named `missing` rather than `sum` so the builtin is not shadowed.
    missing = 0
    for value in row:
        if isinstance(value, numbers.Number):
            if np.isnan(value):
                missing += 1
        elif value == '?':
            missing += 1
    return missing

def check_outlier(row):
    """Count the NaN / ``'?'`` entries in ``row``.

    NOTE(review): despite its name, this function contains no outlier test;
    its loop is the same missing-value count as ``check_null``. It looks like
    an unfinished placeholder -- confirm the intended outlier criterion.

    The previous body incremented ``sum`` without ever initialising it, so
    every call raised ``UnboundLocalError``. The counter is now initialised,
    and the unused ``outlier`` flag has been removed.

    ``row`` may be any iterable of values. Returns the count as an ``int``.
    """
    count = 0
    for value in row:
        if isinstance(value, numbers.Number):
            if np.isnan(value):
                count += 1
        elif value == '?':
            count += 1
    return count

def nb_preprocess(df, max_missing=4):
    """Drop Column12/Column13 and the rows with too many missing entries.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least ``Column12`` and ``Column13``. It is not
        modified.
    max_missing : int, default 4
        A row is kept when it has at most this many missing entries among
        the remaining columns. An entry is missing when it is NaN/None or
        the string ``'?'``.

    Returns
    -------
    pandas.DataFrame
        A new frame without the two columns and without the sparse rows.
        The surviving rows keep their original index labels.

    Raises
    ------
    KeyError
        If ``Column12`` or ``Column13`` is absent (raised by ``drop``).

    The previous version collected index *labels* from ``iterrows`` and then
    used them as *positions* (``df.index[i]``). That only worked for a
    default 0..n-1 index; rows are now selected with a boolean mask, which is
    correct for any index.
    """
    # dropping bad columns
    df = df.drop(['Column13', 'Column12'], axis=1)

    # dropping rows with too many null values
    missing_per_row = (df.isna() | df.eq('?')).sum(axis=1)
    df = df.loc[missing_per_row <= max_missing]

    # NOTE(review): outlier removal was planned here but never implemented.
    return df

In [None]:
# Build the Naive Bayes feature frame: nb_preprocess drops columns/rows,
# preprocess_data fills and relabels values, get_dummies one-hot encodes.
# `df` is the global frame read by the scoring cell that follows.
test = nb_preprocess(nb_df)
df = pd.get_dummies(preprocess_data(test))

F1 Score

In [None]:
# Cross-validate Gaussian Naive Bayes on every column except the Column14 target.
# (A leftover no-op statement, `df.loc[:, df.columns != 'b']`, was removed:
# its result was discarded.)
gnb = GaussianNB()
gnb_f1_scores = cross_val_score(gnb, df.loc[:, df.columns != 'Column14']\
                , df['Column14'], cv=cv, scoring='f1_micro')
gnb_accuracy_scores = cross_val_score(gnb, df.loc[:, df.columns != 'Column14']\
                , df['Column14'], cv=cv, scoring='accuracy')
print("F1 Score: {:.4f} +- {:.4f}".format(gnb_f1_scores.mean(), gnb_f1_scores.std()))
print("Accuracy Score: {:.4f} +- {:.4f}".format(gnb_accuracy_scores.mean(), gnb_accuracy_scores.std()))

### Decision Tree (ID3)

In [None]:
# Re-read the training CSV and run it through fix_data for the decision-tree
# experiment.
id3_df = fix_data(pd.read_csv('../data/tubes2_HeartDisease_train.csv'))

In [None]:
# Same preparation pipeline as the Naive Bayes section; this overwrites the
# global `df`.
pp_df = nb_preprocess(id3_df)
df = pd.get_dummies(preprocess_data(pp_df))
# NOTE(review): within this notebook `continuous` is only referenced by the
# commented-out log-transform cell below -- confirm whether it is still needed.
continuous = ['Column1', 'Column4', 'Column5', 'Column8', 'Column10']

In [None]:
# Preview the encoded frame.
df.head()

In [None]:
# for column in continuous:
#     df[column+"_log"]=np.log(df[column]+np.min(df[column])+1)

In [None]:
# Preview the frame again (the transform cell above is commented out, so the
# frame is unchanged).
df.head()

In [None]:
# Drop, in place, every column (axis=1) that contains at least one NaN.
df.dropna(axis=1,inplace=True)

In [None]:
# Per-column count of remaining NaN values.
df.isna().sum()

F1 Score

In [None]:
from math import sqrt

# Fixes relative to the original cell:
#  * `rand_state` was never defined anywhere above, so it is set here.
#  * The F1 run scored `tree`, which is only created in a later cell; both
#    runs now score `id3`.
#  * The estimator was a DecisionTreeRegressor, but it is scored with
#    classification metrics ('f1_micro', 'accuracy') and later fed to
#    classification_report, so a DecisionTreeClassifier is used.
# (A leftover no-op statement, `df.loc[:, df.columns != 'b']`, was removed.)
rand_state = 0
id3 = DecisionTreeClassifier(random_state=rand_state)
id3_f1_scores = cross_val_score(id3, df.loc[:, df.columns != 'Column14']\
                , df['Column14'], cv=cv, scoring='f1_micro')
id3_accuracy_scores = cross_val_score(id3, df.loc[:, df.columns != 'Column14']\
                , df['Column14'], cv=cv, scoring='accuracy')
print("F1 Score: {:.4f} +- {:.4f}".format(id3_f1_scores.mean(), id3_f1_scores.std()))
print("Accuracy Score: {:.4f} +- {:.4f}".format(id3_accuracy_scores.mean(), id3_accuracy_scores.std()))

In [None]:
# Expand the features to degree-2 polynomial terms, hold out 20% of the rows
# for testing, and fit the baseline tree on the remaining 80%.
poly = PolynomialFeatures(2)
X_train, X_test, y_train, y_test = train_test_split(
    poly.fit_transform(df.drop('Column14', axis=1)),
    df.Column14,
    test_size=0.2,
)
id3.fit(X_train, y_train)

# Exhaustive 5-fold search over the tree hyperparameters.
parameters = {
    'max_depth': [3, 6, 9, 12, 15],
    'min_samples_split': [2, 4, 8, 16],
    'min_samples_leaf': [1, 2, 4, 8, 16],
    'max_features': [5, 10, 15],
}
grid_search = GridSearchCV(id3, parameters, cv=5, verbose=3)
grid_search.fit(X_train, y_train)

# Held-out score of the baseline tree, then of the searched estimator.
print(id3.score(X_test, y_test))
print(grid_search.score(X_test, y_test))

In [None]:
# Repeat the search with a wider range of max_features candidates.
parameters = {
    'max_depth': [3, 6, 9, 12, 15],
    'min_samples_split': [2, 4, 8, 16],
    'min_samples_leaf': [1, 2, 4, 8, 16],
    'max_features': [10, 15, 20, 25, 27],
}
grid_search = GridSearchCV(id3, parameters, cv=5, verbose=3)
grid_search.fit(X_train, y_train)

In [None]:
# Show the hyperparameter combination selected by the latest grid search.
grid_search.best_params_

In [None]:
# Tuned decision tree. Only the non-default settings are passed. The original
# call also passed `min_impurity_split` and `presort`, which have been
# removed from scikit-learn and raise TypeError on current releases; every
# other argument it listed was already at its default value.
tree = DecisionTreeClassifier(
    max_depth=3,
    max_features=18,
    min_samples_leaf=4,
    random_state=0,
)

In [None]:
# Learning curve of `tree` on the encoded features without polynomial expansion.
plot_learning_curve(tree, "Plot", df.drop("Column14",axis=1), df['Column14'])

In [None]:
# Rebind `poly` to a degree-4 polynomial feature expansion for the next plot.
poly=PolynomialFeatures(4)

In [None]:
# Learning curve of `tree` on the features expanded by the current `poly`.
plot_learning_curve(tree, "Plot", poly.fit_transform(df.drop("Column14",axis=1)), df['Column14'])

In [None]:
# NOTE(review): head() is not the last expression of the cell, so its result
# is not displayed; `continuous` is overwritten with a single empty string,
# which looks like an unfinished placeholder -- confirm intent.
heart_disease["train"].head()
continuous=[""]

In [None]:
# classification_report expects (y_true, y_pred); the original call passed
# them the other way round, which swaps precision and recall in the report.
print(classification_report(y_test, id3.predict(X_test)))

### Multilayer Perceptron

Hyperparameters

Preprocess

F1 Score

In [None]:
# Cross-validate a small MLP (hidden layers of 5 and 2 units) on every
# column except the Column14 target.
# (A leftover no-op statement, `df.loc[:, df.columns != 'b']`, was removed:
# its result was discarded.)
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
mlp_f1_scores = cross_val_score(mlp, df.loc[:, df.columns != 'Column14']\
                , df['Column14'], cv=cv, scoring='f1_micro')
print("F1 Score: {} +- {}".format(mlp_f1_scores.mean(), mlp_f1_scores.std()))