# Da best heart disease classifier in town
- 13516083 / Abram Perdanaputra
- 13516090 / Timothy Thamrin Andrew Hamonangan Sihombing
- 13516093 / Muhammad Farhan
- 13516153 / Dimas Aditia Pratikto
- 13516155 / Restu Wahyu Kartiko

In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split,GridSearchCV, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn import preprocessing
from sklearn.feature_selection import RFE

import matplotlib.pyplot as plt
import numpy as np
import copy
import numbers

%matplotlib inline

#### Functions

In [None]:
# learning curve function

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=5, n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot training and cross-validation score against training-set size.

    Parameters
    ----------
    estimator : object
        Estimator handed unchanged to ``learning_curve``.
    title : str
        Figure title.
    X, y :
        Features and target handed to ``learning_curve``.
    ylim : tuple or None
        When given, unpacked into ``plt.ylim`` to fix the y-axis range.
    cv, n_jobs, train_sizes :
        Forwarded to ``learning_curve``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # One entry per curve: (mean per size, std per size, colour, legend label).
    curves = [
        (scores.mean(axis=1), scores.std(axis=1), colour, caption)
        for scores, colour, caption in (
            (fit_scores, "r", "Training score"),
            (cv_scores, "g", "Cross-validation score"),
        )
    ]

    plt.grid()

    # Shaded +/- one standard deviation bands first, then the mean lines on top.
    for centre, spread, colour, _ in curves:
        plt.fill_between(sizes, centre - spread, centre + spread,
                         alpha=0.1, color=colour)
    for centre, _, colour, caption in curves:
        plt.plot(sizes, centre, 'o-', color=colour, label=caption)

    plt.legend(loc="best")
    plt.show()

## Read dataset

Now we're going to read the `tubes2_HeartDisease_train` and `tubes2_HeartDisease_test` datasets.

In [None]:
def fix_data(data):
    """Convert dataframe to appropriate types.

    ``Column3`` values equal to 1, 2, 3 or 4 become the strings '1'..'4';
    any other value is left as it is. Columns 4, 5, 6, 8, 9, 10 and 12 are
    parsed as numbers, and anything unparseable (for example '?') becomes NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the ``ColumnN`` columns named above. It is modified in
        place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, for call chaining.
    """
    # Cast to object before inserting strings: writing a str into an integer
    # column through .loc relies on an implicit upcast that pandas deprecated.
    code_labels = {code: str(code) for code in range(1, 5)}
    as_objects = data['Column3'].astype(object)
    data['Column3'] = as_objects.map(lambda value: code_labels.get(value, value))

    num_col = [4, 5, 6, 8, 9, 10, 12]

    for col in num_col:
        col_name = 'Column' + str(col)
        data[col_name] = pd.to_numeric(data[col_name], errors='coerce')

    return data

def show_data(df, columns):
    """Return a deep copy of ``df`` whose columns are relabelled with ``columns``.

    ``columns`` is assigned positionally, so it must have one label per
    column. The input frame is left untouched.
    """
    relabelled = df.copy(deep=True)
    relabelled.columns = columns
    return relabelled

In [None]:
# data columns and read data from files

heart_disease = {
    # Human-readable labels; show_data assigns them to the columns by position.
    'columns_detail': [
        'Age',
        'Sex',
        'Pain type',
        'Blood pressure',
        'Serum cholesterol',
        'Fasting blood sugar > 120mg/dl',
        'Resting ECG',
        'Max heart rate achieved',
        'exercise induced agina',
        'ST depression induced by exercise relative to rest',
        'Peak exercise ST segment',
        'Number of major vessels colored by flourosopy',
        'Thal',
        'Diagnosis',
    ],
}

# Raw CSV contents, loaded relative to the notebook's working directory.
heart_disease['train'] = pd.read_csv('../data/tubes2_HeartDisease_train.csv')
heart_disease['test'] = pd.read_csv('../data/tubes2_HeartDisease_test.csv')

In [None]:
# fix_data converts the columns in place and returns the same frame, so this
# re-assignment keeps heart_disease['train'] bound to the converted data.
heart_disease['train'] = fix_data(heart_disease['train'])
# Preview the first rows on a copy relabelled with the descriptive column names.
show_data(heart_disease['train'], heart_disease['columns_detail']).head()

In [None]:
# Second name for the training frame: same object, not a copy.
# NOTE(review): not referenced again in the visible cells.
coba = heart_disease['train']

## Data preprocessing

In [None]:
# Helpers
def preprocess_data(data):
    """Clean a type-converted frame and one-hot encode it.

    Steps, in order:

    1. Work on a copy; ``data`` itself is never modified.
    2. Replace +inf / -inf with NaN.
    3. Drop every row with more than three missing cells, where a missing
       cell is NaN, None or the '?' placeholder.
    4. Fill the remaining NaN of Column4, 5, 6, 8, 9, 10 and 12 with that
       column's median, computed on the rows that survived step 3.
    5. Replace '?' with a default category: '0' for Column7, '1' for
       Column11 and '0' for Column13.
    6. Drop Column12 and Column13, then one-hot encode the remaining
       non-numeric columns with ``pd.get_dummies``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding Column1..Column14, with the numeric columns listed in
        step 4 already converted to numbers.

    Returns
    -------
    pandas.DataFrame
        New cleaned and encoded frame; surviving rows keep their index labels.
    """
    median_filled = ('Column4', 'Column5', 'Column6', 'Column8',
                     'Column9', 'Column10', 'Column12')
    placeholder_defaults = {'Column7': '0', 'Column11': '1', 'Column13': '0'}

    df = data.copy(deep=True)

    # remove infinity so it is treated like any other missing value
    df = df.replace([np.inf, -np.inf], np.nan)

    # Drop rows by label, so this works for any index and not only the
    # default 0..n-1 one.
    missing_per_row = (df.isna() | df.isin(['?'])).sum(axis=1)
    df = df.loc[missing_per_row <= 3].copy()

    # Masks and medians come from the cleaned frame, so cells that were inf
    # are filled too and dropped rows do not influence the median.
    for col in median_filled:
        df[col] = df[col].fillna(df[col].median())

    # Each default goes to its own column (Column13's default used to be
    # written into Column11, injecting a spurious '0' category there).
    for col, default in placeholder_defaults.items():
        df[col] = df[col].replace('?', default)

    # NOTE: feature standardisation and oversampling of the rare target
    # classes were tried at this point in earlier revisions and are disabled.

    # dropping bad columns
    df = df.drop(['Column12', 'Column13'], axis=1)

    return pd.get_dummies(df)

def check_null(row):
    """Count the missing cells of ``row``.

    A cell counts as missing when it is None, a numeric NaN, or the
    non-numeric placeholder '?'.

    Parameters
    ----------
    row : iterable
        Cell values, for example a row Series from ``DataFrame.iterrows``.

    Returns
    -------
    int
        Number of missing cells.
    """
    def _is_missing(value):
        # Identity test for None; the three cases are mutually exclusive.
        if value is None:
            return True
        if isinstance(value, numbers.Number):
            return bool(np.isnan(value))
        return value == '?'

    return sum(1 for value in row if _is_missing(value))

def check_outlier(row):
    """Count the NaN and '?' cells of ``row``.

    NOTE(review): despite its name this function performs no outlier test;
    its body only counts missing cells. The previous version never
    initialised its counter and raised UnboundLocalError on every call. The
    counting logic is kept as written until real outlier detection is
    specified.

    Parameters
    ----------
    row : iterable
        Cell values to inspect.

    Returns
    -------
    int
        Number of cells that are a numeric NaN or the string '?'.
    """
    flagged = 0
    for value in row:
        if isinstance(value, numbers.Number):
            if np.isnan(value):
                flagged += 1
        elif value == '?':
            flagged += 1
    return flagged

In [None]:
# Cleaned, one-hot encoded training frame that the experiment cells below read.
df = preprocess_data(heart_disease['train'])

#### Data Distribution on Target

In [None]:
# Number of rows per value of the target column, Column14.
df['Column14'].value_counts()

## Experiment

In [None]:
# Number of folds passed as cv= to the cross_val_score calls of the
# KNN, Naive Bayes and decision tree cells.
cv = 5

### K-Nearest Neighbor

#### Hyperparameters

In [None]:
# Passed as n_neighbors to the KNeighborsClassifier built below.
num_neighbor = 5

#### Preprocess

In [None]:
# Preview the first rows of the preprocessed frame.
df.head()

#### F1 Score

In [None]:
knn = KNeighborsClassifier(n_neighbors=num_neighbor, algorithm='ball_tree')

# Hold-out split and fit. cross_val_score below re-fits clones of knn on its
# own folds, so this fit does not influence the printed scores.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('Column14', axis=1), df.Column14, test_size=0.2)
knn.fit(X_train, y_train)

# Same features / target / folds for both metrics; only the scorer changes.
knn_f1_scores, knn_accuracy_scores = (
    cross_val_score(knn, df.drop('Column14', axis=1), df['Column14'],
                    cv=cv, scoring=metric)
    for metric in ('f1_micro', 'accuracy')
)
print("F1 Score: {:.4f} +- {:.4f}".format(knn_f1_scores.mean(), knn_f1_scores.std()))
print("Accuracy Score: {:.4f} +- {:.4f}".format(knn_accuracy_scores.mean(), knn_accuracy_scores.std()))

### Naive Bayes

#### Hyperparameters

#### Preprocess

#### F1 Score

In [None]:
gnb = GaussianNB()

# Fresh random hold-out split. It rebinds the shared X_train / X_test /
# y_train / y_test names; gnb itself is only evaluated through
# cross_val_score below. (The target list used to be written twice in a
# chained assignment.)
X_train, X_test, y_train, y_test = train_test_split(df.drop('Column14', axis=1), df.Column14, test_size=0.2)

gnb_f1_scores = cross_val_score(gnb, df.loc[:, df.columns != 'Column14'],
                                df['Column14'], cv=cv, scoring='f1_micro')
gnb_accuracy_scores = cross_val_score(gnb, df.loc[:, df.columns != 'Column14'],
                                      df['Column14'], cv=cv, scoring='accuracy')
print("F1 Score: {:.4f} +- {:.4f}".format(gnb_f1_scores.mean(), gnb_f1_scores.std()))
print("Accuracy Score: {:.4f} +- {:.4f}".format(gnb_accuracy_scores.mean(), gnb_accuracy_scores.std()))

### Decision Tree (ID3)

In [None]:
# NOTE(review): not referenced in the visible cells (the tree below
# hard-codes random_state=0) — confirm before relying on or removing it.
rand = 5

#### F1 Score

In [None]:
# Shallow tree: depth <= 3, at most 8 candidate features per split, at least
# 8 samples per leaf. criterion='gini' is kept as configured even though the
# section is titled ID3.
# min_impurity_split and presort are no longer passed: both were removed from
# scikit-learn and the values used here were their defaults.
id3 = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=8, max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            min_samples_leaf=8, min_samples_split=2,
            min_weight_fraction_leaf=0.0, random_state=0,
            splitter='best')

# train
X_train, X_test, y_train, y_test = train_test_split(df.drop('Column14', axis=1), df.Column14,test_size=0.2)
id3.fit(X_train,y_train)

In [None]:
# Cross-validated scores on the full preprocessed frame; the same features,
# target and folds are used for both metrics.
id3_f1_scores, id3_accuracy_scores = (
    cross_val_score(id3, df.drop('Column14', axis=1), df['Column14'],
                    cv=cv, scoring=metric)
    for metric in ('f1_micro', 'accuracy')
)
print("F1 Score: {:.4f} +- {:.4f}".format(id3_f1_scores.mean(), id3_f1_scores.std()))
print("Accuracy Score: {:.4f} +- {:.4f}".format(id3_accuracy_scores.mean(), id3_accuracy_scores.std()))

In [None]:
# Exhaustive search over tree shape parameters, tuned on the hold-out
# training split with 5-fold cross-validation.
parameters = {
    'max_depth': [3, 6, 9, 12, 15],
    'min_samples_split': [2, 4, 8, 16],
    'min_samples_leaf': [1, 2, 4, 8, 16],
    'max_features': [2, 4, 8, 10],
}
grid_search = GridSearchCV(id3, parameters, cv=5).fit(X_train, y_train)

# Hold-out score of the hand-configured tree, then of the best tree found.
print(id3.score(X_test, y_test))
print(grid_search.score(X_test, y_test))

In [None]:
# Parameter combination with the best mean cross-validation score.
grid_search.best_params_

In [None]:
# Learning curve of the tree over the whole preprocessed frame, using the
# helper's default settings (cv=5, five training sizes).
plot_learning_curve(id3, "Plot", df.drop("Column14",axis=1), df['Column14'])

In [None]:
# classification_report expects (y_true, y_pred); with the arguments swapped
# the precision and recall columns are exchanged.
print(classification_report(y_test, id3.predict(X_test)))

In [None]:
# Per-feature importance of the fitted tree, one value per training column.
id3.feature_importances_

### Multilayer Perceptron

#### Hyperparameters

#### Preprocess

#### F1 Score

### FEATURE SELECTION

Metode feature selection yang digunakan adalah Recursive Feature Elimination (RFE). Metode ini mengurutkan atribut-atribut (ranking) dari urutan 1 (paling penting) hingga seterusnya (semakin tidak penting). Di sini Column12 dan Column13 masih belum disertakan, jadi untuk sementara Column4 dan Column5 dihapus dulu; katanya, kalau atribut yang ranking-nya tidak jelek dihapus, F1 score-nya jadi berkurang.

In [None]:
# Independent copies: plain assignment only aliases df, so the label-encoding
# loop below used to overwrite every column of df (and train_df) in place.
train_df = df.copy()
temp = df.copy()
le = preprocessing.LabelEncoder()

# Integer-encode every column of the scratch frame for the ranking model.
for column in temp:
    temp[column] = le.fit_transform(temp[column])

y = np.array(temp['Column14'])
# columns= keyword: the positional axis argument of drop() is rejected by pandas 2.
x = np.array(temp.drop(columns=['Column14']))

#feature extraction
model = LogisticRegression()
# n_features_to_select=1 makes RFE eliminate down to one feature, so
# ranking_ orders all of them; the argument is keyword-only in current
# scikit-learn.
rfe = RFE(model, n_features_to_select=1)
fit = rfe.fit(x, y)
print("Feature Ranking: ")
print(fit.ranking_)

In [None]:
# Preview the frame the MLP cells below are built from.
train_df.head()

In [None]:
# train_df = train_df.drop(['Column4'], axis=1)
# train_df = train_df.drop(['Column5'], axis=1)

In [None]:
# Encode the target as consecutive integers, one-hot encode any remaining
# non-numeric feature, then split into features / target and a hold-out set.

le = preprocessing.LabelEncoder()

le.fit(train_df['Column14'])
train_df['Column14'] = le.transform(train_df['Column14'])

train_df = pd.get_dummies(train_df)

y = train_df['Column14']
# columns= keyword: the positional axis argument of drop() is rejected by pandas 2.
x = train_df.drop(columns=['Column14'])

# train
X_train, X_test, y_train, y_test = train_test_split(x, y,test_size=0.2)

In [None]:
MLPlearn = MLPClassifier(solver='lbfgs',hidden_layer_sizes=(5, 2), random_state=1)
# Request the metric that is printed: without scoring= a classifier is scored
# by accuracy. For single-label targets micro-averaged F1 equals accuracy, so
# the numbers stay the same and the label becomes correct.
score = cross_val_score(MLPlearn, x, y, cv=10, scoring='f1_micro')
print("F1 Score: {} +- {}".format(score.mean(), score.std()))

In [None]:
# Single hidden layer of 14 units, fitted on the hold-out training split.
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(14,), random_state=1)
mlp.fit(X_train, y_train)
# 5-fold micro-F1 over the full x / y (cross_val_score fits its own clones of mlp).
mlp_f1_scores = cross_val_score(mlp, x\
                , y, cv=5, scoring='f1_micro')
print("F1 Score: {} +- {}".format(mlp_f1_scores.mean(), mlp_f1_scores.std()))
# Hold-out score of the model fitted above.
print(mlp.score(X_test,y_test))

In [None]:
# Fit mlp on the current hold-out training split (a previous cell issues the
# same call; re-running matters only if X_train / y_train changed since).
mlp.fit(X_train, y_train)

In [None]:
# Candidate architectures: one hidden layer of 5..14 units, a few two-layer
# shapes, and one wide 100-unit layer.
parameters = {
    'hidden_layer_sizes': (
        [(width,) for width in range(5, 15)]
        + [(5, 5), (6, 6), (7, 7), (10, 10), (100,)]
    ),
}
grid_search = GridSearchCV(mlp, parameters, cv=5).fit(X_train, y_train)

# Hold-out score of the hand-configured MLP, then of the best one found.
print(mlp.score(X_test, y_test))
print(grid_search.score(X_test, y_test))

In [None]:
# Architecture with the best mean cross-validation score.
grid_search.best_params_

In [None]:
# classification_report expects (y_true, y_pred); with the arguments swapped
# the precision and recall columns are exchanged.
print(classification_report(y_test, mlp.predict(X_test)))

### MLP Abram

In [None]:
# Search space for the lbfgs-trained MLP. 'learning_rate' is not searched:
# MLPClassifier only uses it with solver='sgd', so its three values tripled
# the number of fits without changing any result.
params = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100)],
    'solver': ['lbfgs'],
    'alpha': [1e-2, 1e-4, 1e-8],
}

# NOTE(review): scoring is a regression metric applied to class labels, so it
# penalises predictions by their numeric distance from the true label —
# confirm this is intended rather than 'f1_micro' / 'accuracy' as used in the
# other experiments.
best_model = GridSearchCV(MLPClassifier(random_state=1, activation='relu'),
                         params,
                         cv=5,
                         scoring='neg_mean_squared_error',
                         verbose=True)

best_model.fit(df.drop("Column14",axis=1), df['Column14'])

In [None]:
# Best mean cross-validation score of the search; the scorer is negated MSE,
# so values closer to 0 are better.
best_model.best_score_

In [None]:
# Parameter combination that achieved best_score_.
best_model.best_params_

In [None]:
# Five folds with shuffling left at its default (off): row order decides the split.
kf = KFold(n_splits=5)

In [None]:
def get_accuracy(x):
    """Return ``(mean, standard deviation)`` of the values in ``x``.

    The standard deviation is NumPy's default (population, ddof=0). Fed with
    0/1 correctness flags, the mean is the share of correct predictions.
    """
    values = np.asarray(x)
    return np.mean(values), np.std(values)

In [None]:
# Manual K-fold evaluation of one fixed MLP configuration: a fresh model is
# trained on each fold's training rows and scored on its held-out rows.
accuracies = []  # per-fold share of correct predictions
sds = []  # per-fold standard deviation of the 0/1 correctness flags
models = []  # fitted estimator of each fold, in the same order as accuracies
for train_index, test_index in kf.split(df):
    mlp_abram = MLPClassifier(alpha=0.01,
                         hidden_layer_sizes=(100, 100),
                         learning_rate='constant',
                         solver='lbfgs',
                         random_state=1,
                         activation='relu')
    # kf.split yields positional indices, hence iloc.
    train, test = df.iloc[train_index], df.iloc[test_index]
    mlp_abram.fit(train.drop(['Column14'], axis=1), train['Column14'])
    pred = mlp_abram.predict(test.drop(['Column14'], axis=1))
    # Element-wise comparison, converted to 0/1 so that the mean is the accuracy.
    equals = pred == test['Column14']
    equals = [int(elem) for elem in equals]
    accuracy, sd = get_accuracy(equals)
    accuracies.append(accuracy)
    sds.append(sd)
    models.append(mlp_abram)

In [None]:
# Prints the highest fold accuracy (a number), not the estimator itself; the
# matching estimator is models[accuracies.index(max(accuracies))].
print("Best model: {}".format(max(accuracies)))

### Save best model