# Da best heart disease classifier in town
- 13516083 / Abram Perdanaputra
- 13516090 / Timothy Thamrin Andrew Hamonangan Sihombing
- 13516093 / Muhammad Farhan
- 13516153 / Dimas Aditia Pratikto
- 13516155 / Restu Wahyu Kartiko

In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

## Read dataset

Now we're going to read the `tubes2_HeartDisease_train` and `tubes2_HeartDisease_test`

In [2]:
heart_disease = {}
heart_disease['columns_detail'] = [
    'Age', 
    'Sex', 
    'Pain type', 
    'Blood pressure', 
    'Serum cholesterol', 
    'Fasting blood sugar > 120mg/dl', 
    'Resting ECG', 
    'Max heart rate achieved', 
    'exercise induced agina', 
    'ST depression induced by exercise relative to rest', 
    'Peak exercise ST segment', 
    'Number of major vessels colored by flourosopy', 
    'Thal', 
    'Diagnosis'
]
heart_disease['train'] = pd.read_csv('../data/tubes2_HeartDisease_train.csv')
heart_disease['test'] = pd.read_csv('../data/tubes2_HeartDisease_test.csv')

In [3]:
heart_disease['train'].head()

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column12,Column13,Column14
0,54,1,4,125,216,0,0,140,0,0.0,?,?,?,1
1,55,1,4,158,217,0,0,110,1,2.5,2,?,?,1
2,54,0,3,135,304,1,0,170,0,0.0,1,0,3,0
3,48,0,3,120,195,0,0,125,0,0.0,?,?,?,0
4,50,1,4,120,0,0,1,156,1,0.0,1,?,6,3


## Data preprocessing

In [4]:
# Helpers
def preprocess_data(data):
    data.Column4 = pd.to_numeric(data.Column4, errors='coerce')
    data.Column5 = pd.to_numeric(data.Column5, errors='coerce')
    data.Column6 = pd.to_numeric(data.Column6, errors='coerce')
    data.Column8 = pd.to_numeric(data.Column8, errors='coerce')
    data.Column9 = pd.to_numeric(data.Column9, errors='coerce')
    data.Column10 = pd.to_numeric(data.Column10, errors='coerce')
    data.Column12 = pd.to_numeric(data.Column12, errors='coerce')
    
    data.loc[data['Column3'] == 1, 'Column3'] = 'typical_agina'
    data.loc[data['Column3'] == 2, 'Column3'] = 'atypical_agina'
    data.loc[data['Column3'] == 3, 'Column3'] = 'non_aginal_pain'
    data.loc[data['Column3'] == 4, 'Column3'] = 'asymtotic'
    
    data.loc[data['Column4'].isnull(), 'Column4'] = data['Column4'].mean()
    data.loc[data['Column5'].isnull(), 'Column5'] = data['Column5'].mean()
    data.loc[data['Column6'].isnull(), 'Column6'] = 0

    data.loc[data['Column7'] == '0', 'Column7'] = 'normal'
    data.loc[data['Column7'] == '1', 'Column7'] = 'having ST-T wave abnormality'
    data.loc[data['Column7'] == '2', 'Column7'] = 'left ventricular hyperthrophy'
    data.loc[data['Column7'] == '?', 'Column7'] = 'normal'

    data.loc[data['Column8'].isnull(), 'Column8'] = 138.348299
    data.loc[data['Column9'].isnull(), 'Column9'] = 0.0
    data.loc[data['Column10'].isnull(), 'Column10'] = 3.937397

    data.loc[data['Column11'] == '1', 'Column11'] = 'upsloping'
    data.loc[data['Column11'] == '2', 'Column11'] = 'flat'
    data.loc[data['Column11'] == '3', 'Column11'] = 'downsloping'
    data.loc[data['Column11'] == '?', 'Column11'] = 'flat'
    
    data.loc[data['Column12'].isnull(), 'Column12'] = 0.686792

    data.loc[data['Column13'] == '3', 'Column13'] = 'normal'
    data.loc[data['Column13'] == '6', 'Column13'] = 'fixed_defect'
    data.loc[data['Column13'] == '7', 'Column13'] = 'reversable_defect'
    data.loc[data['Column13'] == '?', 'Column13'] = 'normal'
    return data

2 jenis:
1. numeric
    - preprocess: normalize
    - null: mean or median, or buat model yang ditrain di datayang ganull, terus predict null
2. categorical
    1. Nominal: urutan gangaruh (fakultas)
         - preprocess: one hot encoding
    2. ordinal: categorical yang urutan ngaruh (rating)
         - preprocess: label encoding pd.get dummies

In [5]:
heart_disease['train'].replace([np.inf, -np.inf], np.nan)
heart_disease['train'] = preprocess_data(heart_disease['train'])
heart_disease['train']#[heart_disease['train']['Column7'].astype(str) == '?']

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column12,Column13,Column14
0,54,1,asymtotic,125.000000,216.000000,0.0,normal,140.000000,0.0,0.000000,flat,0.686792,normal,1
1,55,1,asymtotic,158.000000,217.000000,0.0,normal,110.000000,1.0,2.500000,flat,0.686792,normal,1
2,54,0,non_aginal_pain,135.000000,304.000000,1.0,normal,170.000000,0.0,0.000000,upsloping,0.000000,normal,0
3,48,0,non_aginal_pain,120.000000,195.000000,0.0,normal,125.000000,0.0,0.000000,flat,0.686792,normal,0
4,50,1,asymtotic,120.000000,0.000000,0.0,having ST-T wave abnormality,156.000000,1.0,0.000000,upsloping,0.686792,fixed_defect,3
5,64,0,asymtotic,130.000000,303.000000,0.0,normal,122.000000,0.0,2.000000,flat,2.000000,normal,0
6,63,1,asymtotic,130.000000,308.000000,0.0,normal,138.000000,1.0,2.000000,flat,0.686792,normal,2
7,58,1,atypical_agina,130.000000,251.000000,0.0,normal,110.000000,0.0,0.000000,flat,0.686792,normal,0
8,42,1,atypical_agina,150.000000,268.000000,0.0,normal,136.000000,0.0,0.000000,flat,0.686792,normal,0
9,54,1,non_aginal_pain,120.000000,258.000000,0.0,left ventricular hyperthrophy,147.000000,0.0,4.000000,flat,0.000000,reversable_defect,0


In [6]:
df = pd.get_dummies(heart_disease['train'])
df.head()

Unnamed: 0,Column1,Column2,Column4,Column5,Column6,Column8,Column9,Column10,Column12,Column14,...,Column3_typical_agina,Column7_having ST-T wave abnormality,Column7_left ventricular hyperthrophy,Column7_normal,Column11_downsloping,Column11_flat,Column11_upsloping,Column13_fixed_defect,Column13_normal,Column13_reversable_defect
0,54,1,125.0,216.0,0.0,140.0,0.0,0.0,0.686792,1,...,0,0,0,1,0,1,0,0,1,0
1,55,1,158.0,217.0,0.0,110.0,1.0,2.5,0.686792,1,...,0,0,0,1,0,1,0,0,1,0
2,54,0,135.0,304.0,1.0,170.0,0.0,0.0,0.0,0,...,0,0,0,1,0,0,1,0,1,0
3,48,0,120.0,195.0,0.0,125.0,0.0,0.0,0.686792,0,...,0,0,0,1,0,1,0,0,1,0
4,50,1,120.0,0.0,0.0,156.0,1.0,0.0,0.686792,3,...,0,1,0,0,0,0,1,1,0,0


## Experiment

In [7]:
df

Unnamed: 0,Column1,Column2,Column4,Column5,Column6,Column8,Column9,Column10,Column12,Column14,...,Column3_typical_agina,Column7_having ST-T wave abnormality,Column7_left ventricular hyperthrophy,Column7_normal,Column11_downsloping,Column11_flat,Column11_upsloping,Column13_fixed_defect,Column13_normal,Column13_reversable_defect
0,54,1,125.000000,216.000000,0.0,140.000000,0.0,0.000000,0.686792,1,...,0,0,0,1,0,1,0,0,1,0
1,55,1,158.000000,217.000000,0.0,110.000000,1.0,2.500000,0.686792,1,...,0,0,0,1,0,1,0,0,1,0
2,54,0,135.000000,304.000000,1.0,170.000000,0.0,0.000000,0.000000,0,...,0,0,0,1,0,0,1,0,1,0
3,48,0,120.000000,195.000000,0.0,125.000000,0.0,0.000000,0.686792,0,...,0,0,0,1,0,1,0,0,1,0
4,50,1,120.000000,0.000000,0.0,156.000000,1.0,0.000000,0.686792,3,...,0,1,0,0,0,0,1,1,0,0
5,64,0,130.000000,303.000000,0.0,122.000000,0.0,2.000000,2.000000,0,...,0,0,0,1,0,1,0,0,1,0
6,63,1,130.000000,308.000000,0.0,138.000000,1.0,2.000000,0.686792,2,...,0,0,0,1,0,1,0,0,1,0
7,58,1,130.000000,251.000000,0.0,110.000000,0.0,0.000000,0.686792,0,...,0,0,0,1,0,1,0,0,1,0
8,42,1,150.000000,268.000000,0.0,136.000000,0.0,0.000000,0.686792,0,...,0,0,0,1,0,1,0,0,1,0
9,54,1,120.000000,258.000000,0.0,147.000000,0.0,4.000000,0.000000,0,...,0,0,1,0,0,1,0,0,0,1


In [17]:
cv = 15

### K-Nearest Neighbor

Hyperparameters

In [18]:
num_neighbor = 5

Preprocess

F1 Score

In [19]:
KNN = KNeighborsClassifier(n_neighbors=num_neighbor, algorithm='ball_tree')
df.loc[:, df.columns != 'b']
f1_scores = cross_val_score(KNN, df.loc[:, df.columns != 'Column14']\
                , df['Column14'], cv=cv, scoring='f1_micro')
print("F! Score: {} +- {}".format(f1_scores.mean(), f1_scores.sd()))

array([0.46296296, 0.55555556, 0.41509434, 0.47169811, 0.51923077,
       0.51923077, 0.44230769, 0.53846154, 0.49019608, 0.43137255,
       0.52941176, 0.58823529, 0.47058824, 0.52941176, 0.47058824])

### Naive Bayes

Hyperparameters

Preprocess

F1 Score

### Decision Tree (ID3)

Hyperparameters

Preprocess

F1 Score

### Multilayer Perceptron

Hyperparameters

Preprocess

huyu