## Import library yang digunakan

In [2]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as p
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn import tree
from sklearn.metrics import accuracy_score, confusion_matrix

## Load dataset

In [None]:
# Location of the dataset CSV, relative to the notebook's working directory.
data = 'heart.csv'
df   = pd.read_csv(filepath_or_buffer = data)

# Preview the first five rows of the loaded frame.
df.head(5)

## Preprocessing

### make feature and target data

In [None]:
# Features: every column of df except 'target'. Target: the 'target' column.
X = df.drop('target', axis = 1)
y = df.loc[:, 'target']

# One shape per line: features first, then target.
print(X.shape, y.shape, sep = '\n')

### split the dataset

In [None]:
# Hold out a test set, stratified on y so the split follows the class labels.
# random_state pins the shuffle so a fresh "Restart & Run All" reproduces the
# same split; the alpha hard-coded later in this notebook was read off plots
# that depend on this split.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, stratify = y, random_state = 0
)

print(x_train.shape)
print(x_test.shape)

## Make decision tree model using library

In [None]:
# Baseline model: a decision tree with no pruning limits.
# random_state is fixed so repeated runs build the same tree.
clf = tree.DecisionTreeClassifier(random_state = 0)
clf.fit(x_train, y_train)

# Predictions on both splits, reused by the scoring cell below.
y_train_pred = clf.predict(x_train)
y_test_pred  = clf.predict(x_test)

### visualize the model

In [None]:
plt.figure(figsize = (20, 20))

# Take feature names from the training frame, not df: df still contains the
# 'target' column, which is not a model input.
features = list(x_train.columns)
# NOTE(review): assumes target 0 = no disease and 1 = disease - confirm against the dataset.
classes  = ['Not heart disease', 'heart disease']

# Draw the fitted baseline tree; filled=True colors each node by its majority class.
tree.plot_tree(clf, feature_names = features, class_names = classes, filled = True)
plt.show()

### tampilkan confusion matrix

In [None]:
# confusion-matrix helper
def plot_confusionmatrix(y_train_pred, y_train, dom):
    """Print a header and draw the confusion matrix as an annotated heatmap.

    Parameters
    ----------
    y_train_pred : array-like
        Predicted labels for the split being evaluated.
    y_train : array-like
        True labels for the same samples.
    dom : str
        Name of the split (e.g. 'Train' or 'Test'); used only in the printed header.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    print(f'{dom} Confusion matrix')

    # Defined locally so the helper does not depend on a global `classes`
    # created in some other cell.
    # NOTE(review): assumes a binary target with 0 = no disease, 1 = disease - confirm.
    labels = ['Not heart disease', 'heart disease']

    # scikit-learn's signature is confusion_matrix(y_true, y_pred):
    # rows are the actual classes, columns are the predicted classes.
    cf = confusion_matrix(y_train, y_train_pred)

    # fmt='g' prints the counts as plain integers instead of scientific notation.
    ax = sns.heatmap(
        cf,
        annot = True,
        fmt = 'g',
        cmap = 'Blues',
        xticklabels = labels,
        yticklabels = labels,
    )
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.tight_layout()
    plt.show()

In [None]:
# Accuracy of the current predictions on each split.
train_score = accuracy_score(y_train_pred, y_train)
test_score  = accuracy_score(y_test_pred, y_test)
print(f'Train score {train_score}')
print(f'Test score {test_score}')

# Confusion matrices, train first and then test.
plot_confusionmatrix(y_train_pred, y_train, dom = 'Train')
plot_confusionmatrix(y_test_pred, y_test, dom = 'Test')

## Optimization Technique

### pre-pruning techniques

Pre-pruning merupakan teknik untuk menghentikan pertumbuhan decision tree lebih awal. Proses ini dilakukan dengan menerapkan beberapa batasan, seperti membatasi parameter (max_depth, min_samples_split, min_samples_leaf, dll.).

Cara efektif untuk melakukan pre-pruning adalah melakukan grid search terhadap parameter tersebut dan memilih kombinasi nilai yang memberikan hasil optimal pada data validasi (cross-validation).


1. max_depth : maximum depth of decision tree
2. min_samples_split: The minimum number of samples required to split an internal node
3. min_samples_leaf: The minimum number of samples required to be at a leaf node.

Implementasi teknik pre-pruning

In [None]:
# Candidate values for the three pre-pruning limits this notebook discusses:
# tree depth, minimum samples to split a node, minimum samples per leaf.
params = {
    'max_depth': [2, 4, 6, 8, 10, 12],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2],
}

# random_state keeps the tree construction reproducible across runs.
clf = tree.DecisionTreeClassifier(random_state = 0)
# Exhaustive, cross-validated search over every combination in `params`.
gcv = GridSearchCV(estimator = clf, param_grid = params)

gcv.fit(x_train, y_train)

In [None]:
# Evaluate the best tree found by the grid search. With GridSearchCV's default
# refit=True, best_estimator_ has already been refit on the whole training set.
model = gcv.best_estimator_
print(gcv.best_params_)

y_train_pred = model.predict(x_train)
y_test_pred  = model.predict(x_test)

print(f'Train score {accuracy_score(y_train_pred, y_train)}')
print(f'Test score {accuracy_score(y_test_pred, y_test)}')

plot_confusionmatrix(y_train_pred, y_train, dom = 'Train')
plot_confusionmatrix(y_test_pred, y_test, dom = 'Test')

visualisasi

In [None]:
plt.figure(figsize = (20, 20))

# Take feature names from the training frame, not df: df still contains the
# 'target' column, which is not a model input.
features = list(x_train.columns)
# NOTE(review): assumes target 0 = no disease and 1 = disease - confirm against the dataset.
classes  = ['Not heart disease', 'heart disease']

# Plot the best tree selected by the grid search.
tree.plot_tree(gcv.best_estimator_, feature_names = features, class_names = classes, filled = True)
plt.show()

### post-pruning techniques (cost complexity pruning)

Overfitting sering terjadi pada decision tree, dimana untuk mengatasinya salah satu cara yang dapat digunakan ialah dengan memberi batasan pada tree kita. Namun, cara yang paling efisien adalah melalui penggunaan teknik post-pruning, salah satunya adalah teknik cost complexity pruning. Teknik ini akan membantu meningkatkan akurasi dari model.

cost complexity pruning merupakan teknik untuk menemukan parameter yang paling sesuai untuk nilai alpha. Akan dicoba untuk mendapat nilai alpha dari tree dan kita evaluasi model hasilnya.

In [None]:
# Compute the cost-complexity pruning path on the training data. A fresh,
# seeded classifier is used so this cell does not depend on how `clf` was
# left by earlier cells.
path = tree.DecisionTreeClassifier(random_state = 0).cost_complexity_pruning_path(x_train, y_train)
# The returned object exposes the effective alphas and the matching total leaf impurities.
ccp_alphas, impurities = path.ccp_alphas, path.impurities

print(ccp_alphas)

In [None]:
# For each alpha we will append our model to a list
clfs = []

for ccp_alpha in ccp_alphas:
    # Same seed for every tree so that ccp_alpha is the only thing that varies.
    clf = tree.DecisionTreeClassifier(random_state = 0, ccp_alpha = ccp_alpha)
    clf.fit(x_train, y_train)
    clfs.append(clf)

Buang elemen terakhir dalam variabel clfs dan ccp_alphas, karena tree terakhir hanya memiliki 1 node.

In [None]:
# Drop the last model/alpha pair when that tree has collapsed to a single node.
# The guard keeps this cell idempotent: an unconditional [:-1] would discard
# one more model every time the cell is re-run.
if len(clfs) > 0 and clfs[-1].tree_.node_count == 1:
    clfs       = clfs[:-1]
    ccp_alphas = ccp_alphas[:-1]

node_counts = [clf.tree_.node_count for clf in clfs]
depth       = [clf.tree_.max_depth for clf in clfs]

# Node count and depth as step functions of alpha.
plt.scatter(ccp_alphas, node_counts)
plt.scatter(ccp_alphas, depth)
plt.plot(ccp_alphas, node_counts, label = 'no of nodes', drawstyle = 'steps-post')
plt.plot(ccp_alphas, depth, label = 'depth', drawstyle = 'steps-post')
plt.xlabel('alpha')
plt.title('Number of nodes and depth vs alpha')
plt.legend()
plt.show()

saat nilai alpha meningkat, jumlah nodes dan kedalaman berkurang.

In [None]:
# Accuracy of every pruned tree on both splits. Comprehensions keep the
# per-model predictions local, so the notebook-level y_train_pred / y_test_pred
# are not overwritten by this cell.
train_acc = [accuracy_score(y_train, c.predict(x_train)) for c in clfs]
test_acc  = [accuracy_score(y_test, c.predict(x_test)) for c in clfs]

# Train and test accuracy as step functions of alpha.
plt.scatter(ccp_alphas, train_acc)
plt.scatter(ccp_alphas, test_acc)
plt.plot(ccp_alphas, train_acc, label = 'train_accuracy', drawstyle = 'steps-post')
plt.plot(ccp_alphas, test_acc, label = 'test_accuracy', drawstyle = 'steps-post')
plt.xlabel('alpha')
plt.ylabel('accuracy')
plt.title('Accuracy vs alpha')
plt.legend()
plt.show()

pilih alpha = 0.020

In [None]:
# Final post-pruned tree, using the alpha stated in the markdown note above this cell.
clf_ = tree.DecisionTreeClassifier(ccp_alpha = 0.020, random_state = 0)
clf_.fit(x_train, y_train)

# Predictions on both splits.
y_train_pred, y_test_pred = clf_.predict(x_train), clf_.predict(x_test)

# Report accuracy first, then the confusion matrices (train before test).
train_score = accuracy_score(y_train_pred, y_train)
test_score  = accuracy_score(y_test_pred, y_test)
print(f'Train score {train_score}')
print(f'Test score {test_score}')
plot_confusionmatrix(y_train_pred, y_train, dom = 'Train')
plot_confusionmatrix(y_test_pred, y_test, dom = 'Test')

visualisasi

In [None]:
plt.figure(figsize = (20, 20))

# Take feature names from the training frame, not df: df still contains the
# 'target' column, so df.columns has one more name than the model has inputs
# and would mislabel nodes if 'target' were not the last column.
features = list(x_train.columns)
# NOTE(review): assumes target 0 = no disease and 1 = disease - confirm against the dataset.
classes  = ['Not heart disease', 'heart disease']

# Plot the cost-complexity-pruned tree.
tree.plot_tree(clf_, feature_names = features, class_names = classes, filled = True)
plt.show()