In [36]:
#load utility libraries
from sklearn.datasets import load_iris
import pandas as pd
from sklearn.tree import export_text as et_dtl
from id3 import export_text as et_id3
from sklearn.preprocessing import LabelEncoder

#load ML libraries
from id3 import Id3Estimator
from sklearn.tree import DecisionTreeClassifier

## Membaca Data

In [25]:
# Iris: fetch the feature matrix and class ids, then combine them into one frame
# with readable column names and a 'label' column.
data, target = load_iris(return_X_y=True)
iris_data = pd.DataFrame(
    data,
    columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
).assign(label=pd.Series(target))

# Play-tennis: read the CSV and discard the 'day' column.
tennis_data = pd.read_csv("data/play_tennis.csv").drop(columns='day')

In [3]:
# Preview the first five rows of the play-tennis frame.
tennis_data.head(n=5)

Unnamed: 0,outlook,temp,humidity,wind,play
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes


In [28]:
#label-encode tennis data
# Fit a separate LabelEncoder per column and keep each one in `tennis_encoders`.
# Reusing one shared encoder for every column refits it each time, so only the
# last column's mapping survives and the integer codes of the other columns
# could no longer be decoded.
# NOTE: run this right after the loading cell; re-running it on the already
# encoded frame refits the encoders on integers instead of the original values.
tennis_encoders = {}
encoded_columns = {}
for column in tennis_data.columns:
    encoder = LabelEncoder()
    encoded_columns[column] = encoder.fit_transform(tennis_data[column])
    tennis_encoders[column] = encoder  # e.g. tennis_encoders['outlook'].classes_
# `encoder` stays bound to the last column's fitted encoder, as it was before.
tennis_data = pd.DataFrame(encoded_columns, index=tennis_data.index)
tennis_data.head()

Unnamed: 0,outlook,temp,humidity,wind,play
0,2,1,0,1,0
1,2,1,0,0,0
2,0,1,0,1,1
3,1,2,0,1,1
4,1,0,1,1,1


In [5]:
# Preview the first five rows of the iris frame.
iris_data.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


## Fitting Data dengan Algoritma _Machine Learning_

### 1. Iris Data

In [None]:
# Separate the iris frame into the feature columns and the 'label' column.
y_iris = iris_data['label']
X_iris = iris_data.drop(columns='label')

   - #### Fitting with ID3

In [23]:
# Fit the ID3 estimator on the iris data and print its tree as text.
id3 = Id3Estimator()
id3_fit = id3.fit(X_iris, y_iris)
tree_id3 = et_id3(
    id3_fit.tree_,
    feature_names=list(X_iris.columns),
)
print(tree_id3)


petal_length <=2.45: 0 (50) 
petal_length >2.45
|   petal_width <=1.75
|   |   sepal_length <=7.10
|   |   |   sepal_width <=2.85: 1 (27/4) 
|   |   |   sepal_width >2.85: 1 (22) 
|   |   sepal_length >7.10: 2 (1) 
|   petal_width >1.75
|   |   sepal_length <=5.95
|   |   |   sepal_width <=3.10: 2 (6) 
|   |   |   sepal_width >3.10: 1 (1) 
|   |   sepal_length >5.95: 2 (39) 



   - #### Fitting with DTL

In [38]:
# Fit scikit-learn's decision tree on the iris data and print the learned rules.
# random_state is pinned because the classifier permutes the features randomly
# at every split, so ties between equally good splits could otherwise produce a
# different tree on each run.
dtl = DecisionTreeClassifier(random_state=42)
dtl_fit = dtl.fit(X_iris, y_iris)
tree_dtl = et_dtl(dtl_fit, feature_names=X_iris.columns.tolist())
print(tree_dtl)

|--- petal_length <= 2.45
|   |--- class: 0
|--- petal_length >  2.45
|   |--- petal_width <= 1.75
|   |   |--- petal_length <= 4.95
|   |   |   |--- petal_width <= 1.65
|   |   |   |   |--- class: 1
|   |   |   |--- petal_width >  1.65
|   |   |   |   |--- class: 2
|   |   |--- petal_length >  4.95
|   |   |   |--- petal_width <= 1.55
|   |   |   |   |--- class: 2
|   |   |   |--- petal_width >  1.55
|   |   |   |   |--- sepal_length <= 6.95
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- sepal_length >  6.95
|   |   |   |   |   |--- class: 2
|   |--- petal_width >  1.75
|   |   |--- petal_length <= 4.85
|   |   |   |--- sepal_length <= 5.95
|   |   |   |   |--- class: 1
|   |   |   |--- sepal_length >  5.95
|   |   |   |   |--- class: 2
|   |   |--- petal_length >  4.85
|   |   |   |--- class: 2



### 2. Tennis Data

In [31]:
# Separate the play-tennis frame into the feature columns and the 'play' column.
y_tennis = tennis_data['play']
X_tennis = tennis_data.drop(columns='play')

   - #### Fitting with ID3

In [34]:
# Fit the ID3 estimator on the play-tennis data and print its tree as text.
id3 = Id3Estimator()
id3_fit = id3.fit(X_tennis, y_tennis)
tree_id3 = et_id3(
    id3_fit.tree_,
    feature_names=list(X_tennis.columns),
)
print(tree_id3)


outlook <=0.50: 1 (4) 
outlook >0.50
|   humidity <=0.50
|   |   temp <=1.50: 0 (2) 
|   |   temp >1.50
|   |   |   wind <=0.50: 0 (1) 
|   |   |   wind >0.50: 0 (1/1) 
|   humidity >0.50
|   |   wind <=0.50
|   |   |   temp <=1.00: 0 (1) 
|   |   |   temp >1.00: 1 (1) 
|   |   wind >0.50: 1 (3) 



   - #### Fitting with DTL

In [39]:
# Fit scikit-learn's decision tree on the play-tennis data and print the rules.
# random_state is pinned because the classifier permutes the features randomly
# at every split, so ties between equally good splits could otherwise produce a
# different tree on each run.
dtl = DecisionTreeClassifier(random_state=42)
dtl_fit = dtl.fit(X_tennis, y_tennis)
tree_dtl = et_dtl(dtl_fit, feature_names=X_tennis.columns.tolist())
print(tree_dtl)

|--- outlook <= 0.50
|   |--- class: 1
|--- outlook >  0.50
|   |--- humidity <= 0.50
|   |   |--- outlook <= 1.50
|   |   |   |--- wind <= 0.50
|   |   |   |   |--- class: 0
|   |   |   |--- wind >  0.50
|   |   |   |   |--- class: 1
|   |   |--- outlook >  1.50
|   |   |   |--- class: 0
|   |--- humidity >  0.50
|   |   |--- wind <= 0.50
|   |   |   |--- outlook <= 1.50
|   |   |   |   |--- class: 0
|   |   |   |--- outlook >  1.50
|   |   |   |   |--- class: 1
|   |   |--- wind >  0.50
|   |   |   |--- class: 1



## Menjawab Pertanyaan

**Note**: DTL pada scikit-learn (`DecisionTreeClassifier`) menggunakan versi teroptimasi dari algoritma CART (Classification and Regression Trees)

#### a. Penentuan atribut terbaik

##### - DTL
Menggunakan Gini index sebagai metrik, yang rumusnya dinyatakan sebagai berikut:
$$Gini = 1 - \sum_{i=1}^{C} p_i^2$$
di mana $C$ adalah banyaknya kelas (nilai unik pada label) dan $p_i$ adalah proporsi examples yang berlabel kelas ke-$i$

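Pada setiap iterasi, untuk tiap atribut (dan tiap kandidat threshold) dihitung Gini index berbobot dari partisi yang dihasilkan. Atribut terbaik adalah yang Gini index hasil split-nya paling rendah, yaitu yang memberikan penurunan impurity terbesar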

##### - ID3 (Buku)

Menggunakan Entropy dan Information Gain

$$Entropy(S) = -\sum_{i=1}^{C} p_i \log_2 p_i$$

$$Gain(S,A) = Entropy(S) - \sum_{v \in Values(A)} \frac{|S_v|}{|S|} Entropy(S_v)$$

Untuk setiap atribut dilakukan perhitungan gain. Atribut terbaik adalah yang memiliki nilai Gain terbesar

##### - ID3Estimator (Library)
Sama dengan ID3 pada buku

#### b. Penanganan label dari cabang setiap nilai atribut

##### - DTL

##### - ID3

Lab

##### - ID3Estimator

Lab

#### c. Penentuan label jika examples kosong di cabang tersebut

##### - DTL

##### - ID3

##### - ID3Estimator

Lab

#### d. Penanganan atribut kontinu

##### - DTL

##### - ID3

##### - ID3Estimator

Lab

#### e. Penanganan atribut dengan missing values

##### - DTL

##### - ID3

##### - ID3Estimator

Lab

#### f. Pruning dan parameter confidence

##### - DTL

##### - ID3

##### - ID3Estimator

Lab