In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import pandas as pd

In [3]:
# Load the two datasets used throughout the notebook.

# Dataset 1: breast cancer (bundled with scikit-learn).
data1 = load_breast_cancer()

# Features become named columns; the class label is appended as `target`.
df1 = pd.DataFrame(data1.data, columns=data1.feature_names).assign(
    target=data1.target
)

# NOTE: this is not the last expression of the cell, so this preview is
# not rendered in the cell output.
df1.head()


# Dataset 2: PlayTennis, read from a local CSV file.
df2 = pd.read_csv('datasets/PlayTennis.csv')

# Last expression of the cell: this is the preview that gets displayed.
df2.head()

Unnamed: 0,outlook,temp,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes


In [5]:
# Hold out 20% of each dataset for testing (80% train / 20% test).
# random_state is pinned so the same split is produced on every run.

# Breast cancer: `target` is the label, every other column is a feature.
y1 = df1['target']
x1 = df1.drop(columns=['target'])

X_train1, X_test1, y_train1, y_test1 = train_test_split(
    x1, y1, test_size=0.2, random_state=1
)

# Tennis: `play` is the label, every other column is a feature.
y2 = df2['play']
x2 = df2.drop(columns=['play'])

X_train2, X_test2, y_train2, y_test2 = train_test_split(
    x2, y2, test_size=0.2, random_state=1
)

In [6]:
# Boolean mask over the tennis training columns: True where the column is
# stored with the `object` dtype.
s = X_train2.dtypes == 'object'

# Names of the columns flagged by the mask, in their original order.
object_cols = [name for name, is_object in s.items() if is_object]

print("Categorical variables:")
print(object_cols)

Categorical variables:
['outlook', 'temp', 'humidity']


In [7]:
from sklearn.preprocessing import LabelEncoder

# Work on copies so the original train/test frames stay untouched.
label_X_train2 = X_train2.copy()
label_X_test2 = X_test2.copy()

# One encoder per categorical feature, kept in a dict keyed by column name.
# Reusing a single encoder would overwrite its fitted mapping on every
# iteration, leaving no way to decode the integer codes afterwards
# (e.g. feature_encoders['outlook'].classes_).
# Each encoder is fitted on the training split only and then applied to the
# test split; per the scikit-learn docs, transform() raises ValueError if the
# test split contains a category that was absent from the training split.
feature_encoders = {}
for col in object_cols:
    encoder = LabelEncoder()
    label_X_train2[col] = encoder.fit_transform(X_train2[col])
    label_X_test2[col] = encoder.transform(X_test2[col])
    feature_encoders[col] = encoder

# The target gets its own encoder. The name `label_encoder` is kept, and it
# ends up fitted on the target classes exactly as before.
label_encoder = LabelEncoder()
label_y_train2 = label_encoder.fit_transform(y_train2)
label_y_test2 = label_encoder.transform(y_test2)

In [8]:
# ID3 decision trees: one model per dataset.

# Register the standalone `six` package under the name
# `sklearn.externals.six` before `id3` is imported.
# NOTE(review): presumably `id3` still imports six from that legacy
# location -- confirm against the installed id3 version.
import sys
import six
sys.modules['sklearn.externals.six'] = six

from id3 import Id3Estimator, export

# Breast cancer: fit on the raw numeric features, then render the tree as text.
id3_model1 = Id3Estimator()
id3_model1.fit(X_train1, y_train1)
tree_text1 = export.export_text(
    id3_model1.tree_, feature_names=list(x1.columns)
)

# Tennis: fit on the label-encoded features and target, then render the tree.
id3_model2 = Id3Estimator()
id3_model2.fit(label_X_train2, label_y_train2)
tree_text2 = export.export_text(
    id3_model2.tree_, feature_names=list(x2.columns)
)

# Only the tennis tree is printed; tree_text1 is computed but not shown.
print(tree_text2)


outlook <=0.50: 1 (3) 
outlook >0.50
|   humidity <=0.50: 0 (3) 
|   humidity >0.50
|   |   windy <=0.50: 1 (3) 
|   |   windy >0.50
|   |   |   temp <=1.00: 0 (1) 
|   |   |   temp >1.00: 1 (1) 



In [9]:
# export graphviz ver.
# export.export_graphviz(id3_model1.tree_, "out.dot", feature_names=list(x1.columns))

In [10]:
# Model evaluation on the training split
from sklearn.metrics import accuracy_score, f1_score

# scikit-learn metrics take their arguments as (y_true, y_pred). Accuracy and
# binary F1 are symmetric, so the printed values are not expected to change,
# but the documented order is used so that an asymmetric metric
# (e.g. precision or recall) added later would not be silently swapped.

# breast cancer
predict_train1 = id3_model1.predict(X_train1)

print("Accuracy: ", accuracy_score(y_train1, predict_train1))
print("F1 score: ", f1_score(y_train1, predict_train1))


# tennis (evaluated against the label-encoded target)
predict_train2 = id3_model2.predict(label_X_train2)

print("Accuracy: ", accuracy_score(label_y_train2, predict_train2))
print("F1 score: ", f1_score(label_y_train2, predict_train2))

Accuracy:  1.0
F1 score:  1.0
Accuracy:  1.0
F1 score:  1.0


In [11]:
# Model prediction on the held-out test split

# scikit-learn metrics take their arguments as (y_true, y_pred). Accuracy and
# binary F1 are symmetric, so the printed values are not expected to change,
# but the documented order is used so that an asymmetric metric
# (e.g. precision or recall) added later would not be silently swapped.

# breast cancer
predict_test1 = id3_model1.predict(X_test1)

print("Accuracy: ", accuracy_score(y_test1, predict_test1))
print("F1 score: ", f1_score(y_test1, predict_test1))

# tennis (evaluated against the label-encoded target)
predict_test2 = id3_model2.predict(label_X_test2)

print("Accuracy: ", accuracy_score(label_y_test2, predict_test2))
print("F1 score: ", f1_score(label_y_test2, predict_test2))

Accuracy:  0.9385964912280702
F1 score:  0.953020134228188
Accuracy:  0.6666666666666666
F1 score:  0.6666666666666666


In [12]:
# Analysis of the Accuracy and F1 results obtained for all learning
# algorithms, in the form of a comparison of the values and a brief
# explanation of those results.