In [2]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # import DT
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder # Kebutuhan encoding label

In [3]:
# Load data
df = pd.read_csv('data/mushrooms.csv')

df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [4]:
# Cek kolom null
df.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [5]:

# Menggunakan LabelEncoder untuk mengonversi nilai kategori menjadi ordinal
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()
for column in df.columns:
    df[column] = labelencoder.fit_transform(df[column])

df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [6]:
X = df.drop("class", axis=1)
y = df["class"]
X.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,5,2,4,1,6,1,0,1,4,0,...,2,7,7,0,2,1,4,2,3,5
1,5,2,9,1,0,1,0,0,4,0,...,2,7,7,0,2,1,4,3,2,1
2,0,2,8,1,3,1,0,0,5,0,...,2,7,7,0,2,1,4,3,2,3
3,5,3,8,1,6,1,0,1,5,0,...,2,7,7,0,2,1,4,2,3,5
4,5,2,3,0,5,1,1,0,4,1,...,2,7,7,0,2,1,0,3,0,1


In [7]:
# Cek jumlah fitur dan instance
print(X.shape)

# Cek label
print(y)

(8124, 22)
0       1
1       0
2       0
3       1
4       0
       ..
8119    0
8120    0
8121    0
8122    1
8123    0
Name: class, Length: 8124, dtype: int32


In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

## Training Decision Tree

In [9]:
# Secara default, DecisionTreeClassifier dari scikit-learn akan menggunakan nilai "Gini" untuk kriteria
# Terdapat beberapa "hyperparameter" yang dapat digunakan. Silahkan baca dokumentasi
# Pada kasus ini kita akan menggunakan parameter default
dt = DecisionTreeClassifier()

# Sesuaikan dt ke set training
dt.fit(X_train, y_train)

# Memprediksi label set test
y_pred_dt = dt.predict(X_test)

# Menghitung set accuracy
acc_dt = accuracy_score(y_test, y_pred_dt)
print("Test set accuracy: {:.2f}".format(acc_dt))
print(f"Test set accuracy: {acc_dt}")

Test set accuracy: 1.00
Test set accuracy: 1.0


## Training RandomForest

In [10]:
from sklearn.ensemble import RandomForestClassifier #import RandomForest
# Pada kasus kali ini kita akan menggunakan seluruh parameter default dari RandomForest
# Untuk detail parameter (hyperparameter) silahkan cek dokumentasi
rf = RandomForestClassifier(n_estimators=10, random_state=1)

# Sesuaikan dt ke set training
rf.fit(X_train, y_train)

# Memprediksi label set test
y_pred_rf = rf.predict(X_test)

# Menghitung set accuracy
acc_rf = accuracy_score(y_test, y_pred_rf)
print("Test set accuracy: {:.2f}".format(acc_rf))
print(f"Test set accuracy: {acc_rf}")

Test set accuracy: 1.00
Test set accuracy: 1.0


## Decision Tree dengan hyperparameter optimization

In [11]:
from sklearn.model_selection import GridSearchCV
#Mengatur nilai untuk parameter
max_depth = range(1,10)
min_samples_split = range(1,10)
min_samples_leaf = range(1,10)
max_features = range(1,10)

#membuat sebuah dictionary untuk  hyperparameter
hyperT = dict(max_depth = max_depth, min_samples_split = min_samples_split, 
              min_samples_leaf = min_samples_leaf, max_features=max_features)

#Mengaplikasikan GridSearchCV untuk mendapatkan milai terbaik untuk hyperparameter
gridT = GridSearchCV(dt, hyperT, cv = 3, verbose = 1, n_jobs = -1)
bestT = gridT.fit(X_train, y_train)

Fitting 3 folds for each of 6561 candidates, totalling 19683 fits


2187 fits failed out of a total of 19683.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2187 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\HP\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\HP\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\tree\_classes.py", line 969, in fit
    super().fit(
  File "c:\Users\HP\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\tree\_classes.py", line 265, in fit
    check_scalar(
  File "c:\Users\HP\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.

In [12]:
#Print hyperparameter terbaik
print('The best hyper parameters are: \n',gridT.best_params_)

The best hyper parameters are: 
 {'max_depth': 8, 'max_features': 8, 'min_samples_leaf': 1, 'min_samples_split': 8}


In [13]:
# Print estimator terbaik
print('The best estimators are: \n',gridT.best_estimator_)

The best estimators are: 
 DecisionTreeClassifier(max_depth=8, max_features=8, min_samples_split=8)


## Fitting Decision Tree dengan Hyperparameter terbaik

In [14]:
#Fitting the decision tree model with the best hyper parameters obtained through GridSearchCV
dt1 = DecisionTreeClassifier(max_depth=5, max_features=6, min_samples_leaf=7,
                       min_samples_split=7)
dt1.fit(X_train,y_train)
y_pred_dt1 = dt1.predict(X_test)

# Menghitung set accuracy
acc_dt1 = accuracy_score(y_test, y_pred_dt1)
print("Test set accuracy: {:.2f}".format(acc_dt1))
print(f"Test set accuracy: {acc_dt1}")

Test set accuracy: 0.95
Test set accuracy: 0.9526153846153846


## RandomForest dengan Hyperparameter Optimization

In [15]:
#Mengatur nilai untuk parameter
n_estimators = [100, 300, 500, 800, 1200]
max_depth = [5, 10, 15, 25, 30]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]

#Membuat sebuah dictionary untuk hyperparameters
hyper_rf = dict(n_estimators = n_estimators, max_depth = max_depth, 
              min_samples_split = min_samples_split, min_samples_leaf = min_samples_leaf)

#Mengaplikasikan GridSearchCV untuk mendapatkan milai terbaik untuk hyperparameter
gridrf = GridSearchCV(rf, hyper_rf, cv = 3, verbose = 1, n_jobs = -1)
bestrf = gridrf.fit(X_train, y_train)

Fitting 3 folds for each of 500 candidates, totalling 1500 fits


In [16]:
#Printing the best hyperparameters
print('The best hyper parameters are:\n',gridrf.best_params_)

The best hyper parameters are:
 {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [17]:
#Fitting the random forest model with the best hyper parameters obtained through GridSearchCV
rf1 = RandomForestClassifier(max_depth=5, min_samples_leaf=5, min_samples_split=5, n_estimators=100)
rf1.fit(X_train,y_train)
y_pred_rf1 = rf1.predict(X_test)


# Menghitung set accuracy
acc_rf1 = accuracy_score(y_test, y_pred_rf1)
print("Test set accuracy: {:.2f}".format(acc_rf1))
print(f"Test set accuracy: {acc_rf1}")

Test set accuracy: 0.99
Test set accuracy: 0.9870769230769231
