#### **Tugas 1**

Terdapat dataset **mushroom**. Berdasarkan dataset tersebut, bandingkan performa antara algoritma Decision Tree dan Random Forest. Gunakan tuning hyperparameter untuk mendapatkan parameter dan akurasi yang terbaik.

In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [30]:
# Read the mushroom dataset from its CSV file and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='data/mushrooms.csv')
df.head(n=5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [31]:
# Count the missing values in each column (isna is the canonical name; isnull is its alias).
df.isna().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [32]:
# Encode every categorical column as integer codes.
# A separate LabelEncoder is fitted per column and kept in `label_encoders`, so any
# column (for example `class`) can later be decoded with
# label_encoders[column].inverse_transform(...). Reusing a single encoder would
# leave only the last column's mapping available after the loop.
# NOTE: this overwrites `df` in place; re-running the cell on the already-encoded
# frame refits the encoders on integers and loses the original letter mapping.
label_encoders = {}
for column in df.columns:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

In [33]:
# Separate the features (X) from the target label (y).
# The target is dropped by name rather than sliced away by position, so the split
# does not silently depend on `class` being the first column of the frame.
X = df.drop(columns='class')
y = df['class']

In [34]:
# Hold out 30% of the rows as the test set; stratify on y and fix the seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [35]:
# Hyperparameter search space for the Decision Tree.
param_grid_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10],
}

# Exhaustive grid search with 5-fold cross-validation, using every available core.
grid_dt = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid_dt,
    cv=5,
    n_jobs=-1,
)
grid_dt.fit(X_train, y_train)

In [36]:
# grid_dt has already been fitted on the training set in the grid-search cell;
# fitting it again here would rerun the entire cross-validated search for the same
# result, so the fitted search object is reused as-is.

# Retrieve the best estimator found by the search (refit on the full training set).
best_dt = grid_dt.best_estimator_
print(f"Best hyperparameters: {grid_dt.best_params_}")

# Predict the test set with the best model.
y_pred_dt = best_dt.predict(X_test)

# Accuracy on the training data.
y_train_pred = best_dt.predict(X_train)
acc_train = accuracy_score(y_train, y_train_pred)

# Accuracy on the test data.
acc_test = accuracy_score(y_test, y_pred_dt)

# Report the evaluation results.
print(f'Accuracy on train: {acc_train:.2f}')
print(f'Accuracy on test: {acc_test:.2f}')

Best hyperparameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
Accuracy on train: 1.00
Accuracy on test: 1.00


In [37]:
# Hyperparameter search space for the Random Forest.
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10],
    'criterion': ['gini', 'entropy'],
}

# Exhaustive grid search with 5-fold cross-validation, using every available core.
grid_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    cv=5,
    n_jobs=-1,
)
grid_rf.fit(X_train, y_train)

In [38]:
# grid_rf has already been fitted on the training set in the grid-search cell;
# fitting it again here would rerun the entire cross-validated search for the same
# result, so the fitted search object is reused as-is. The stand-alone
# RandomForestClassifier instance that used to be created here was never fitted or
# used, so it has been dropped.

# Show the best hyperparameters found by the search.
print("Best Hyperparameters:", grid_rf.best_params_)

# Use the model refit with the best hyperparameters.
best_rf = grid_rf.best_estimator_

# Predict the test-set labels.
y_pred_rf = best_rf.predict(X_test)

# Accuracy on the training data.
y_train_pred = best_rf.predict(X_train)
acc_train = accuracy_score(y_train, y_train_pred)

# Accuracy on the test data.
acc_test = accuracy_score(y_test, y_pred_rf)

# Report the evaluation results.
print(f'Accuracy on train: {acc_train:.2f}')
print(f'Accuracy on test: {acc_test:.2f}')

Best Hyperparameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Accuracy on train: 1.00
Accuracy on test: 1.00


In [39]:
# Fetch the best estimator from each fitted grid search.
best_dt, best_rf = grid_dt.best_estimator_, grid_rf.best_estimator_

In [40]:
# Score both tuned models on the held-out test set.
y_pred_dt, y_pred_rf = (model.predict(X_test) for model in (best_dt, best_rf))
accuracy_dt, accuracy_rf = (
    accuracy_score(y_test, predictions) for predictions in (y_pred_dt, y_pred_rf)
)

# Show each model's test accuracy next to its winning hyperparameters.
print("Akurasi Decision Tree terbaik: ", accuracy_dt)
print("Parameter terbaik Decision Tree: ", grid_dt.best_params_)
print("Akurasi Random Forest terbaik: ", accuracy_rf)
print("Parameter terbaik Random Forest: ", grid_rf.best_params_)

Akurasi Decision Tree terbaik:  1.0
Parameter terbaik Decision Tree:  {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
Akurasi Random Forest terbaik:  1.0
Parameter terbaik Random Forest:  {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
