<a href="https://colab.research.google.com/github/andidprastyo/ML-2023/blob/main/05%20-%20Ensemble%20Learning/tugas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import Libraries

In [3]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

### Load Dataset

In [4]:
df = pd.read_csv('./Dataset/mushrooms.csv')
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


### Data Preprocessing

In [5]:
df.describe()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [7]:
df.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

### Feature Selection

In [8]:
le = LabelEncoder()
df_encoded = df.apply(le.fit_transform)

In [9]:
X = df_encoded.drop('class', axis=1)
y = df_encoded['class']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Tugas 1 : Decision Tree vs Random Forest

### Modelling

In [11]:
param_grid_decision_tree = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

dt_classifier = DecisionTreeClassifier(random_state=42)
grid_search_dt = GridSearchCV(dt_classifier, param_grid_decision_tree, cv=5)
grid_search_dt.fit(X_train, y_train)

In [12]:
param_grid_random_forest = {
    'n_estimators': [50, 100],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf_classifier = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(rf_classifier, param_grid_random_forest, cv=5)
grid_search_rf.fit(X_train, y_train)

In [13]:
best_dt_model = grid_search_dt.best_estimator_
best_dt_params = grid_search_dt.best_params_
best_dt_model.fit(X_train, y_train)
best_rf_model = grid_search_rf.best_estimator_
best_rf_params = grid_search_rf.best_params_
best_rf_model.fit(X_train, y_train)

In [14]:
y_pred_dt = best_dt_model.predict(X_test)
y_pred_rf = best_rf_model.predict(X_test)

y_train_pred_dt = best_dt_model.predict(X_train)
y_train_pred_rf = best_rf_model.predict(X_train)

accuracy_dt = accuracy_score(y_test, y_pred_dt)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

accuracy_train_dt = accuracy_score(y_train, y_train_pred_dt)
accuracy_train_rf = accuracy_score(y_train, y_train_pred_rf)

In [15]:
print("Decision Tree:")
print("Best Parameters:", best_dt_params)
print("Testing Accuracy:", accuracy_dt)
print("Training Accuracy:", accuracy_train_dt)

print("\nRandom Forest:")
print("Best Parameters:", best_rf_params)
print("Testing Accuracy:", accuracy_rf)
print("Training Accuracy:", accuracy_train_rf)

Decision Tree:
Best Parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Testing Accuracy: 1.0
Training Accuracy: 1.0

Random Forest:
Best Parameters: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Testing Accuracy: 1.0
Training Accuracy: 1.0


## Tugas 2 : Decision Tree vs AdaBoost

In [21]:
param_grid_adaboost = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.1, 0.5, 1.0]
}

base_dt = DecisionTreeClassifier(random_state=42)
adaboost_classifier = AdaBoostClassifier(estimator=base_dt, random_state=42)
grid_search_adaboost = GridSearchCV(adaboost_classifier, param_grid_adaboost, cv=5)
grid_search_adaboost.fit(X_train, y_train)

In [24]:
best_adaboost_model = grid_search_adaboost.best_estimator_
best_adaboost_model.fit(X_train, y_train)

y_pred_adaboost = best_adaboost_model.predict(X_test)
accuracy_adaboost = accuracy_score(y_test, y_pred_adaboost)

In [25]:
print("\nAdaBoost:")
print("Best Parameters:", grid_search_adaboost.best_params_)
print("Training Accuracy:", best_adaboost_model.score(X_train, y_train))
print("Testing Accuracy:", accuracy_adaboost)


AdaBoost:
Best Parameters: {'learning_rate': 0.1, 'n_estimators': 50}
Training Accuracy: 1.0
Testing Accuracy: 1.0
