# Tugas 2: Decision Tree vs AdaBoost

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.impute import SimpleImputer

# Mushrooms dataset: read the raw CSV from disk.
data_mushrooms = pd.read_csv('data/mushrooms.csv')

# Integer-encode every column (features and the 'class' target alike).
# One LabelEncoder instance is re-fitted on each column in turn, so after
# this step it only remembers the mapping of the last column it saw.
label_encoder = LabelEncoder()
data_mushrooms = data_mushrooms.apply(label_encoder.fit_transform)

# Separate the target ('class') from the remaining feature columns.
y_mushrooms = data_mushrooms['class']
X_mushrooms = data_mushrooms.drop(columns='class')

# Hold out 30% of the rows for testing; the seed keeps the split repeatable.
(
    X_train_mushrooms,
    X_test_mushrooms,
    y_train_mushrooms,
    y_test_mushrooms,
) = train_test_split(
    X_mushrooms,
    y_mushrooms,
    test_size=0.3,
    random_state=42,
)

# Decision Tree on the mushrooms data.
# Candidate hyperparameters explored by the exhaustive grid search.
param_grid_dt = dict(
    criterion=['gini', 'entropy'],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

dt = DecisionTreeClassifier(random_state=42)

# 5-fold cross-validated search over the grid, using all CPU cores.
grid_search_dt = GridSearchCV(
    estimator=dt,
    param_grid=param_grid_dt,
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(X_train_mushrooms, y_train_mushrooms)

# Best estimator found by the search, evaluated on the held-out test split.
best_dt = grid_search_dt.best_estimator_
y_pred_dt = best_dt.predict(X_test_mushrooms)

# Accuracy is reported as a percentage, followed by the per-class report.
accuracy_dt = 100 * accuracy_score(y_test_mushrooms, y_pred_dt)
print(
    f"Best Decision Tree Parameters: {grid_search_dt.best_params_}",
    f"Decision Tree Accuracy: {accuracy_dt:.2f}%",
    classification_report(y_test_mushrooms, y_pred_dt),
    sep="\n",
)

# AdaBoost on the mushrooms data.
# A fixed random_state makes the boosting run (and therefore the grid-search
# outcome and the reported accuracy) reproducible, consistent with the other
# estimators in this file, which all use random_state=42.
ada = AdaBoostClassifier(random_state=42)

# Hyperparameter grid for AdaBoost: number of boosting rounds and the
# shrinkage applied to each weak learner's contribution.
param_grid_ada = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1, 10]
}

# 5-fold cross-validated search over the grid, using all CPU cores.
grid_search_ada = GridSearchCV(estimator=ada, param_grid=param_grid_ada, cv=5, n_jobs=-1, verbose=1)
grid_search_ada.fit(X_train_mushrooms, y_train_mushrooms)

# Best estimator found by the search, evaluated on the held-out test split.
best_ada = grid_search_ada.best_estimator_
y_pred_ada = best_ada.predict(X_test_mushrooms)

# Accuracy (as a percentage) and the per-class classification report.
accuracy_ada = accuracy_score(y_test_mushrooms, y_pred_ada) * 100
print(f"Best AdaBoost Parameters: {grid_search_ada.best_params_}")
print(f"AdaBoost Accuracy: {accuracy_ada:.2f}%")
print(classification_report(y_test_mushrooms, y_pred_ada))

# Load the diabetes dataset.
df = pd.read_csv("data/diabetes.csv")

# Inspect the first rows and the number of NaN values per column.
print(df.head())
print(df.isnull().sum())

# Columns used as model inputs; 'Outcome' is the target.
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

# Columns in which a 0 is treated as a placeholder for a missing measurement.
# 'Pregnancies' is deliberately left out: 0 is a legitimate count there, so
# replacing it with the mean would corrupt valid data.
# NOTE(review): 'DiabetesPedigreeFunction' and 'Age' are assumed to contain no
# placeholder zeros - confirm against the data.
zero_as_missing_columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Split BEFORE imputing so the imputation means are learned from training
# rows only and no test-set statistics leak into the training data.
y = df.Outcome
X_train, X_test, y_train, y_test = train_test_split(df[feature_columns], y, test_size=0.3, random_state=42)

# Work on explicit copies so the column assignments below do not write
# through to (or warn about) slices of df.
X_train = X_train.copy()
X_test = X_test.copy()

# Replace placeholder zeros with the per-column mean of the training rows.
fill_values = SimpleImputer(missing_values=0, strategy="mean")
X_train[zero_as_missing_columns] = fill_values.fit_transform(X_train[zero_as_missing_columns])
X_test[zero_as_missing_columns] = fill_values.transform(X_test[zero_as_missing_columns])

# Keep df and X zero-imputed as well (using the training-derived means) so
# any later code that reads them still sees data without placeholder zeros.
df[zero_as_missing_columns] = fill_values.transform(df[zero_as_missing_columns])
X = df[feature_columns]

# Standardize features: fit the scaler on the training split only, then
# apply the same transformation to the test split.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# Logistic Regression on the standardized diabetes features.
# Grid of regularization strengths, solvers and iteration limits to try.
param_grid_logreg = dict(
    C=[0.1, 1, 10, 100],
    solver=['liblinear', 'lbfgs'],
    max_iter=[100, 200, 500],
)

log_reg = LogisticRegression(random_state=42)

# 5-fold cross-validated grid search, using all CPU cores.
grid_search_logreg = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid_logreg,
    cv=5,
    verbose=1,
    n_jobs=-1,
).fit(X_train_std, y_train)

# Predict the test split with the best estimator found by the search.
best_logreg_model = grid_search_logreg.best_estimator_
y_pred_logreg = best_logreg_model.predict(X_test_std)

# Report accuracy (stored as a fraction, printed as a percentage) and the
# per-class classification report.
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
print(f"Accuracy (Logistic Regression): {accuracy_logreg * 100:.2f}%")
print(f"Classification Report (Logistic Regression):\n{classification_report(y_test, y_pred_logreg)}")

# Support Vector Machine on the standardized diabetes features.
# Grid over the regularization strength, polynomial degree and gamma mode.
param_grid_svc = dict(
    C=[0.1, 1, 10],
    degree=[2, 3, 4],
    gamma=['scale', 'auto'],
)

# Polynomial-kernel SVC; probability=True enables predict_proba, which the
# soft-voting ensemble built later in this file relies on.
svc = SVC(kernel='poly', probability=True, random_state=42)

# 5-fold cross-validated grid search, using all CPU cores.
grid_search_svc = GridSearchCV(
    estimator=svc,
    param_grid=param_grid_svc,
    cv=5,
    verbose=1,
    n_jobs=-1,
).fit(X_train_std, y_train)

# Predict the test split with the best estimator found by the search.
best_svc_model = grid_search_svc.best_estimator_
y_pred_svc = best_svc_model.predict(X_test_std)

# Report accuracy (stored as a fraction, printed as a percentage) and the
# per-class classification report.
accuracy_svc = accuracy_score(y_test, y_pred_svc)
print(f"Accuracy (SVM): {accuracy_svc * 100:.2f}%")
print(f"Classification Report (SVM):\n{classification_report(y_test, y_pred_svc)}")

# Decision Tree on the standardized diabetes features.
# NOTE: dt, param_grid_dt, grid_search_dt, y_pred_dt and accuracy_dt are
# rebound here and replace the objects of the same names from the mushrooms
# section.
param_grid_dt = dict(
    max_depth=[3, 5, 7, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

dt = DecisionTreeClassifier(random_state=42)

# 5-fold cross-validated grid search, using all CPU cores.
grid_search_dt = GridSearchCV(
    estimator=dt,
    param_grid=param_grid_dt,
    cv=5,
    verbose=1,
    n_jobs=-1,
).fit(X_train_std, y_train)

# Predict the test split with the best estimator found by the search.
best_tree_model = grid_search_dt.best_estimator_
y_pred_dt = best_tree_model.predict(X_test_std)

# Report accuracy (stored as a fraction, printed as a percentage) and the
# per-class classification report.
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f"Accuracy (Decision Tree): {accuracy_dt * 100:.2f}%")
print(f"Classification Report (Decision Tree):\n{classification_report(y_test, y_pred_dt)}")

# Voting ensemble built from the tuned diabetes models.
# Take the best estimator selected by each of the three grid searches.
log_reg_best = grid_search_logreg.best_estimator_  # tuned Logistic Regression
svc_best = grid_search_svc.best_estimator_         # tuned SVM
dt_best = grid_search_dt.best_estimator_           # tuned Decision Tree

# Soft voting combines the members' predicted class probabilities.
voting_members = [
    ('lr', log_reg_best),
    ('svc', svc_best),
    ('dt', dt_best),
]
voting_clf = VotingClassifier(estimators=voting_members, voting='soft')

# Fit the ensemble on the standardized training split.
voting_clf.fit(X_train_std, y_train)

# Predict the standardized test split.
y_pred_voting = voting_clf.predict(X_test_std)

# Report accuracy (stored as a fraction, printed as a percentage) and the
# per-class classification report.
accuracy_voting = accuracy_score(y_test, y_pred_voting)
print(f"Accuracy (Ensemble Voting): {accuracy_voting * 100:.2f}%")
print(f"Classification Report (Ensemble Voting):\n{classification_report(y_test, y_pred_voting)}")


Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Decision Tree Parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
Decision Tree Accuracy: 100.00%
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1257
           1       1.00      1.00      1.00      1181

    accuracy                           1.00      2438
   macro avg       1.00      1.00      1.00      2438
weighted avg       1.00      1.00      1.00      2438

Fitting 5 folds for each of 12 candidates, totalling 60 fits




Best AdaBoost Parameters: {'learning_rate': 1, 'n_estimators': 50}
AdaBoost Accuracy: 100.00%
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1257
           1       1.00      1.00      1.00      1181

    accuracy                           1.00      2438
   macro avg       1.00      1.00      1.00      2438
weighted avg       1.00      1.00      1.00      2438

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2             