## Imports Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # import DT
from sklearn.ensemble import RandomForestClassifier # import RandomForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

## Load Dataset

In [2]:
df = pd.read_csv('./Dataset/mushrooms.csv')
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


# Data Prepocessing

In [3]:
df.describe()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [5]:
df.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

# Seleksi Fitur

In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df_encoded = df.apply(le.fit_transform)
X = df_encoded.drop('class', axis=1)
y = df_encoded['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tugas 1
Terdapat dataset mushroom. Berdasarkan dataset yang tersebut, bandingkan peforma antara algoritma Decision Tree dan RandomForest. Gunakan tunning hyperparameter untuk mendapatkan parameter dan akurasi yang terbaik.

In [12]:
# Decision Tree

from sklearn.model_selection import  GridSearchCV
# Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(random_state=42)

# Tunning hyperparameter
param_grid_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# GridSearchCV untuk mencari hyperparameter terbaik
grid_search_dt = GridSearchCV(dt_classifier, param_grid=param_grid_dt, cv=5, scoring='accuracy')
grid_search_dt.fit(X_train, y_train)

# Cetak hyperparameter terbaik untuk Decision Tree
best_dt_params = grid_search_dt.best_params_
print("Best Hyperparameter : ", best_dt_params)

# Model Decision Tree terbaik
best_dt_model = grid_search_dt.best_estimator_

# Melakukan prediksi
y_pred_dt = best_dt_model.predict(X_test)

# Menghitung akurasi
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print("Akurasi Decision Tree:", accuracy_dt)


Best Hyperparameter :  {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Akurasi Decision Tree: 1.0


In [13]:
# Inisialisasi model RandomForest
rf = RandomForestClassifier()

# Definisikan grid parameter untuk RandomForest
param_grid_rf = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}
# Lakukan pencarian grid untuk RandomForest
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, scoring='accuracy', cv=5)
grid_search_rf.fit(X_train, y_train)
# Cetak hyperparameter terbaik untuk RandomForest
print("Hyperparameter terbaik untuk RandomForest:", grid_search_rf.best_params_)

# Cetak akurasi terbaik untuk RandomForest pada data uji
y_pred_rf = grid_search_rf.best_estimator_.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)
print("Akurasi terbaik untuk RandomForest pada data uji:", acc_rf)

Hyperparameter terbaik untuk RandomForest: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Akurasi terbaik untuk RandomForest pada data uji: 1.0


# Tugas 2
Terdapat dataset mushroom. Berdasarkan dataset tersebut, bandingkan peforma antara algoritma Decision Tree dan AdaBoost. Gunakan tunning hyperparameter untuk mendapatkan parameter dan akurasi yang terbaik.

In [14]:
# Decision Tree

from sklearn.model_selection import  GridSearchCV
# Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(random_state=42)

# Tunning hyperparameter
param_grid_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# GridSearchCV untuk mencari hyperparameter terbaik
grid_search_dt = GridSearchCV(dt_classifier, param_grid=param_grid_dt, cv=5, scoring='accuracy')
grid_search_dt.fit(X_train, y_train)

# Cetak hyperparameter terbaik untuk Decision Tree
best_dt_params = grid_search_dt.best_params_
print("Best Hyperparameter : ", best_dt_params)

# Model Decision Tree terbaik
best_dt_model = grid_search_dt.best_estimator_

# Melakukan prediksi
y_pred_dt = best_dt_model.predict(X_test)

# Menghitung akurasi
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print("Akurasi Decision Tree:", accuracy_dt)

Best Hyperparameter :  {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Akurasi Decision Tree: 1.0


In [15]:
#adaBoost

from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(n_estimators=2)

# menyesuaikan dt ke set training
ada.fit(X_train, y_train)

# Memprediksi label set test
y_pred_ada = ada.predict(X_test)

#  menghitung set accuracy
acc_ada = accuracy_score(y_test, y_pred_ada)
print(f"Akurasi \ AdaBoost : {acc_ada}")

Akurasi algoritma AdaBoost : 0.8406153846153847


# Tugas 3
Dengan menggunakan dataset diabetes, buatlah ensemble voting dengan algoritma
<br>- Logistic Regression
<br>- SVM kernel polynomial
<br>- Decission Tree <br>
Anda boleh melakukan eksplorasi dengan melakukan tunning hyperparameter

# Import Library

In [16]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB # import Naive Bayes model Gaussian (asumsi data terdistribusi normal)
from sklearn.svm import SVC # import SVM classifier
from sklearn.ensemble import VotingClassifier # import model Voting
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load Dataset

In [18]:
df1 = pd.read_csv('./Dataset/diabetes.csv')
df1.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# Data Preprocessing

In [19]:
df1.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [20]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [21]:
df1.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [23]:
# Cek kolom yang bernilai 0
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
for column in feature_columns:
    print("============================================")
    print(f"{column} ==> Missing zeros : {len(df1.loc[df1[column] == 0])}")

Pregnancies ==> Missing zeros : 111
Glucose ==> Missing zeros : 5
BloodPressure ==> Missing zeros : 35
SkinThickness ==> Missing zeros : 227
Insulin ==> Missing zeros : 374
BMI ==> Missing zeros : 11
DiabetesPedigreeFunction ==> Missing zeros : 0
Age ==> Missing zeros : 0


In [24]:
# Impute nilai 0 dengan mean
from sklearn.impute import SimpleImputer

fill_values = SimpleImputer(missing_values=0, strategy="mean", copy=False)

df1[feature_columns] = fill_values.fit_transform(df1[feature_columns])

In [25]:
X = df1[feature_columns]
y = df1.Outcome

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standarisasi Fitur

In [27]:
# Karena asumsi Gaussian NB adalah data terdistribusi secara normal, maka kita perlu melakukan standarisasi

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# Standarisasi pada fitur di X_train dan X_test
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# Logistic Regression

In [28]:
from sklearn.linear_model import LogisticRegression
# Inisialisasi model Logistic Regression
logistic = LogisticRegression()

# Fit ke model Logistic Regression
logistic.fit(X_train_std, y_train)

# Prediksi
y_pred_logistic = logistic.predict(X_test_std)

# Evaluasi akurasi pada data uji
acc_logistic = accuracy_score(y_test, y_pred_logistic)

# Print hasil evaluasi
print("Test set accuracy (Logistic Regression): {:.2f}".format(acc_logistic))
print(f"Test set accuracy: {acc_logistic}")

Test set accuracy (Logistic Regression): 0.74
Test set accuracy: 0.7359307359307359


# SVM kernel polynomial

In [29]:
# Model SVM linier tanpa tunnning hyperparameter
svm_pol= SVC(kernel='poly')

# Fit ke model
svm_pol.fit(X_train_std, y_train)

# Prediksi
y_pred_svm_pol= svm_pol.predict(X_test_std)

# Evaluasi akurasi testing data
acc_svm_pol= accuracy_score(y_test, y_pred_svm_pol)

# Print hasil evaluasi
print("Test set accuracy: {:.2f}".format(acc_svm_pol))
print(f"Test set accuracy: {acc_svm_pol}")

Test set accuracy: 0.70
Test set accuracy: 0.696969696969697


# Decission Tree

In [31]:
dt = DecisionTreeClassifier()

# Latih model pada data pelatihan
dt.fit(X_train, y_train)

# Prediksi
y_pred_dt = dt.predict(X_test)

# Evaluasi akurasi pada data uji
acc_dt = accuracy_score(y_test, y_pred_dt)

# Print hasil evaluasi
print("Test set accuracy : {:.2f}".format(acc_dt))
print(f"Test set accuracy: {acc_dt}")

Test set accuracy : 0.73
Test set accuracy: 0.7316017316017316


# Voting Classifier

In [32]:
# Definisikan algoritma yang akan digunakan untuk voting

clf1 = DecisionTreeClassifier()
clf2 = SVC(kernel='poly')
clf3 = LogisticRegression()

# model hard voting
voting = VotingClassifier(estimators=[('DecisionTreeClassifieR', clf1), ('SVM-polynomial', clf2), ('Logistic Regression', clf3)], voting='hard')

# Fit model
voting.fit(X_train_std, y_train)

# Prediksi
y_pred_vt1 = voting.predict(X_test_std)

# Evaluasi akurasi testing data
acc_vt1 = accuracy_score(y_test, y_pred_vt1)

# Print hasil evaluasi
print('Voting Hard')
print("Test set accuracy: {:.2f}".format(acc_vt1))
print(f"Test set accuracy: {acc_vt1}")

Voting Hard
Test set accuracy: 0.74
Test set accuracy: 0.7402597402597403
