# Advanced Machine Learning 

In [11]:
%matplotlib inline 
import matplotlib.pyplot as plt 
import numpy as np 
import pandas as pd 
from sklearn import datasets
from sklearn.model_selection import train_test_split

In [82]:
# Load dataset (iris), split into X_train, y_train, X_test, y_test!
# Write your code here 
dataset = datasets.load_iris()

X = dataset.data
y = dataset.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

## 1. Voting 

In [49]:
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Hint: 
# clf_voting = VotingClassifier(estimators=[('label1', clf_1), ('label2', clf_2), ('labelN', clf_N)]) 

# Create the individual models
clf_knn = KNeighborsClassifier(5)
clf_dt = DecisionTreeClassifier()
clf_lr = LogisticRegression()

# Create voting classifier
clf_voting = VotingClassifier(estimators=[('knn', clf_knn),
                                            ('dt', clf_dt),
                                            ('lr', clf_lr)])

# Fit it to the training set and predict
clf_voting.fit(X_train, y_train)
y_pred = clf_voting.predict(X_test)

# Get the accuracy score
acc = accuracy_score(y_test, y_pred)
print("Accuracy: {:0.3f}".format(acc))

Accuracy: 1.000




In [50]:
# Comparing the accuracy of the KNN model
import sklearn.metrics as metrics
clf_knn.fit(X_train, y_train)
y_pred_knn = clf_knn.predict(X_test)
acc_knn = accuracy_score(y_test, y_pred_knn)
print("Accuracy KNN: {:0.3f}".format(acc_knn))
print(metrics.classification_report(y_test, clf_knn.predict(X_test)))

Accuracy KNN: 0.978
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       0.94      1.00      0.97        15
           2       1.00      0.93      0.97        15

    accuracy                           0.98        45
   macro avg       0.98      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [107]:
# Comparing the accuracy of the Decision Tree model
import sklearn.metrics as metrics
clf_dt.fit(X_train, y_train)
y_pred_dt = clf_dt.predict(X_test)
acc_dt = accuracy_score(y_test, y_pred_dt)
print("Accuracy of decision tree: {:0.3f}".format(acc_dt))
print(metrics.classification_report(y_test, clf_dt.predict(X_test)))

Accuracy of decision tree: 0.978
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      0.93      0.97        15
           2       0.94      1.00      0.97        15

    accuracy                           0.98        45
   macro avg       0.98      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [106]:
# Comparing the accuracy of the Logistic Regression model
import sklearn.metrics as metrics
clf_lr.fit(X_train, y_train)
y_pred_lr = clf_lr.predict(X_test)
acc_lr = accuracy_score(y_test, y_pred_lr)
print("Accuracy: {:0.3f}".format(acc_lr))
print(metrics.classification_report(y_test, clf_lr.predict(X_test)))

Accuracy: 0.911
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      0.73      0.85        15
           2       0.79      1.00      0.88        15

    accuracy                           0.91        45
   macro avg       0.93      0.91      0.91        45
weighted avg       0.93      0.91      0.91        45





## 2. Averaging 

In [53]:
# Template for averaging Classifier 

from sklearn.ensemble import VotingClassifier

# clf_voting = VotingClassifier(
# estimators=[
#('label1', clf_1),
#('label2', clf_2),
#...
#('labelN', clf_N)],
#voting='soft',
#weights=[w_1, w_2, ..., w_N]
#)

In [105]:
# Initiate the individual models 

# Write your code here! 
clf_knn = KNeighborsClassifier(5)
clf_dt = DecisionTreeClassifier()
clf_lr = LogisticRegression()

# Create averaging classifier

# Write your code here! 
clf_averaging = VotingClassifier(estimators=[
                                ('knn', clf_knn),
                                ('dt', clf_dt),
                                ('lr', clf_lr)],
                                voting='soft',
                                weights=[2, 1, 2]
                                )

clf_averaging.fit(X_train, y_train)
y_pred_avg = clf_averaging.predict(X_test)

acc_avg = accuracy_score(y_test, y_pred_avg)
print("Accuracy of averaging method: {:0.3f}".format(acc_avg))
print(metrics.classification_report(y_test, clf_averaging.predict(X_test)))

Accuracy of averaging method: 0.956
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       0.93      0.93      0.93        15
           2       0.93      0.93      0.93        15

    accuracy                           0.96        45
   macro avg       0.96      0.96      0.96        45
weighted avg       0.96      0.96      0.96        45





## 3. Bagging

In [104]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
bagging = BaggingClassifier(KNeighborsClassifier(3),
                           max_samples=0.5, max_features=0.5)

bagging.fit(X_train, y_train)
y_pred_bag = bagging.predict(X_test)

acc_bag = accuracy_score(y_test, y_pred_bag)
print("Accuracy of bagging method: {:0.3f}".format(acc_bag))
print(metrics.classification_report(y_test, bagging.predict(X_test)))

Accuracy of bagging method: 0.911
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       0.82      0.93      0.87        15
           2       0.92      0.80      0.86        15

    accuracy                           0.91        45
   macro avg       0.92      0.91      0.91        45
weighted avg       0.92      0.91      0.91        45



In [103]:
# Write your code here if base classifier = decision tree!
bagging_dt = BaggingClassifier(DecisionTreeClassifier(),
                           max_samples=0.5, max_features=0.5)

bagging_dt.fit(X_train, y_train)
y_pred_bagdt = bagging_dt.predict(X_test)

acc_bagdt = accuracy_score(y_test, y_pred_bagdt)
print("Accuracy of bagging method: {:0.3f}".format(acc_bagdt))
print(metrics.classification_report(y_test, bagging_dt.predict(X_test)))

Accuracy of bagging method: 0.911
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       0.82      0.93      0.87        15
           2       0.92      0.80      0.86        15

    accuracy                           0.91        45
   macro avg       0.92      0.91      0.91        45
weighted avg       0.92      0.91      0.91        45



In [120]:
# Write your code here if you use RandomForest, compare with above!

from sklearn.ensemble import RandomForestClassifier

randomforest = BaggingClassifier(RandomForestClassifier(n_estimators=100),
                           max_samples=0.5, max_features=0.5)

randomforest.fit(X_train, y_train)
y_pred_rf = randomforest.predict(X_test)

acc_rf = accuracy_score(y_test, y_pred_rf)
print("Accuracy of random forest method: {:0.3f}".format(acc_rf))
print(metrics.classification_report(y_test, randomforest.predict(X_test)))

Accuracy of random forest method: 0.911
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       0.82      0.93      0.87        15
           2       0.92      0.80      0.86        15

    accuracy                           0.91        45
   macro avg       0.92      0.91      0.91        45
weighted avg       0.92      0.91      0.91        45



## Bagging using German credit data

In [93]:
# Bagging using German credit data

bc_data = datasets.load_breast_cancer()
X_bc = bc_data.data
y_bc = bc_data.target

In [114]:
bc_data_df = pd.DataFrame(bc_data.data, columns=bc_data.feature_names)
bc_data_df['target'] = pd.Series(y_bc)
bc_data_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [109]:
X_train_bc, X_test_bc, y_train_bc, y_test_bc = train_test_split(X_bc, y_bc, test_size=0.3, random_state=42)

In [112]:
bagging_bc = BaggingClassifier(DecisionTreeClassifier(),
                           max_samples=0.5, max_features=0.5, random_state=1)

bagging_bc.fit(X_train_bc, y_train_bc)
y_pred_bagbc = bagging_bc.predict(X_test_bc)

acc_bagbc = accuracy_score(y_test_bc, y_pred_bagbc)
print("Accuracy of bagging decision tree method: {:0.3f}".format(acc_bagbc))
print(metrics.classification_report(y_test_bc, bagging_bc.predict(X_test_bc)))

Accuracy of bagging decision tree method: 0.965
              precision    recall  f1-score   support

           0       0.95      0.95      0.95        63
           1       0.97      0.97      0.97       108

    accuracy                           0.96       171
   macro avg       0.96      0.96      0.96       171
weighted avg       0.96      0.96      0.96       171



In [116]:
bagging_bcknn = BaggingClassifier(KNeighborsClassifier(5),
                           max_samples=0.5, max_features=0.5, random_state=1)

bagging_bcknn.fit(X_train_bc, y_train_bc)
y_pred_bagbcknn = bagging_bcknn.predict(X_test_bc)

acc_bagbcknn = accuracy_score(y_test_bc, y_pred_bagbcknn)
print("Accuracy of bagging K-Nearest Neighbors method: {:0.3f}".format(acc_bagbcknn))
print(metrics.classification_report(y_test_bc, bagging_bcknn.predict(X_test_bc)))

Accuracy of bagging K-Nearest Neighbors method: 0.953
              precision    recall  f1-score   support

           0       0.98      0.89      0.93        63
           1       0.94      0.99      0.96       108

    accuracy                           0.95       171
   macro avg       0.96      0.94      0.95       171
weighted avg       0.95      0.95      0.95       171



In [119]:
bagging_bcrf = BaggingClassifier(RandomForestClassifier(n_estimators=20),
                           max_samples=0.5, max_features=0.5, random_state=1)

bagging_bcrf.fit(X_train_bc, y_train_bc)
y_pred_bagbcrf = bagging_bcrf.predict(X_test_bc)

acc_bagbcrf = accuracy_score(y_test_bc, y_pred_bagbcrf)
print("Accuracy of bagging random forest method: {:0.3f}".format(acc_bagbcrf))
print(metrics.classification_report(y_test_bc, bagging_bcrf.predict(X_test_bc)))

Accuracy of bagging random forest method: 0.965
              precision    recall  f1-score   support

           0       0.98      0.92      0.95        63
           1       0.96      0.99      0.97       108

    accuracy                           0.96       171
   macro avg       0.97      0.96      0.96       171
weighted avg       0.97      0.96      0.96       171



## 4. Boosting
Source: https://scikit-learn.org/stable/modules/ensemble.html

In [121]:
from sklearn.ensemble import AdaBoostClassifier

In [141]:
clf_ada = AdaBoostClassifier(
base_estimator=DecisionTreeClassifier(),
n_estimators=50,
learning_rate=0.11,
random_state=1
)

# base_estimator
# Default: Decision Tree (max_depth=1)
# n_estimators
# Default: 50
# learning_rate
# Default: 1.0
# Trade-off between n_estimators and
# learning_rate

clf_ada.fit(X_train_bc, y_train_bc)
y_pred_ada= clf_ada.predict(X_test_bc)

acc_ada = accuracy_score(y_test_bc, y_pred_ada)
print("Accuracy of AdaBoost method: {:0.3f}".format(acc_ada))
print(metrics.classification_report(y_test_bc, clf_ada.predict(X_test_bc)))

Accuracy of AdaBoost method: 0.936
              precision    recall  f1-score   support

           0       0.89      0.94      0.91        63
           1       0.96      0.94      0.95       108

    accuracy                           0.94       171
   macro avg       0.93      0.94      0.93       171
weighted avg       0.94      0.94      0.94       171



### Create AdaBoost Classifier for iris dataset!

In [136]:
# Write your code here!
from sklearn.ensemble import AdaBoostRegressor
import warnings
warnings.filterwarnings('ignore')

dataset_dia = datasets.load_diabetes()

X_dia = dataset_dia.data
y_dia = dataset_dia.target

X_train_dia, X_test_dia, y_train_dia, y_test_dia = train_test_split(X_dia, y_dia, test_size=0.3, random_state=42)

In [146]:
np.random.seed(123)
reg_ada = AdaBoostRegressor(
LogisticRegression(),
n_estimators=50,
learning_rate=0.5,
loss='square',
random_state=1
)

# base_estimator
# Default: Decision Tree (max_depth=3)
# loss
# linear (default)
# square
# exponential


reg_ada.fit(X_train_dia, y_train_dia)
y_pred_regada= reg_ada.predict(X_test_dia)

# acc_regada = accuracy_score(y_test_dia, y_pred_regada)
# print("Accuracy of AdaBoost method: {:0.3f}".format(acc_regada))
# print(metrics.classification_report(y_test_dia, reg_ada.predict(X_test_dia)))

from sklearn.metrics import r2_score, mean_squared_error
print('The r2 score using AdaBoost method is {:0.3f}'.format(r2_score(y_test_dia, y_pred_regada)))
print('The mean square error using AdaBoost method is {:0.3f}'.format(mean_squared_error(y_test_dia, y_pred_regada)))

The r2 score using AdaBoost method is 0.199
The mean square error using AdaBoost method is 4322.925


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

### Create GradientBoostingClassifier for iris Dataset!

In [5]:
# Write your code here!

In [147]:
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
# Cek di sini: 
# https://stackoverflow.com/questions/35139108/how-to-install-xgboost-in-anaconda-python-windows-platform

### Create XGBoost, lightgbm, catboost for iris Dataset

## 5. Stacking  

In [148]:
from mlxtend.classifier import StackingClassifier

In [155]:
# Instantiate the 1st-layer classifiers
clf_knn = KNeighborsClassifier(4)
clf_dt = DecisionTreeClassifier(random_state=1)
clf_rf = RandomForestClassifier(n_estimators=50, random_state=1)

# Instantiate the 2nd-layer classifier
clf_lr = LogisticRegression(random_state=1)

# Build the Stacking classifier
clf_stack = StackingClassifier(
classifiers=[clf_knn, clf_dt, clf_rf],
meta_classifier=clf_lr,
use_probas=False,
use_features_in_secondary=False)

# Use the fit and predict methods
# like with scikit-learn estimators
clf_stack.fit(X_train_bc, y_train_bc)
y_pred_stack = clf_stack.predict(X_test_bc)

acc_stack = accuracy_score(y_test_bc, y_pred_stack)
print("Accuracy of Stacking method: {:0.3f}".format(acc_stack))
print(metrics.classification_report(y_test_bc, clf_stack.predict(X_test_bc)))

Accuracy of Stacking method: 0.982
              precision    recall  f1-score   support

           0       1.00      0.95      0.98        63
           1       0.97      1.00      0.99       108

    accuracy                           0.98       171
   macro avg       0.99      0.98      0.98       171
weighted avg       0.98      0.98      0.98       171

