In [1]:
import copy
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Import data
filename = "data/bank.csv"

# Load
data = pd.read_csv(filename, sep=';')
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [3]:
cols = ["age", "balance", "duration", "day", "default"]

data = data[cols]
data["default"] = data["default"].apply(lambda x: 0 if x == "no" else 1)
data.head()

Unnamed: 0,age,balance,duration,day,default
0,30,1787,79,19,0
1,33,4789,220,11,0
2,35,1350,185,16,0
3,30,1476,199,3,0
4,59,0,226,5,0


In [4]:
# Splitting input output

target_col = "default"
X = data.drop(columns=target_col, axis=1)
y = data[target_col]


print("X shape ", X.shape)
print("y shape ", y.shape)

X shape  (4521, 4)
y shape  (4521,)


In [5]:
y.value_counts()

default
0    4445
1      76
Name: count, dtype: int64

In [6]:
# Splitting train & test
# training -> latih model
# test -> generalisasi performa model
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("X train shape: ", X_train.shape)
print("X test shape: ", X_test.shape)
print("y train shape: ", y_train.shape)
print("y test shape: ", y_test.shape)

X train shape:  (3616, 4)
X test shape:  (905, 4)
y train shape:  (3616,)
y test shape:  (905,)


In [7]:
y_train.value_counts(normalize=True)

default
0    0.983131
1    0.016869
Name: proportion, dtype: float64

In [8]:
y.value_counts(normalize=True)

default
0    0.98319
1    0.01681
Name: proportion, dtype: float64

In [9]:
def _generate_ensemble_estimator(base_estimator, n_estimators):
    estimators = [copy.deepcopy(base_estimator) for i in range(n_estimators)]

    return estimators

In [10]:
def _generate_sample_indices(seed, n_estimators, n_population, n_samples, bootstrap=True):
    # Get the seed
    np.random.seed(seed)

    # Get the bagging indices
    sample_indices = np.random.choice(n_population,
                                      size = (n_estimators, n_samples),
                                      replace = bootstrap)
    return sample_indices

In [11]:
def _generate_feature_indices(seed, n_estimators, n_population, n_features, bootstrap=False):
    np.random.seed(seed)

    # Get the bagging indices
    feature_indices = np.empty((n_estimators, n_features), dtype="int")
    for i in range(n_estimators):
        feature_indices[i] = np.random.choice(n_population, 
                                              n_features, 
                                              replace=bootstrap)
        feature_indices[i].sort()

    return feature_indices

In [12]:
def _predict_ensemble(estimators, feature_indices, X):
    X = np.array(X).copy()
    n_samples = X.shape[0]

    # Prepare the ensemble model
    n_estimators = len(estimators)

    # Create the output
    y_preds = np.empty((n_estimators, n_samples))

    # Fill the output with the given ensemble model
    for i, estimator in enumerate(estimators):
        # Extract the estimators
        X_ = X[:, feature_indices[i]]

        # Get the predictions
        y_preds[i] = estimator.predict(X_)

    return y_preds

In [13]:
class BaseEnsemble:
    def __init__(
        self,
        estimator,
        n_estimators,
        max_features=None,
        random_state=None
    ):
        self.estimator = estimator
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.random_state = random_state

    def fit(self, X, y):
        # Sesuaikan format input
        X = np.array(X).copy()
        y = np.array(y).copy()

        self.classes = list(sorted(set(y)))
        
        # Ekstrak informasi data
        self.n_samples, self.n_features = X.shape

        # Buat estimator ensemble
        self.estimators_ = _generate_ensemble_estimator(base_estimator=self.estimator,
                                                        n_estimators=self.n_estimators)

        # Buat bootstrap sample
        sample_indices = _generate_sample_indices(seed=self.random_state,
                                                   n_estimators=self.n_estimators,
                                                   n_population=self.n_samples,
                                                   n_samples=self.n_samples,
                                                   bootstrap=True)

        print(sample_indices)
        if isinstance(self.max_features, int):
            max_features = self.max_features
        elif self.max_features == "sqrt":
            max_features = int(np.sqrt(self.n_features))
        elif self.max_features == "log2":
            max_features = int(np.log2(self.n_features))
        else:
            max_features = self.n_features
        # Generate random feature
        self.feature_indices = _generate_feature_indices(seed=self.random_state,
                                                         n_estimators=self.n_estimators,
                                                         n_population=self.n_features,
                                                         n_features=max_features,
                                                         bootstrap=False)
        # Fit the model
        for b in range(self.n_estimators):
            # Get the bootstrap features
            X_bootstrap = X[:, self.feature_indices[b]]

            # Get the bootstrap samples
            X_bootstrap = X_bootstrap[sample_indices[b], :]
            y_bootstrap = y[sample_indices[b]]

            # fitting
            estimator = self.estimators_[b]
            estimator.fit(X_bootstrap, y_bootstrap)

In [14]:
clf = DecisionTreeClassifier()

In [15]:
ensemble_clf = BaseEnsemble(estimator=clf,
                            n_estimators=10,
                            random_state=123)

In [16]:
ensemble_clf.fit(X_train, y_train)

[[3582 3437 3454 ...  692 2348  347]
 [2366 2396 1406 ... 1662 2766 1045]
 [1161 2666 1583 ... 2645 1850 1150]
 ...
 [1633 3404  586 ... 1379 2201 2347]
 [2490  305 3521 ... 2044 1104 1998]
 [1375 1005 1404 ... 1504 3504 1786]]


In [17]:
ensemble_clf.estimators_

[DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(),
 DecisionTreeClassifier()]

In [19]:
ensemble_clf.feature_indices

array([[0, 1, 2, 3],
       [0, 1, 2, 3],
       [0, 1, 2, 3],
       [0, 1, 2, 3],
       [0, 1, 2, 3],
       [0, 1, 2, 3],
       [0, 1, 2, 3],
       [0, 1, 2, 3],
       [0, 1, 2, 3],
       [0, 1, 2, 3]])