# Ensemble Methods
Descision Trees are high in variance, a small change in data can lead to overall change in structure of the tree.  
So to tackle this problems various ensemble Methods are used.  
  
We will discuss three commonly used ensemble Methods:-  
1. Bagging (Bootstrap Aggregator)
2. Boosting
3. Stacking

## Bagging
In bagging we try to train multiple descision trees, each tree will be trained on unique dataset, though the dataset is the same as original dataset but some values will be replaced by duplicate values, hence creating a different view for each tree or we can say each tree will be biased.  
Then during prediction each tree will predict, and the major vote will be chosen(in case of classification) or average will be taken (in case of regression).  
Random Forest is an example of Bagging.

<i><u>Bagging is not entitled to Decision Trees, Descision trees are high in variance so Bagging helps a lot, so we used bagging with Descision Trees here.</i></u>

In [45]:
import numpy as np

class BaggingClassifier:
    def __init__(self,base_classifier_function, n_estimator=10):
        self.base_classifier = base_classifier_function
        self.n_estimator = n_estimator
        self.models = []
    
    def _bagging_samples(self, X, Y):
        n_samples = X.shape[0]
        indices = np.random.choice(n_samples, size=n_samples, replace=True)
        return X[indices], Y[indices]
    
    def fit(self, X, Y):
        self.models = []
        for _ in range(self.n_estimator):
            model = self.base_classifier()
            new_X, new_Y = self._bagging_samples(X,Y)
            model.fit(new_X, new_Y)
            self.models.append(model)
    
    def predict(self, X):
        preds = np.array([model.predict(X) for model in self.models])
        final_preds = []
        for each_row in preds.T:
            values, count = np.unique(each_row, return_counts=True)
            final_preds.append(values[np.argmax(count)])
        return np.array(final_preds)


In [46]:
import pandas as pd
df = pd.read_csv("D:/Ml/part1/data/diabetes.csv")
X = df.drop(columns=["Outcome"]).values
Y = df['Outcome'].values
type(X)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

scaler = StandardScaler()
X = scaler.fit_transform(X)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,test_size=0.2, random_state=42)

In [47]:
from sklearn.tree import DecisionTreeClassifier
def new_tree():
    return DecisionTreeClassifier()
model = BaggingClassifier(new_tree)
model.fit(X_train, Y_train)

In [48]:
#calculating predicted values
Y_pred = model.predict(X_test)
Y_pred

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0])

In [52]:
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, Y_pred)

0.7857142857142857

In [53]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_test, Y_pred)

array([[87, 12],
       [21, 34]])

## Boosting
In Boosting the idea is to train weak learners( weak learners are one depth decision tree), these weak learners then together predict the output (unlike bagging where each indepently predicts), the impact of each weak learner is decided by its weights, more the weight greater the contribution.  
Weak learners are called decision stumps because it’s a “tiny tree with just a stump” (no branches beyond the first split).

Each weak learner tries to learn from previous learner, focusing more on what it was missing.


In [4]:
import numpy as np

class Desicionstump:
    def __init__(self):
        self.min_error = float('inf')
        self.p = None
        self.f_id = None
        self.threshold = None
        self.alpha = None

    def fit(self,X,Y, weights):
        rows, columns = X.shape
        for feature_id in range(columns):
            sort_ids = np.argsort(X[:,feature_id])
            sortedX = X[sort_ids, feature_id]
            sortedY = Y[sort_ids]
            for i in range(1,rows):
                for polarity in [1,-1]:
                    current_threshold = (sortedX[i]+sortedX[i-1])/2
                    if polarity == 1:
                        pred = [1 if each<current_threshold else -1 for each in sortedX ]
                    else:
                        pred = [-1 if each<current_threshold else 1 for each in sortedX ]
                    weighted_error = np.sum(weights*(pred!=sortedY))
                    if weighted_error<self.min_error:
                        self.min_error = weighted_error
                        self.f_id = feature_id
                        self.threshold = current_threshold
                        self.p = polarity
        
        

    def predict(self, X):
        Y_pred = self.p * np.array([1 if each<self.threshold else -1 for each in X[:,self.f_id]])
        return Y_pred

In [19]:
class Adaboost:
    def __init__(self, n_estimators: int = 100):
        self.n_estimators = n_estimators
        self.learners = []

    def fit(self, X, Y):
        Y = np.where(Y<=0, -1, 1)
        rows, columns = X.shape
        self.learners = []
        weights = np.ones(rows)/rows
        for _ in range(self.n_estimators):
            stump = Desicionstump()
            stump.fit(X,Y,weights=weights)
            stump.alpha = 0.5*(np.log((1-stump.min_error)/(stump.min_error + 1e-10)))
            final_pred = stump.predict(X)
            weights = weights*np.exp(-1*stump.alpha*final_pred*Y)
            weights /= np.sum(weights) # Normalized weights
            self.learners.append(stump)

    def predict(self, X):
        predictions = []
        for learners in self.learners:
            stump_pred = learners.alpha*learners.predict(X)
            predictions.append(stump_pred)
        final_pred = np.sign(np.sum(predictions, axis=0))
        final_pred = np.where(final_pred<=0, 0, 1)
        return final_pred
            

In [20]:
import pandas as pd
df = pd.read_csv("D:/Ml/part1/data/diabetes.csv")
X = df.drop(columns=["Outcome"]).values
Y = df['Outcome'].values
type(X)

numpy.ndarray

In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

scaler = StandardScaler()
X = scaler.fit_transform(X)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,test_size=0.2, random_state=42)

In [34]:
model = Adaboost(n_estimators=10)
model.fit(X_train, Y_train)

In [35]:
#calculating predicted values
Y_pred = model.predict(X_test)
Y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0])

In [36]:
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, Y_pred)

0.7402597402597403

## Stacking
Idea is simple multiple different models predicting and a boss model is watching their predictions and learning from their prediction and actual answer.

We use various folds to train and test each model so that models predict on unseen data.

In [9]:
import numpy as np
class StackingClassifier:
    def __init__(self, base_models: list, meta_model, folds: int):
        self.models = base_models
        self.folds = folds
        self.meta_model = meta_model

    def fit(self, X, Y):
        n_samples = X.shape[0]
        batch_size = n_samples//self.folds
        X_final = np.zeros((n_samples, len(self.models)))
        for i,model in enumerate(self.models):
            cummaltive_pred = np.array([])
            for fold in range(self.folds):
                start = fold*batch_size
                end = (fold+1)*batch_size
                if fold == self.folds-1:
                    X_test = X[start:]
                else:
                    X_test = X[start:end]
                X_train = np.concatenate([X[:start],X[end:]], axis=0)
                Y_train = np.concatenate([Y[:start],Y[end:]], axis=0)
                model.fit(X_train, Y_train)
                pred = model.predict(X_test)
                cummaltive_pred = np.concatenate([cummaltive_pred, pred])

            model.fit(X,Y)
            X_final[:,i] = cummaltive_pred
        self.meta_model.fit(X_final,Y)
        


    def predict(self, X):
        n_sample = X.shape[0]
        X_final = np.zeros((n_sample,len(self.models)))
        for i,model in enumerate(self.models):
            pred = model.predict(X)
            X_final[:,i] = pred
        final_pred = self.meta_model.predict(X_final)
        return final_pred

In [3]:
import pandas as pd
df = pd.read_csv("D:/Ml/part1/data/diabetes.csv")
X = df.drop(columns=["Outcome"]).values
Y = df['Outcome'].values
type(X)

numpy.ndarray

In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

scaler = StandardScaler()
X = scaler.fit_transform(X)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,test_size=0.2, random_state=42)

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
models = [LogisticRegression(), SVC(), AdaBoostClassifier(), RandomForestClassifier()]
model = StackingClassifier(base_models=models, meta_model=LogisticRegression(), folds=4)
model.fit(X_train, Y_train)

In [6]:
#calculating predicted values
Y_pred = model.predict(X_test)
Y_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0])

In [7]:
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, Y_pred)

0.7467532467532467

In [8]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_test, Y_pred)

array([[80, 19],
       [20, 35]])