# AdaBoost Algorithm From Scratch

## Importing Libraries

In [7]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import AdaBoostClassifier

## The AdaBoost Classifier

In [21]:
class MyAdaBoostClassifier:
    """Multiclass AdaBoost built from one-vs-rest decision stumps.

    Each boosting round trains one depth-1 ``DecisionTreeClassifier`` per
    class on a binary "this class vs. the rest" target encoded as +1 / -1.
    All stumps share one sample-weight vector, which is re-weighted and
    re-normalized after every stump.

    Parameters
    ----------
    n_estimators : int
        Number of boosting rounds. Every round fits one stump per class.

    Attributes
    ----------
    alphas : list[list[float]]
        ``alphas[r][k]`` is the vote weight of the stump for class ``k`` in
        round ``r``.
    models : list[list[DecisionTreeClassifier]]
        ``models[r][k]`` is the fitted stump for class ``k`` in round ``r``.
    n_classes : int or None
        Number of distinct labels seen by ``fit`` (``None`` before fitting).
    classes_ : numpy.ndarray or None
        Sorted distinct labels seen by ``fit``; predictions are drawn from it.
    """

    def __init__(self, n_estimators):
        self.n_estimators = n_estimators
        self.alphas = []
        self.models = []
        self.n_classes = None
        self.classes_ = None

    def fit(self, X, y):
        """Fit the ensemble on features ``X`` and labels ``y``.

        Parameters
        ----------
        X : array-like with a ``shape`` attribute, (n_samples, n_features)
            Training features (NumPy array or pandas DataFrame).
        y : array-like of shape (n_samples,)
            Class labels. Any label values are accepted, not only 0..K-1.

        Raises
        ------
        ValueError
            If ``y`` contains fewer than two distinct classes.
        """
        y = np.asarray(y)
        self.classes_ = np.unique(y)
        self.n_classes = len(self.classes_)
        if self.n_classes < 2:
            # log(n_classes - 1) below would be log(0) = -inf.
            raise ValueError(
                "MyAdaBoostClassifier needs at least 2 classes; got %d" % self.n_classes
            )

        # Start from a clean state so that calling fit() again does not stack
        # new rounds on top of the previous ones.
        self.alphas = []
        self.models = []

        n_samples = X.shape[0]
        weights = np.full(n_samples, 1 / n_samples)

        for _ in range(self.n_estimators):
            class_models = []
            class_alphas = []

            for class_label in self.classes_:
                # Binary one-vs-rest target for the current class
                binary_labels = np.where(y == class_label, 1, -1)

                # Train a weak classifier on the current sample weights
                model = DecisionTreeClassifier(max_depth=1)
                model.fit(X, binary_labels, sample_weight=weights)
                predictions = model.predict(X)

                # Weighted misclassification rate (weights sum to 1)
                weighted_error = np.sum(weights * (predictions != binary_labels))

                # Stump vote weight; the 1e-10 keeps the ratio finite when the
                # stump is perfect, and log(K - 1) is the multiclass offset.
                alpha = (
                    0.5 * np.log((1 - weighted_error) / (weighted_error + 1e-10))
                    + np.log(self.n_classes - 1)
                )
                class_alphas.append(alpha)

                # Up-weight misclassified samples, down-weight correct ones
                weights = weights * np.exp(-alpha * binary_labels * predictions)
                weights /= np.sum(weights)

                class_models.append(model)

            self.alphas.append(class_alphas)
            self.models.append(class_models)

    def predict(self, X):
        """Predict a class label for every row of ``X``.

        For each class, every round contributes the sign of
        ``alpha * stump.predict(X)`` as a vote; the class with the largest
        vote total wins (ties go to the first class in sorted order).

        Parameters
        ----------
        X : array-like with a ``shape`` attribute, (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            Predicted labels, taken from the labels seen during ``fit``.

        Raises
        ------
        ValueError
            If the model has not been fitted.
        """
        if not self.models:
            raise ValueError(
                "This MyAdaBoostClassifier instance is not fitted yet; call fit() first."
            )

        # votes[k, i] = sum over rounds of sign(alpha * prediction) for class k
        votes = np.zeros((self.n_classes, X.shape[0]))
        for class_alphas, class_models in zip(self.alphas, self.models):
            for class_index in range(self.n_classes):
                score = class_alphas[class_index] * class_models[class_index].predict(X)
                votes[class_index] += np.sign(score)

        winners = np.argmax(votes, axis=0)
        if self.classes_ is None:
            # State was populated by hand without labels: return class indices.
            return winners
        return self.classes_[winners]


### Evaluation on Iris Dataset from sklearn

In [37]:
from sklearn.datasets import load_iris

# Load the Iris dataset (feature matrix and integer class targets)
data = load_iris()
X, y = data.data, data.target

# Split the data into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10, shuffle=True)

# Create a MyAdaBoostClassifier with 10 boosting rounds (n_estimators=10)
# NOTE(review): n_classes is a module-level name; remove it only after
# confirming that no cell or class reads it.
n_classes = len(np.unique(y))
adaboost = MyAdaBoostClassifier(n_estimators=10)

# Fit the model on the training data
adaboost.fit(X_train, y_train)

# Make predictions on the test data
y_pred = adaboost.predict(X_test)

# Calculate the accuracy of the model on the held-out test set
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9666666666666667


### Evaluation on Iris Dataset (csv file)

In [35]:
# Load the Iris CSV and drop the 'Id' column before selecting features
iris = pd.read_csv("iris.csv")
iris = iris.drop('Id', axis=1)

# Features: the first four remaining columns; target: the 'Species' column
X = iris.iloc[:, 0:4]
y = iris['Species']
# Encode each distinct species value as an integer index (order given by np.unique)
labels = {item: index for index, item in enumerate(np.unique(y))}
y = y.map(labels)

# Hold out 20% of the rows for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10, test_size=0.2, shuffle=True)

# Train the from-scratch classifier with 5 boosting rounds and predict the test set
model = MyAdaBoostClassifier(n_estimators=5)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.9666666666666667


## Using AdaBoost from library

In [36]:
# Reload the Iris CSV and drop the 'Id' column before selecting features
iris = pd.read_csv("iris.csv")  
iris = iris.drop('Id', axis=1)

# Features: the first four remaining columns; target: the 'Species' column
X = iris.iloc[:, 0:4]
y = iris['Species']
# Encode each distinct species value as an integer index (order given by np.unique)
labels = {item: index for index, item in enumerate(np.unique(y))}
y = y.map(labels)

# Same split settings as the from-scratch evaluation (20% test, random_state=10)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10, test_size=0.2, shuffle=True)

# Reference result: scikit-learn's AdaBoostClassifier with 5 estimators
clf = AdaBoostClassifier(n_estimators=5)
clf.fit(X_train, y_train)
print("Accuracy:", clf.score(X_test, y_test))

Accuracy: 1.0


## AdaBoost Regressor

In [8]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor

class AdaBoostRegressor:
    """AdaBoost.R2-style regressor built from depth-1 regression trees.

    Each round fits a ``DecisionTreeRegressor(max_depth=1)`` on the current
    sample weights, measures its linear loss normalized to [0, 1], and
    re-weights the samples so that poorly predicted ones matter more in the
    next round. Predictions are the weighted median of the estimators'
    outputs, so they stay on the scale of the training targets.

    Parameters
    ----------
    n_estimators : int, default=50
        Maximum number of boosting rounds. Boosting stops early when a stump
        fits perfectly or its average loss reaches 0.5.
    learning_rate : float, default=1.0
        Shrinks each estimator's weight and the strength of re-weighting.

    Attributes
    ----------
    estimators : list
        Fitted base regressors, in boosting order.
    estimator_weights : list[float]
        Weight of each estimator in the weighted median.
    """

    def __init__(self, n_estimators=50, learning_rate=1.0):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.estimators = []
        self.estimator_weights = []

    def fit(self, X, y):
        """Fit the boosted ensemble.

        Parameters
        ----------
        X : array-like with a ``shape`` attribute, (n_samples, n_features)
        y : array-like of shape (n_samples,)
            Real-valued targets.
        """
        y = np.asarray(y, dtype=float)
        n_samples = X.shape[0]
        sample_weights = np.full(n_samples, 1.0 / n_samples)

        # Start from a clean state so repeated fit() calls do not accumulate.
        self.estimators = []
        self.estimator_weights = []

        for _ in range(self.n_estimators):
            # Fit a new stump on the current sample weights
            base_regressor = DecisionTreeRegressor(max_depth=1)
            base_regressor.fit(X, y, sample_weight=sample_weights)

            abs_error = np.abs(y - base_regressor.predict(X))
            max_error = abs_error.max()
            if max_error == 0:
                # Perfect fit: nothing left to boost.
                self.estimators.append(base_regressor)
                self.estimator_weights.append(1.0)
                break

            # Linear loss scaled into [0, 1] so the weighted average is a
            # valid error rate whatever the scale of y.
            loss = abs_error / max_error
            avg_loss = np.sum(sample_weights * loss)

            if avg_loss <= 0:
                # All weighted samples are predicted exactly.
                self.estimators.append(base_regressor)
                self.estimator_weights.append(1.0)
                break
            if avg_loss >= 0.5:
                # Too weak to help. Discard it, unless it is the only
                # estimator, so that predict() still has something to use.
                if not self.estimators:
                    self.estimators.append(base_regressor)
                    self.estimator_weights.append(1.0)
                break

            # beta < 1; a smaller beta means a more confident estimator
            beta = avg_loss / (1.0 - avg_loss)
            estimator_weight = self.learning_rate * np.log(1.0 / beta)

            # Well-predicted samples (loss near 0) are shrunk the most
            sample_weights = sample_weights * np.power(
                beta, (1.0 - loss) * self.learning_rate
            )
            sample_weights /= np.sum(sample_weights)

            self.estimators.append(base_regressor)
            self.estimator_weights.append(estimator_weight)

    def predict(self, X):
        """Predict targets as the weighted median of the estimators' outputs.

        Parameters
        ----------
        X : array-like with a ``shape`` attribute, (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            Predicted values; all zeros if no estimator has been fitted.
        """
        n_samples = X.shape[0]
        if not self.estimators:
            return np.zeros(n_samples)

        # Rows are samples, columns are estimators
        predictions = np.array([est.predict(X) for est in self.estimators]).T
        weights = np.asarray(self.estimator_weights, dtype=float)

        # Sort each row's predictions and accumulate the matching weights;
        # the weighted median is the first prediction whose cumulative weight
        # reaches half of the total.
        order = np.argsort(predictions, axis=1)
        cumulative = np.cumsum(weights[order], axis=1)
        reached_half = cumulative >= 0.5 * cumulative[:, -1][:, np.newaxis]
        median_pos = reached_half.argmax(axis=1)

        rows = np.arange(n_samples)
        return predictions[rows, order[rows, median_pos]]

    
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Generate some synthetic regression data (100 samples, 1 feature, fixed seed)
X, y = make_regression(n_samples=100, n_features=1, noise=0.2, random_state=42)

# Split the data into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the from-scratch AdaBoost Regressor defined above
# (up to 100 boosting rounds, learning rate 0.1)
adaboost_regressor = AdaBoostRegressor(n_estimators=100, learning_rate=0.1)
adaboost_regressor.fit(X_train, y_train)

# Make predictions on the test data
y_pred = adaboost_regressor.predict(X_test)

# Calculate the mean squared error on the held-out test set
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")