# AdaBoost Algorithm From Scratch

## Importing Libraries

In [16]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import AdaBoostClassifier
from pmlb import fetch_data

## The AdaBoost Classifier

In [2]:
class MyAdaBoostClassifier:
    """One-vs-rest AdaBoost-style ensemble of weak learners.

    Each boosting round trains one binary weak learner per class, where the
    target is +1 for "this class" and -1 for "any other class". A single
    sample-weight vector is shared by all of these binary problems and is
    re-weighted after every weak learner.

    At prediction time each weak learner casts a +1/-1 vote for its class
    (the sign of ``alpha * prediction``), and the class with the most votes
    wins. Only the sign of alpha affects the vote, not its magnitude.

    Parameters
    ----------
    n_estimators : int
        Number of boosting rounds. Each round trains ``n_classes`` learners.
    estimator_factory : callable, optional
        Zero-argument callable returning a fresh, unfitted weak learner that
        exposes ``fit(X, y, sample_weight=...)`` and ``predict(X)`` returning
        +1/-1. Defaults to ``DecisionTreeClassifier(max_depth=1)``.

    Attributes
    ----------
    alphas : list of list of float
        ``alphas[round][class_index]`` is the weight of that weak learner.
    models : list of list
        ``models[round][class_index]`` is the fitted weak learner.
    n_classes : int or None
        Number of distinct labels seen by ``fit`` (``None`` before fitting).
    classes_ : numpy.ndarray or None
        Sorted distinct labels seen by ``fit`` (``None`` before fitting).
    """

    def __init__(self, n_estimators, estimator_factory=None):
        self.n_estimators = n_estimators
        self.estimator_factory = estimator_factory
        self.alphas = []
        self.models = []
        self.n_classes = None
        self.classes_ = None

    def _new_estimator(self):
        """Return a fresh, unfitted weak learner."""
        if self.estimator_factory is not None:
            return self.estimator_factory()
        return DecisionTreeClassifier(max_depth=1)

    def fit(self, X, y):
        """Train the ensemble on features ``X`` and labels ``y``.

        Any previously fitted state is discarded, so calling ``fit`` again
        retrains from scratch. Labels may be arbitrary values (not only
        ``0..K-1``); they are discovered with ``np.unique``.

        Returns
        -------
        MyAdaBoostClassifier
            ``self``, to allow call chaining.

        Raises
        ------
        ValueError
            If ``X`` has no rows or ``X`` and ``y`` differ in length.
        """
        y = np.asarray(y)
        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("Cannot fit on an empty dataset.")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X and y have inconsistent lengths: {n_samples} != {y.shape[0]}"
            )

        self.classes_ = np.unique(y)
        self.n_classes = len(self.classes_)
        # Start from a clean slate so a refit does not stack onto old rounds.
        self.alphas = []
        self.models = []

        # Multiclass offset added to every alpha. The max() keeps the
        # single-class case finite (log(0) otherwise); for K >= 2 it is
        # exactly log(K - 1).
        class_offset = np.log(max(self.n_classes - 1, 1))
        weights = np.full(n_samples, 1 / n_samples)

        for _ in range(self.n_estimators):
            class_models = []
            class_alphas = []

            for class_label in self.classes_:
                # Binary one-vs-rest target for the current class.
                binary_labels = np.where(y == class_label, 1, -1)

                model = self._new_estimator()
                model.fit(X, binary_labels, sample_weight=weights)
                predictions = model.predict(X)

                weighted_error = np.sum(weights * (predictions != binary_labels))

                # The numerator is floored so an error of 1 (or marginally
                # above 1 from rounding) yields a large negative alpha rather
                # than -inf/NaN. For errors below 1 - 1e-10 the value is
                # identical to the unguarded formula.
                odds = max(1 - weighted_error, 1e-10) / (weighted_error + 1e-10)
                alpha = 0.5 * np.log(odds) + class_offset

                # Up-weight misclassified samples, then renormalize.
                weights = weights * np.exp(-alpha * binary_labels * predictions)
                weights /= np.sum(weights)

                class_alphas.append(alpha)
                class_models.append(model)

            self.alphas.append(class_alphas)
            self.models.append(class_models)

        return self

    def predict(self, X):
        """Predict a class label for every row of ``X``.

        Returns
        -------
        numpy.ndarray
            Predicted labels, drawn from the labels seen during ``fit``.

        Raises
        ------
        ValueError
            If the model has not been fitted.
        """
        if self.n_classes is None:
            raise ValueError("This MyAdaBoostClassifier is not fitted yet; call fit() first.")

        # votes[k, i] = net number of rounds voting "sample i is class k".
        # Iterating over the stored rounds (not n_estimators) keeps predict
        # consistent with whatever fit actually produced.
        votes = np.zeros((self.n_classes, X.shape[0]))
        for class_alphas, class_models in zip(self.alphas, self.models):
            for class_index, (alpha, model) in enumerate(zip(class_alphas, class_models)):
                votes[class_index] += np.sign(alpha * model.predict(X))

        winners = np.argmax(votes, axis=0)
        if self.classes_ is None:
            # State was populated by hand without labels: return indices.
            return winners
        return self.classes_[winners]

## Evaluating Our Model

### Iris Dataset

In [3]:
# Load the Iris table and discard its 'Id' column.
iris = pd.read_csv("iris.csv").drop('Id', axis=1)

# Features are the first four columns; the target is the 'Species' column.
X1 = iris.iloc[:, 0:4]
y1 = iris['Species']

# Encode each species name as its position among the sorted unique names.
labels = {species: code for code, species in enumerate(np.unique(y1))}
y1 = y1.map(labels)

# Hold out 20% of the rows for testing, using a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X1, y1, test_size=0.2, shuffle=True, random_state=10
)

# Train the from-scratch booster (5 rounds) and predict the held-out rows.
model = MyAdaBoostClassifier(n_estimators=5)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# Report the fraction of held-out rows predicted correctly.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.9666666666666667


### Breast Cancer Wisconsin (Diagnostic) Dataset

In [17]:
# Fetch the 'breast_cancer' dataset from PMLB as separate features and
# labels, caching the download in the current directory.
X2, y2 = fetch_data(
    'breast_cancer',
    return_X_y=True,
    local_cache_dir='./',
)

# Hold out 20% of the rows for testing, using a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X2, y2, test_size=0.2, shuffle=True, random_state=10
)

# Train the from-scratch booster (30 rounds) and predict the held-out rows.
model = MyAdaBoostClassifier(n_estimators=30)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# Report the fraction of held-out rows predicted correctly.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.7586206896551724


### AnnThyroid Dataset

In [6]:
# Returns a pandas DataFrame
X3, y3 = fetch_data('ann_thyroid', return_X_y=True, local_cache_dir='./')

conditions = [y3 == 1, y3 == 2, y3 == 3]
values = [0, 1, 2]

y3 = np.select(conditions, values, default=y3)

X_train, X_test, y_train, y_test = train_test_split(X3, y3, random_state=10, test_size=0.2, shuffle=True)

model = MyAdaBoostClassifier(n_estimators=10)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.9756944444444444


## Using AdaBoost from the scikit-learn Library

### Iris Dataset

In [8]:
# Load the Iris table and discard its 'Id' column.
iris = pd.read_csv("iris.csv").drop('Id', axis=1)

# Features are the first four columns; the target is the 'Species' column.
X1 = iris.iloc[:, 0:4]
y1 = iris['Species']

# Encode each species name as its position among the sorted unique names.
labels = {species: code for code, species in enumerate(np.unique(y1))}
y1 = y1.map(labels)

# Same 80/20 split and seed as the from-scratch Iris run.
X_train, X_test, y_train, y_test = train_test_split(
    X1, y1, test_size=0.2, shuffle=True, random_state=10
)

# Fit scikit-learn's AdaBoost (5 estimators) and print its test accuracy.
clf = AdaBoostClassifier(n_estimators=5)
clf.fit(X_train, y_train)
print("Accuracy:", clf.score(X_test, y_test))

Accuracy: 1.0


### Breast Cancer Wisconsin (Diagnostic) Dataset

In [18]:
# Fetch the 'breast_cancer' dataset from PMLB as separate features and
# labels, caching the download in the current directory.
X2, y2 = fetch_data(
    'breast_cancer',
    return_X_y=True,
    local_cache_dir='./',
)

# Same 80/20 split and seed as the from-scratch Breast Cancer run.
X_train, X_test, y_train, y_test = train_test_split(
    X2, y2, test_size=0.2, shuffle=True, random_state=10
)

# Fit scikit-learn's AdaBoost (30 estimators) and predict the held-out rows.
model = AdaBoostClassifier(n_estimators=30)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# Report the fraction of held-out rows predicted correctly.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.7586206896551724


### AnnThyroid Dataset

In [20]:
# Returns a pandas DataFrame
X3, y3 = fetch_data('ann_thyroid', return_X_y=True, local_cache_dir='./')

conditions = [y3 == 1, y3 == 2, y3 == 3]
values = [0, 1, 2]

y3 = np.select(conditions, values, default=y3)

X_train, X_test, y_train, y_test = train_test_split(X3, y3, random_state=10, test_size=0.2, shuffle=True)

model = AdaBoostClassifier(n_estimators=10)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.9888888888888889


## Conclusion

| Dataset        | MyAdaBoost | Adaboost     |
|----------------|------------|--------------|
| Iris           | 96.67%     | 100%         |
| Breast Cancer  | 75.86%     | 75.86%       |
| Ann Thyroid    | 97.57%     | 98.89%       |

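Our AdaBoost implementation achieves accuracy close to that of scikit-learn's `AdaBoostClassifier` on all three datasets: identical on Breast Cancer, and within about 1.3 and 3.3 percentage points on Ann Thyroid and Iris, respectively.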