In [None]:
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from functools import partial

def calculate_error(y, y_pred, w):
    #TODO : Calculate the weighted error of a weak classifier.

     return (sum(w * (np.not_equal(y, y_pred)).astype(int)))/sum(w)

def calculate_alpha(error):
    #TODO : Calculate the weight of a weak classifier.
    return np.log((1-error)/error)/2

def update_weights(w, alpha, y, y_pred):
    #TODO : Update weights after a boosting iteration.
    return w*np.exp(alpha*(np.not_equal(y,y_pred)).astype(int))


def classify(X, y, maxitercnt=10000):
    rows, cols = X.shape
    v = np.zeros(cols + 1)  # Initialize weights (including bias)
    w = np.zeros(cols + 1)  # Best weights found
    best_accuracy = 0
    ones_column = np.ones((X.shape[0], 1))
    new_X = np.hstack((ones_column, X))  # Add bias term

    np.random.seed(42)

    for itercnt in range(maxitercnt):
        random_index = np.random.choice(rows)
        xj, yj = new_X[random_index], y[random_index]
        yhat = np.sign(np.dot(v, xj))

        # Update weights if there's a misclassification
        if yhat != yj:
            v += yj * xj

        # Evaluate current weight vector on the full dataset
        predictions = np.sign(np.dot(new_X, v))
        accuracy = np.mean(predictions == y)

        # If this weight vector performs better, save it
        if accuracy > best_accuracy:
            w = v.copy()
            best_accuracy = accuracy

    return w


def predict(X, weights):
    arr = np.dot(X, weights[1:]) + weights[0]
    return np.sign(arr)


class AdaBoost:
    
    def __init__(self):
        self.alphas = []
        self.G_M = []
        self.training_errors = []

    def fit(self, X, y, M = 100):
        
        self.alphas = [] 
        self.training_errors = []
        self.M = M

        for m in range(0, M):
            
            # Set weights
            if m == 0:
                #TODO
                w=np.ones(len(y))*1/len(y)

                 
            else:
                #TODO
                w=update_weights(w,alpha_m,y,y_pred)
               
            
            # 1. Fit weak classifier and predict labels(using predict method) and Save it to list of weak classifiers.
            #TODO
            #simple tree with max depth 1
            G_m=DecisionTreeClassifier(max_depth=1)
            #fit
            G_m.fit(X,y,w)
            
            y_pred=G_m.predict(X)
            #add weak learner to the list of weak learners
            self.G_M.append(G_m)


            # 2. Calculate error of this weak classifier and save it to list of trainingterrors.
            #TODO
            
            error_m=calculate_error(y,y_pred,w)
            self.training_errors.append(error_m)

            # 3. Calculate alpha of this weak classifier and save it to list of alphas.
            #TODO
            alpha_m=calculate_alpha(error_m)
            self.alphas.append(alpha_m)

    def predict(self, X):

        weak_preds = pd.DataFrame(index = range(len(X)), columns = range(self.M)) 

        for m in range(self.M):
            y_pred_m = self.G_M[m].predict(X) * self.alphas[m]
            weak_preds[weak_preds.columns[m]] = y_pred_m

        y_pred = (1 * np.sign(weak_preds.T.sum())).astype(int)

        return y_pred

In [8]:
banana = np.loadtxt('banana_data.csv', delimiter=',')
X_banana, y_banana = banana[:, 1:], banana[:, 0]
# X_banana_train, y_banana_train = X_banana[:400], y_banana[:400]
# X_banana_test, y_banana_test = X_banana[400:], y_banana[400:]
X_banana_train, X_banana_test, y_banana_train, y_banana_test = train_test_split(X_banana, y_banana, test_size=4900, train_size=400)

In [41]:
splice = np.loadtxt('splice_data.csv', delimiter=',')
X_splice, y_splice = splice[:, 1:], splice[:, 0]
# X_splice_train, X_splice_test, y_splice_train, y_splice_test = train_test_split(X_splice, y_splice, test_size=1991, train_size=1000)

X_splice, y_splice = splice[:, 1:], splice[:, 0]
X_splice_train, y_splice_train = X_splice[:1000], y_splice[:1000]
X_splice_test, y_splice_test = X_splice[1000:], y_splice[1000:]

In [22]:
#TODO : Fit model
model=AdaBoost()
model.fit(X_banana_train, y_banana_train, 1000)

#TODO : Predict on test data
y_pred=model.predict(X_banana_test)

#TODO : Print evaluation metrics (Accurcy, Precission, Recall and f-score)
import sklearn.metrics as metrics

accuracy=metrics.accuracy_score(y_banana_test, y_pred)
precission=metrics.precision_score(y_banana_test,y_pred)
recall=metrics.recall_score(y_banana_test,y_pred)
f1=metrics.f1_score(y_banana_test,y_pred)
print(accuracy)
print(precission)
print(recall)
print(f1)


0.7228571428571429
0.7204870301746956
0.6211775445002282
0.667156862745098


In [42]:
model=AdaBoost()
model.fit(X_splice_train, y_splice_train, 1000)

#TODO : Predict on test data
y_pred=model.predict(X_splice_test)

#TODO : Print evaluation metrics (Accurcy, Precission, Recall and f-score)
import sklearn.metrics as metrics

accuracy=metrics.accuracy_score(y_splice_test, y_pred)
precission=metrics.precision_score(y_splice_test,y_pred)
recall=metrics.recall_score(y_splice_test,y_pred)
f1=metrics.f1_score(y_splice_test,y_pred)
print(accuracy)
print(precission)
print(recall)
print(f1)

0.9271722752385736
0.9404630650496141
0.9036016949152542
0.9216639654240951


In [49]:
mine = DecisionTreeClassifier(max_depth=1)
w=np.ones(len(y_splice_train))*1/len(y_splice_train)
mine.fit(X_splice_train, y_splice_train, w)
y_pred = mine.predict(X_splice_test)
metrics.accuracy_score(y_splice_test, y_pred)

0.7825213460572576

In [50]:
mine = DecisionTreeClassifier(max_depth=1)
w=np.ones(len(y_banana_train))*1/len(y_banana)
mine.fit(X_banana_train, y_banana_train, w)
y_pred = mine.predict(X_banana_test)
metrics.accuracy_score(y_banana_test, y_pred)

0.5826530612244898