In [190]:
import numpy as np
import pandas as pd

import sklearn
from sklearn.base import BaseEstimator
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.utils.validation import check_array
from sklearn.metrics import accuracy_score, confusion_matrix

In [267]:
class CobraClassifier(BaseEstimator):
    def __init__(self, estimators = [DecisionTreeClassifier(max_depth = 3)]*4, random_state = 0):
        self.random_state = random_state
        self.estimators = estimators
        
        self.n_machines = len(estimators)
        
    def fit(self, X, y, sample_weights = None, split = 0.5):
        l = len(X)
        k = split*l if split<=1 else split
        k = int(k)
        
        self.X_k, self.y_k = X[:k], y[:k]
        self.X_l, self.y_l = X[k:l].reset_index(drop=True), y[k:l].reset_index(drop=True)
        self.sample_weights = sample_weights
        
        self.train()
        self.fit_cobra()
        
        return self
    
    def predict(self, X):
        X = np.array(X)
        M = len(self.estimators)
        results = np.zeros(len(X))
        avg_points = 0
        index = 0
        
        for sample in X:
            results[index] = self.pred(sample.reshape(1, -1))
            index+=1
        
        return results
        
    def train(self):
        for estimator in self.estimators:
            estimator.fit(self.X_k, self.y_k, sample_weight = self.sample_weights)
            
        return self
    
    def fit_cobra(self):
        self.machine_predictions = [None]*self.n_machines
        for i in range(self.n_machines):
            self.machine_predictions[i] = self.estimators[i].predict(self.X_l)
            
        return self

    def pred(self, X):
        n_machines = self.n_machines
        M = n_machines
        
        select = [set()]*n_machines
        for i in range(n_machines):
            label = self.estimators[i].predict(X)
            for point in range(len(self.X_l)):
                if self.machine_predictions[i][point] == label:
                    select[i].add(point)
                    
        points = []
        for sample in range(len(self.X_l)):
            row_check = 0
            for i in range(n_machines):
                if sample in select[i]:
                    row_check+=1

            if row_check == M:
                points.append(sample)


        if len(points) == 0:
            print('No Points found')
            return 2


        classes = {}
        for label in np.unique(self.y_l):
            classes[label] = 0

        for point in points:
            classes[self.y_l[point]] += 1

        result = int(max(classes, key = classes.get))
        return result

In [142]:
class AdaBoost:
    def __init__(self, weak_learner = DecisionTreeClassifier(max_depth = 3)):
        self.alphas = []
        self.weaks = []
        self.M = None
        self.training_errors = []
        self.prediction_errors = []
        self.weak_learner = weak_learner
    
    def fit(self, X, y, M = 100):
        self.alphas = []
        self.training_errors = []
        
        for m in range(M):
            if m==0:
                w_i = np.ones(len(y)) * 1/len(y)
            else:
                w_i = self.update_weights(w_i, alpha_m, y, y_pred)
                
            
            weak = sklearn.base.clone(self.weak_learner)
            weak.fit(X, y, sample_weight = w_i)
            
            y_pred = weak.predict(X)
            self.weak.predict(X)
            
            self.weaks.append(weak)
            
            error_m = self.compute_error(y, y_pred, w_i)
            self.training_errors.append(error_m)
            
            alpha_m = self.alpha(error_m)
            self.alphas.append(alpha_m)
            
    def predict(self, X):
#         X = np.array(X)
        weak_preds = pd.DataFrame(index = len(X), columns = range(self.M))
        
        for m in range(self.M):
            y_pred_m = self.weaks[m].predict(X) * self.alphas[m]
            weak_preds.iloc[:, m] = y_pred_m
            
        y_pred = (1*np.sign(weak_preds.T.sum())).astype(int)
        return y_pred
            
    def compute_error(self, y, y_pred, w_i):
        return (sum(w_i * (np.not_equal(y, y_pred)).astype(int)))/sum(w_i)

    def alpha(self, error):
        return np.log((1 - error) / error)

    def update_weights(self, w_i, alpha, y, y_pred):
        return w_i * np.exp(alpha * (np.not_equal(y, y_pred)).astype(int))


In [268]:
df = pd.read_csv('haberman.csv', header=None)

In [269]:
X = df.drop(3, axis=1)
Y = df.iloc[:, 3]
Y = Y.apply(lambda x: x-1)

X_train, Y_train = X[:235], Y[:235] 
X_test, Y_test = X[235:].reset_index(drop=True), Y[235:].reset_index(drop=True)

In [271]:
cc = CobraClassifier()

In [272]:
cc.fit(X_train, Y_train)

CobraClassifier()

In [273]:
Y_preds = cc.predict(X_test)

In [274]:
accuracy_score(Y_preds, Y_test)

0.7323943661971831

In [315]:
dct1 = DecisionTreeClassifier(max_depth=110)
dct2 = DecisionTreeClassifier(max_depth=20)
dct3 = DecisionTreeClassifier(max_depth=20)

cc = CobraClassifier(estimators = [dct1, dct2, dct3])

In [316]:
cc.fit(X_train, Y_train)
Y_preds = cc.predict(X_test)
accuracy_score(Y_preds, Y_test)

0.7323943661971831

In [305]:
def simulate(model):
    model.fit(X_train[:117], Y_train[:117])
    Y_preds = model.predict(X_test)
    print('Confusion Matrix: ') 
    print(confusion_matrix(Y_preds, Y_test))
    print('Accuracy Score: ' + str(accuracy_score(Y_preds, Y_test)))

In [310]:
simulate(DecisionTreeClassifier(max_depth=10))

Confusion Matrix: 
[[10  8]
 [42 11]]
Accuracy Score: 0.29577464788732394
