In [1]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import csv

# Suppress warnings
warnings.filterwarnings('ignore')

from random import randrange
from sklearn import preprocessing, tree
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (mean_squared_error, r2_score, accuracy_score, precision_score,
                             recall_score, f1_score)

# Read in spam dataset into a pandas dataframe (the data file has no header row)
spamData = pd.read_csv('input/spambase.txt', header=None)

# csv.reader yields one list per line of the names file. Flatten those rows into a
# single flat list of names so the columns become a plain Index; assigning the nested
# list directly would build a MultiIndex (or fail when the file has one name per line).
with open('input/spambase_names.txt', 'r', newline='') as f:
    names = [name.strip()
             for row in csv.reader(f, delimiter=',')
             for name in row
             if name.strip()]

# Fail early with a readable message if the names file does not match the data file
if len(names) != spamData.shape[1]:
    raise ValueError('Expected {} column names in input/spambase_names.txt, found {}'
                     .format(spamData.shape[1], len(names)))
spamData.columns = names

# Create feature and target class data set ('class' is the label column)
features = spamData.drop(['class'], axis=1)
target = spamData[['class']]
spamData.head()

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,class
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [2]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the split repeatable
split_parts = train_test_split(features, target, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = split_parts

# Fit the standardizer on the training rows only, then apply that same
# transformation to both the training and the testing features
std_scaler = preprocessing.StandardScaler()
std_scaler.fit(X_train)
X_train_scaled, X_test_scaled = (std_scaler.transform(part) for part in (X_train, X_test))

In [3]:
# Re-encode the class labels from {0, 1} to {-1, 1}: every 0 becomes -1 and
# everything else becomes 1, which is the encoding the boosting code below works with
y_train_new, y_test_new = (np.where(labels == 0, -1, 1) for labels in (y_train, y_test))

# Class for AdaBoost implementation
class AdaBoostCLF:
    """AdaBoost built from depth-1 decision trees (stumps) for labels in {-1, 1}.

    Each boosting round fits a stump on the weighted training set, scores it with
    alpha = 0.5 * ln((1 - error) / error), and re-weights the samples so that
    misclassified ones count more in the next round. The final prediction is the
    sign of the alpha-weighted sum of all stump predictions.
    """

    def __init__(self, X_train, y_train, T):
        """Store the training data and start from uniform sample weights.

        Parameters:
            X_train: training features; must support len() and be accepted by
                DecisionTreeClassifier.fit.
            y_train: training labels, any array-like; flattened to a 1-D list.
                The prediction rule assumes the labels are -1 / 1.
            T: number of boosting rounds performed by each call to train().
        """
        self.x_train = X_train
        self.y_train = list(np.array(y_train).flatten())
        self.T = T
        self.N = len(self.x_train)
        self.weights = np.ones(self.N) / self.N
        self.stumps = []
        self.alphas = []

    @staticmethod
    def _stump_weight(error, eps=1e-10):
        """Return alpha = 0.5 * ln((1 - error) / error) for a weighted error rate.

        The error is clipped to [eps, 1 - eps] first. Without the clip a perfect
        stump (error == 0) yields an infinite alpha, which turns every sample
        weight into 0/0 = NaN on the next normalisation.
        """
        error = np.clip(error, eps, 1 - eps)
        return 0.5 * np.log((1 - error) / error)

    def train(self):
        """Run T boosting rounds, appending one stump and one alpha per round.

        Sample weights carry over between calls, so calling train() again adds
        T further rounds on top of the stumps already fitted.
        """
        y = np.asarray(self.y_train)
        for _ in range(self.T):
            dtc = DecisionTreeClassifier(criterion="gini", splitter="best", max_depth=1)
            stump = dtc.fit(self.x_train, y, sample_weight=self.weights)
            pred_train = stump.predict(self.x_train)

            # Weighted error of this stump and the resulting vote weight (alpha)
            incorrect = np.where(pred_train != y, 1, 0)
            error = np.dot(self.weights, incorrect) / self.weights.sum(dtype='float')
            alpha_t = self._stump_weight(error)

            # +1 exponent for misclassified samples (weight grows),
            # -1 for correctly classified ones (weight shrinks)
            weight_coeffs = np.where(incorrect == 1, 1, -1)
            weights = self.weights * np.exp(alpha_t * weight_coeffs)

            # Update weights, stumps, and alphas for the current round
            self.weights = weights / weights.sum(dtype='float')
            self.stumps.append(stump)
            self.alphas.append(alpha_t)

    def computePredictions(self, X):
        """Predict -1 or 1 for every record in X using all fitted stumps.

        Parameters:
            X: records to classify; must support len() and be accepted by the
                stumps' predict().

        Returns:
            A list of ints, one per record: 1 when the alpha-weighted sum of
            stump predictions is positive, -1 otherwise (a zero sum, including
            the case where no stump has been fitted, maps to -1).
        """
        scores = np.zeros(len(X))

        # Iterate over the stumps that were actually fitted rather than over
        # self.T, so an untrained or further-trained model is handled correctly
        for alpha_t, stump in zip(self.alphas, self.stumps):
            scores += alpha_t * np.asarray(stump.predict(X))

        return np.where(scores <= 0, -1, 1).tolist()

In [16]:
# Numbers of boosting rounds (T) to evaluate
T_values = [1, 50, 100, 150]

# Empty accumulators for the training and testing metrics
train_scores, test_scores = [], []

# Function to compute all scores iteratively across the various T values
def computeScores(X_train, y_train, X_test, y_test, T_values):
    """Train one AdaBoostCLF per entry of T_values and record its metrics.

    For every T, five entries are appended to each of the module-level lists
    ``train_scores`` and ``test_scores``: an empty-string placeholder followed by
    accuracy, error (1 - accuracy), precision and recall, with 1 as the positive
    label. Nothing is returned; results are read from those two lists.
    """
    def summarize(y_true, y_pred):
        # One block of rows: placeholder, accuracy, error, precision, recall
        accuracy = accuracy_score(y_true, y_pred)
        return ("",
                accuracy,
                1 - accuracy,
                precision_score(y_true, y_pred, pos_label=1),
                recall_score(y_true, y_pred, pos_label=1))

    for num_rounds in T_values:
        booster = AdaBoostCLF(X_train, y_train, num_rounds)
        booster.train()

        fitted_preds = booster.computePredictions(X_train)
        heldout_preds = booster.computePredictions(X_test)

        train_block = summarize(y_train, fitted_preds)
        test_block = summarize(y_test, heldout_preds)

        train_scores.extend(train_block)
        test_scores.extend(test_block)

# Call the function to compute scores and populate lists
computeScores(X_train_scaled, y_train_new, X_test_scaled, y_test_new, T_values)

# Create dictionary of test/train metrics
scores_dict = {'training': train_scores, 'testing': test_scores}

# Convert all scores to dataframe
scoresDF = pd.DataFrame.from_dict(scores_dict)

# Build the row labels from T_values instead of hard-coding them, so the index
# always lines up with the rows produced for each T: one header row
# ("<T> decision stump(s)") followed by the four metric rows
metric_labels = ['accuracy:', 'error:', 'precision:', 'recall:']
row_names = []
for n_stumps in T_values:
    row_names.append('{} decision stump{}'.format(n_stumps, '' if n_stumps == 1 else 's'))
    row_names.extend(metric_labels)

scoresDF.index = row_names
scoresDF

Unnamed: 0,training,testing
1 decision stump,,
accuracy:,0.791884,0.789748
error:,0.208116,0.210252
precision:,0.717914,0.726141
recall:,0.773097,0.76087
50 decision stumps,,
accuracy:,0.935072,0.922676
error:,0.0649275,0.0773241
precision:,0.935235,0.955774
recall:,0.896526,0.845652
