In [7]:
import numpy as np
from sklearn import datasets


In [85]:
iris = datasets.load_iris()
X = iris.data[:, :]  # we only take the first two features.
Y = iris.target
# print(X.T.shape)
# print(Y.T.shape)
dataset = np.concatenate([X.T, [Y.T]]).T
# print(dataset.shape)

In [117]:
class DecisionTreeClassifierCustom():
    def __init__(self, max_depth=10, min_count=10, is_binary=False):
        self.tree = None
        self.max_depth = max_depth
        self.min_count = min_count
        self.train_x = None
        self.train_y = None
        self.is_binary = is_binary
        
    def train(self, X, y):
        self.train_x = [(x, i) for i, x in enumerate(X)]
        self.train_y = y
        self.n_features = X.shape[1]
        root = self._get_branch(self.train_x, self.train_y)
        self._branch(root, self.train_y, 1)
        self.tree = root
        
    def fit(self, X, y):
        return self.train(X, y)
    
    def score(self, X, Y):
        predictions = self.predict(X)
        correct = 0
        for i in range(len(Y)):
            if Y[i] == predictions[i]:
                correct += 1
        return (correct/float(len(Y)))
    
    def predict(self, X_test):
        pred_y = np.empty(X_test.shape[0])
        for i, obs in enumerate(X_test):
            pred_y[i] = self._predict_helper(self.tree, obs)
        return pred_y
            
    def _predict_helper(self, node, obs):
        if obs[node['index']] <= node['value']:
            if isinstance(node['left'], dict):
                return self._predict_helper(node['left'], obs)
            else:
                return node['left']
        else:
            if isinstance(node['right'], dict):
                return self._predict_helper(node['right'], obs)
            else:
                return node['right']
    
    def _get_branch(self, X, y):
        classes = list(np.unique(y))
#         print('classes', classes)
        b_index, b_value, b_score, b_groups = np.inf, np.inf, np.inf, None
        for index in range(self.n_features):
            for row, idx in X:
                groups = self._test_split(index, 0.5 if self.is_binary else row[index], X)
                gini = self._get_gini(groups, classes, y)
                if gini < b_score:
#                     print(gini)
                    b_index, b_value, b_score, b_groups = index, 0.5 if self.is_binary else row[index], gini, groups
#                     print(b_index, b_value, b_score)
        return {'index':b_index, 'value':b_value, 'groups':b_groups}
    
    def _test_split(self, index, value, X):
        less = []
        more = []
        for row, idx in X:
            if row[index] <= value:
                less.append((row, idx))
            else:
                more.append((row, idx))
        return less, more
    
    def _get_gini(self, groups, classes, y):
        gini = 0.0
        for cls in classes:
            for group in groups:
                size = len(group)
                if size == 0:
#                     gini += 0.025
                    continue
                proportion = [y[x[1]] for x in group].count(cls) / float(size)
                gini += (proportion * (1.0 - proportion))
        return gini
    
    def _branch(self, node, y, depth):
        left, right = node['groups']
        del(node['groups'])
        # check for a no split
        if not left or not right:
            node['left'] = self._to_leaf(left + right, y)
            node['right'] = self._to_leaf(left + right, y)
            
            return
        # check for max depth
        if depth >= self.max_depth:
            node['left'], node['right'] = self._to_leaf(left, y), self._to_leaf(right, y)
            return
        # process left child
        if len(left) <= self.min_count:
            node['left'] = self._to_leaf(left, y)
        else:
            node['left'] = self._get_branch(left, y)
            self._branch(node['left'], y, depth+1)
        # process right child
        if len(right) <= self.min_count:
            node['right'] = self._to_leaf(right, y)
        else:
            node['right'] = self._get_branch(right, y)
            self._branch(node['right'], y, depth+1)
            
    def _to_leaf(self, group, y):
        outcomes = [y[x[1]] for x in group]
        return max(set(outcomes), key=outcomes.count)

In [199]:
import csv
import random
import math

def separate_for_boosting(X, Y, preds):
    correct = np.array([(x,y) for x, y, p in zip(X, Y, preds) if y == p])
    incorrect = np.array([(x,y) for x, y, p in zip(X, Y, preds) if y != p])
    correct, incorrect = correct.T, incorrect.T
    return correct, incorrect
        

def train_test_split(*args, ratio=0.2):
    assert len(args) > 0
    split_index = int(round(len(args[0]) * (1-ratio)))
    train = [arg[:split_index] if arg is not None else None for arg in args]
    test = [arg[split_index:] if arg is not None else None for arg in args]
    return train, test


def load_csv(filename):
    # loads the csv by given filename
    lines = list(csv.reader(open(filename, "r")))
    headers = lines[0]
    index = [line[0] for line in lines[1:]]
    dataset = [line[1:] for line in lines[1:]]
    for i in range(len(dataset)):
        dataset[i] = [float(x) for x in dataset[i]]
    return dataset

def split_dataset(dataset, splitRatio):
    # splits the dataset by the given ratio
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
    return [trainSet, copy]

def separateByClass(dataset):
    # takes the last col as Y and groups the dataframe by that col
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if (vector[-1] not in separated):
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated

def summarize(dataset):
    
    summaries = [(np.mean(attribute), np.std(attribute)) for attribute in zip(*dataset)]
    del summaries[-1]
    return summaries

def summarizeByClass(dataset):
    # separates the dataset into each class and then stores the summary (mean, std) of each variable in the dict for every class in the dataset
    separated = separateByClass(dataset)
    summaries = {}
    for classValue, instances in separated.items():
        summaries[classValue] = summarize(instances)
    return summaries

def calculateProbability(x, mean, stdev):
#     print(stdev)
    if stdev == 0: stdev = 0.0000001
    exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent


class GenericNB():
    def __init__(self):
        self.summaries = dict
        
    def fit(self, X, Y):
#         print(X.shape, Y.shape)
        self.summaries = summarizeByClass(dataset)
    
    def predict(self, X):
        predictions = []
        for i in range(len(X)):
            result = self._predict(X[i])
            predictions.append(result)
        return predictions
    
    def predict_proba(self, X):
        return np.array([list(self.calculateClassProbabilities(x).values()) for x in X])
    
    def score(self, X, Y):
        predictions = self.predict(X)
        correct = 0
        for i in range(len(Y)):
            if Y[i] == predictions[i]:
                correct += 1
        return (correct/float(len(Y)))
    
    def _predict(self, X):
        probabilities = self.calculateClassProbabilities(X)
        bestLabel, bestProb = None, -1
        for classValue, probability in probabilities.items():
            if bestLabel is None or probability > bestProb:
                bestProb = probability
                bestLabel = classValue
        return bestLabel
    
    def calculateClassProbabilities(self, X):
        probabilities = {}
        for classValue, classSummaries in self.summaries.items():
            probabilities[classValue] = 1
            for i in range(len(classSummaries)):
                mean, stdev = classSummaries[i]
                x = X[i]
                probabilities[classValue] *= calculateProbability(x, mean, stdev)
        return probabilities
    
    
    
    
# def computeError(pred, Y, weights):

        
# def boost(examples, weakLearner, rounds):
#    distr = normalize([1.] * len(examples))
#    hypotheses = [None] * rounds
#    alpha = [0] * rounds
 
#    for t in range(rounds):
#       def drawExample():
#          return examples[draw(distr)]
 
#       hypotheses[t] = weakLearner(drawExample)
#       hypothesisResults, error = computeError(hypotheses[t], examples, distr)
 
#       alpha[t] = 0.5 * math.log((1 - error) / (.0001 + error))
#       distr = normalize([d * math.exp(-alpha[t] * h)
#                          for (d,h) in zip(distr, hypothesisResults)])
#       print("Round %d, error %.3f" % (t, error))
 
#    def finalHypothesis(x):
#       return sign(sum(a * h(x) for (a, h) in zip(alpha, hypotheses)))
 
#    return finalHypothesis


import itertools
import sys, os, pickle

def gen_folds(df,n_splits=10):
    step = math.ceil(len(df) / n_splits)
    size = len(df)
    folds = []
    random_index = np.floor(np.random.rand(len(df))*len(df)).astype(int)
    for loc in range(0,size,step):
        test_indexer = range(loc,size) if loc+step > size else range(loc,loc+step)
        train_indexer = list(itertools.chain(range(0,loc),range(loc+step,size)))
        folds.append([random_index[train_indexer],random_index[test_indexer]])
    return folds

def bootstrap_resample(X, n=None, n_datasets=100):
    if n == None:
        n = len(X)
    
    for i in range(n_datasets):
        resample_i = np.floor(np.random.rand(n)*len(X)).astype(int)
        X_resample = np.array(X[resample_i])
        yield X_resample

def get_ensemble_binary_classification(forest, X):
    pred_y = np.zeros(shape=(len(X),))
    for t in forest:
        temp = t.predict(X)
        pred_y += temp
    pred_y = pred_y / len(forest)
    return np.array([1 if x > 0.5 else 0 for x in pred_y ])

def accuracy_score(Y, predictions):
    correct = 0
    for i in range(len(Y)):
        if Y[i] == predictions[i]:
            correct += 1
    return (correct/float(len(Y)))


def map_continuous(f):
    if f < -0.01: return 0
    elif -0.01 < f < 0.01: return 1
    else: return 2
    
def bin_continuous(arr):
    return np.array([map_continuous(a) for a in arr])

def main(dataset_name="iris", model_name='both', verbose=2, folds=10, ensemble_size=9, bootstrap=True, save_name=None, load_name=None):
    probas = []
    iter_weights = []
    global dataset
    splitRatio = 0.67
    dataset, X, Y = None, None, None
    if dataset_name == 'finance':
        filename = 'dataframe.csv'
        dataset = np.array(load_csv(filename))
        dataset = dataset[:200]
        print(dataset.shape)
        X = dataset.T[1:].T
        Y = dataset.T[0].T
        Y = bin_continuous(Y)
    else:
        iris = datasets.load_iris()
        X = iris.data[:, :4]  # we only take the first 4 features.
        Y = iris.target
        dataset = np.concatenate([X.T, [Y.T]]).T
        
    
    ensemble_2 = np.array([])
    train_errs = []
    test_errs = []
    load_fail = True
    
    if load_name != None:
        load_fail = False
        file = os.getcwd()
        file += '/saves/' + load_name + '.pickle'
        try:
            with open(file, 'rb') as load_file:
                ensemble_2 = pickle.load(load_file)
            if verbose > 0:
                print('Ensemble loaded.')
            if verbose > -1:
                for train_idx, test_idx in gen_folds(X, n_splits=1):
                    X_train, X_test = X[train_idx], X[test_idx]
                    y_train, y_test = Y[train_idx], Y[test_idx]
                    test_preds = get_ensemble_binary_classification(ensemble_2, X_test)
                    test_acc = accuracy_score(y_test, test_preds)
                    print("Cross-fold Ensemble Acc: ", test_acc)
        except:
            if verbose > -1:
                print('Failed to load ensemble from %s' % file)
            load_fail = True
            
    if load_fail:
        print('Constructing new ensemble')
        for fold_round, (train_idx, test_idx) in enumerate(gen_folds(X, n_splits=folds)):
            if verbose > 0:
                print('')
                print("Running Fold %s" % (fold_round+1))
            ensemble = []
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = Y[train_idx], Y[test_idx]
            
            # trains model on non bootstrapped data first
            if verbose > 1:
                print('')
            if model_name in ['decision tree', 'both']:
                clf = DecisionTreeClassifierCustom()
                clf.fit(X_train, y_train)
                ensemble.append(clf)
                if verbose > 1:
                    print("Trained Classifer %s with score %1.4f" % ('Decision Tree', clf.score(X_test, y_test)))

            if model_name in ['naive bayes', 'both']:
                nb = GenericNB()
                nb.fit(X_test,y_train)
                ensemble.append(nb)
                if verbose > 1:
                    print("Trained Classifer %s with score %1.4f" % ('Naive Bayes', nb.score(X_test, y_test)))

            if bootstrap == True:
                if verbose > 0:
                    print("Bootstrapping...")
                for boot_round, sample in enumerate(bootstrap_resample(np.asmatrix(X_train), n_datasets=ensemble_size)):
                    if verbose > 1:
                        print('')
                    if model_name in ['decision tree', 'both']:
                        clf = DecisionTreeClassifierCustom()
                        clf.fit(sample, y_train)
                        ensemble.append(clf)
                        if verbose > 1:
                            print("Trained Classifer %s with score %1.4f" % ('Decision Tree', clf.score(X_test, y_test)))

                    if model_name in ['naive bayes', 'both']:
                        nb = GenericNB()
                        nb.fit(sample,y_train)
                        ensemble.append(nb)
                        if verbose > 1:
                            print("Trained Classifer %s with score %1.4f" % ('Naive Bayes', nb.score(X_test, y_test)))
                

            train_preds = get_ensemble_binary_classification(ensemble, X_train)
            test_preds = get_ensemble_binary_classification(ensemble, X_test)
            train_acc = accuracy_score(y_train, train_preds)
            test_acc = accuracy_score(y_test, test_preds)
            if verbose > 0:
                print("Fold Ensemble Scores train: %1.4f - test: %1.4f" % (train_acc, test_acc))
            train_errs.append(train_acc)
            test_errs.append(test_acc)
            ensemble_2 = np.concatenate([ensemble_2, ensemble])

        if save_name != None:
            file = os.getcwd()
            file += '/saves/' + save_name + '.pickle'
            with open(file, 'wb') as save_file:
                save_file.write(pickle.dumps(ensemble_2))
            if verbose > 0:
                print('Ensemble saved.')
        
        if verbose > -1:
            test_preds = get_ensemble_binary_classification(ensemble_2, X_test)
            test_acc = accuracy_score(y_test, test_preds)
            print("Cross-fold Ensemble Acc: ", test_acc)
            print('Avg. Ensemble accuracy for train data:', np.mean(train_errs))
            print('Avg. Ensemble accuracy for test data:', np.mean(test_errs))  
    

        

In [200]:
print(os.getcwd())

/var/www/Repositories/cse415/project


In [201]:
main(dataset_name='finance', model_name='both', verbose=1, bootstrap=True, ensemble_size=9, folds=10, save_name='finacetest')

(200, 37)
Constructing new ensemble

Running Fold 1
Bootstrapping...
Fold Ensemble Scores train: 0.5667 - test: 0.6000

Running Fold 2
Bootstrapping...
Fold Ensemble Scores train: 0.5778 - test: 0.5000

Running Fold 3
Bootstrapping...
Fold Ensemble Scores train: 0.5667 - test: 0.6000

Running Fold 4
Bootstrapping...
Fold Ensemble Scores train: 0.6111 - test: 0.2000

Running Fold 5
Bootstrapping...
Fold Ensemble Scores train: 0.5611 - test: 0.6500

Running Fold 6
Bootstrapping...
Fold Ensemble Scores train: 0.5556 - test: 0.7000

Running Fold 7
Bootstrapping...
Fold Ensemble Scores train: 0.5667 - test: 0.6000

Running Fold 8
Bootstrapping...
Fold Ensemble Scores train: 0.5611 - test: 0.6500

Running Fold 9
Bootstrapping...
Fold Ensemble Scores train: 0.5722 - test: 0.5500

Running Fold 10
Bootstrapping...
Fold Ensemble Scores train: 0.5611 - test: 0.6500
Ensemble saved.
Cross-fold Ensemble Acc:  0.65
Avg. Ensemble accuracy for train data: 0.57
Avg. Ensemble accuracy for test data: 0.57

In [192]:
program_kwargs = dict(
    dataset_name = None,
    model_name = None,
    ensemble_size = None,
    folds = None,
    bootstrap = None,
    save_name = None,
    load_name = None,
    verbose = None
)
valid_datasets = ['iris', 'finance']
while program_kwargs['dataset_name'] not in valid_datasets:
    program_kwargs['dataset_name'] = str(input("Which dataset would you like to use? ('finance' or 'iris'): ")).lower()
    if program_kwargs['dataset_name'] not in valid_datasets:
        print("Sorry I didn't recognize that dataset.")
        
valid_models = ['naive bayes', 'decision tree', 'both']
while program_kwargs['model_name'] not in valid_models:
    program_kwargs['model_name'] = str(input("Which model would you like to use? ('naive bayes', 'decision tree', 'both'): ")).lower()
    if program_kwargs['model_name'] not in valid_models:
        print("Sorry I didn't recognize that model.")
        
       
valid_bag_opts = ['y', 'n']
while program_kwargs['bootstrap'] not in valid_bag_opts:
    program_kwargs['bootstrap'] = str(input("Should we use Bootstrap Aggregating (Bagging)? (y/n): ")).lower()
    if program_kwargs['bootstrap'] not in valid_bag_opts:
        print("Sorry I didn't recognize that answer. (y/n)")
    else:
        program_kwargs['bootstrap'] = True if program_kwargs['bootstrap'] == 'y' else False
        break
        
if program_kwargs['bootstrap']:
    while type(program_kwargs['ensemble_size']) != type(1):
        try:
            program_kwargs['ensemble_size'] = input("What size ensemble would you like to use? (int): ")
            program_kwargs['ensemble_size'] = int(program_kwargs['ensemble_size'])
        except:
            print("Sorry I didn't recognize that number.")
            program_kwargs['ensemble_size'] = None
            
while type(program_kwargs['folds']) != type(1):
        try:
            program_kwargs['folds'] = input("How many folds would you like to use? (int): ")
            program_kwargs['folds'] = int(program_kwargs['folds'])
        except:
            print("Sorry I didn't recognize that number.")
            program_kwargs['folds'] = None
            
save_opt, load_opt = None, None
valid_opts = ['y', 'n']
while save_opt not in valid_opts:
    save_opt = str(input("Should we save the ensemble? (y/n): ")).lower()
    if save_opt not in valid_opts:
        print("Sorry I didn't recognize that answer. (y/n)")
    else:
        save_opt = True if save_opt == 'y' else False
        break

if save_opt:
    program_kwargs['save_name'] = input("Enter filename?: ")
    
while load_opt not in valid_opts:
    load_opt = str(input("Should we attempt to load an ensemble? (y/n): ")).lower()
    if load_opt not in valid_opts:
        print("Sorry I didn't recognize that answer. (y/n)")
    else:
        load_opt = True if load_opt == 'y' else False
        break

if load_opt:
    program_kwargs['load_name'] = input("Enter filename?: ")
    
       
while type(program_kwargs['verbose']) != type(1):
    try:
        program_kwargs['verbose'] = input("How verbose should I be? (int, -1 (silent) - 2 (Everything)): ")
        program_kwargs['verbose'] = int(program_kwargs['verbose'])
    except:
        print("Sorry I didn't recognize that number.")
        program_kwargs['verbose'] = None

print()
# print('Options')
# print(program_kwargs)

main(**program_kwargs)

Which dataset would you like to use? ('finance' or 'iris'): finance
Which model would you like to use? ('naive bayes', 'decision tree', 'both'): both
Should we use Bootstrap Aggregating (Bagging)? (y/n): y
What size ensemble would you like to use? (int): 10
How many folds would you like to use? (int): 10
Should we save the ensemble? (y/n): t
Sorry I didn't recognize that answer. (y/n)
Should we save the ensemble? (y/n): y
Enter filename?: finance1
Should we attempt to load an ensemble? (y/n): n
How verbose should I be? (int, -1 (silent) - 2 (Everything)): 1

Constructing new ensemble

Running Fold 1
Bootstrapping...


KeyboardInterrupt: 