In [1]:
from sklearn import tree
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeRegressor
from cross_validate import CrossValidation
import matplotlib.pyplot as plt
import numpy as np
import pickle
%matplotlib inline

# Seed the random number generator:
np.random.seed(1)

In [2]:
# Load data
x_train = pickle.load(open("x_train.p", "rb"))
y_train = pickle.load(open("y_train.p", "rb"))
x_test = pickle.load(open("x_test.p", "rb"))
x_train_1 = np.load(open("lasso_data/x_train_lasso_1.p", "rb"))
x_train_2 = np.load(open("lasso_data/x_train_lasso_2.p", "rb"))
x_train_3 = np.load(open("lasso_data/x_train_lasso_3.p", "rb"))
x_train_4 = np.load(open("lasso_data/x_train_lasso_4.p", "rb"))
x_train_5 = np.load(open("lasso_data/x_train_lasso_5.p", "rb"))
labels = pickle.load(open("word_labels.p", "rb"))

In [3]:
def ada_suite(n_clfs, k_fold_cross_validation, x_data, y_data):
    accuracies = []
    print("Running gradient boosted decision tree with " 
              + str(n_clfs) + " weak regressors")
    print("and fold cross validation of: " + str(k_fold_cross_validation))
    cross_validated_data = CrossValidation(x_data, y_data, k_fold_cross_validation)
    x_train, y_train = cross_validated_data.get_other_partitions(0)
    x_test, y_test = cross_validated_data.get_partition(0)
    clf = AdaBoostClassifier(n_estimators=n_clfs)
    clf.fit(x_train, y_train)
    accuracies.append(clf.score(x_test, y_test))
    average_accuracy = np.mean(accuracies)
    print("Average accuracy was " + str(average_accuracy))
    
    return average_accuracy

def gb_suite(n_clfs, k_fold_cross_validation, x_data, y_data):
    accuracies = []
    print("Running gradient boosted decision tree with " 
              + str(n_clfs) + " weak regressors")
    print("and fold cross validation of: " + str(k_fold_cross_validation))
    cross_validated_data = CrossValidation(x_data, y_data, k_fold_cross_validation)
    x_train, y_train = cross_validated_data.get_other_partitions(0)
    x_test, y_test = cross_validated_data.get_partition(0)
    clf = GradientBoostingClassifier(n_estimators=n_clfs)
    clf.fit(x_train, y_train)
    accuracies.append(clf.score(x_test, y_test))
    average_accuracy = np.mean(accuracies)
    print("Average accuracy was " + str(average_accuracy))
    
    return average_accuracy

In [4]:
acc = ada_suite(1000, 10, x_train_2, y_train)

Running gradient boosted decision tree with 1000 weak regressors
and fold cross validation of: 10
Average accuracy was 0.8335

Average score: 0.8335


In [5]:
acc = ada_suite(2000, 10, x_train_3, y_train)

Running gradient boosted decision tree with 2000 weak regressors
and fold cross validation of: 10
Average accuracy was 0.817


In [6]:
acc = gb_suite(1000, 10, x_train_2, y_train)

Running gradient boosted decision tree with 1000 weak regressors
and fold cross validation of: 10
Average accuracy was 0.8455


In [None]:
acc = gb_suite(2000, 10, x_train_2, y_train)

Running gradient boosted decision tree with 2000 weak regressors
and fold cross validation of: 10
