**CS 4641 Project 1: Anish Moorthy**

In [None]:
# INSTALLING REQUIRED PACKAGES
!pip install -r requirements.txt

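**Initial Setup**: Here I simply import modules and define the global flags (dataset, iteration mode, learner source) that the later cells set and read.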

In [None]:
import keras
from sklearn import metrics
from sklearn.model_selection import train_test_split
import numpy as np
import time
import sklearn
import matplotlib.pyplot as plt
import scipy
import itertools
try:
    from google.colab import files
except ImportError:
  print("Not running on colab")

# Hopefully memory is less of an issue with colab, but I'll keep this in case
# (dtype the CIFAR cell casts the grayscale image data to)
NP_DATA_TYPE = np.float64

# Which dataset is loaded: set to CIFAR or IMDB by the dataset cells and
# compared against those two flags by every learner-configuration cell.
dataset = None
CIFAR = True
IMDB = False

# How the "performance over iterations" cell evaluates the chosen learner:
#   ITER_RETRAIN - retrain from scratch for each iteration budget
#   ITER_BOOST   - read per-stage predictions from a fitted boosted ensemble
#   ITER_NONE    - the learner has no notion of iterations
itermode = None
ITER_RETRAIN = 1
ITER_BOOST = 2
ITER_NONE = 3

# Library the selected learner comes from; the evaluation cells only run
# when this equals SKLEARN.
lsource = None
SKLEARN = 1
KERAS = 2

Here I define some modular functions which implement the learning processes. Note that these functions allow incoming data to be transformed in various ways (such as normalization), and that the transformer is fitted ONLY on the part of the data used for training: thus information from the validation/test sets does not leak into the training process, and those sets are transformed afterwards using the parameters the scaler learned from the training data.

In [None]:
def train_with_crossval(LearnerType, LearnerParams,
                        ProcessorType, ProcessorArgs,
                        training_data, training_labels, 
                        val_fraction, num_evals,
                        verbose=True):
    """Train a learner on several random train/validation splits.

    This is not traditional k-fold cross-validation: instead of partitioning
    the data into folds, each evaluation draws a fresh random split with
    ``train_test_split``.

    For every split a new preprocessor is built from ``ProcessorType`` and
    fitted on the training part only, then applied to both parts, so nothing
    from the validation part influences the fitted transformation.

    Args:
        LearnerType: estimator class; instantiated as
            ``LearnerType(**LearnerParams)`` and used through ``fit``/``predict``.
        LearnerParams: dict of keyword arguments for ``LearnerType``.
        ProcessorType: preprocessor class; instantiated as
            ``ProcessorType(**ProcessorArgs)`` and used through
            ``fit``/``transform``.
        ProcessorArgs: dict of keyword arguments for ``ProcessorType``.
        training_data: feature matrix to split.
        training_labels: labels aligned with ``training_data``.
        val_fraction: passed to ``train_test_split`` as ``test_size``.
        num_evals: number of random splits to train on.
        verbose: when true, print timing and scores for every split.

    Returns:
        A list with ``num_evals`` tuples
        ``(learner, preprocessor, train_accuracy, val_accuracy)``.
    """
    score_function = metrics.accuracy_score
    results = []

    for i in range(num_evals):
        (tmp_train_data, tmp_val_data,
         tmp_train_labels, tmp_val_labels) = train_test_split(
            training_data, training_labels,
            test_size=val_fraction, shuffle=True)

        # Fit the preprocessor on the training part only, then reuse it as-is
        # for the validation part.
        scaler = ProcessorType(**ProcessorArgs)
        scaler.fit(tmp_train_data)
        tmp_train_data = scaler.transform(tmp_train_data)
        tmp_val_data = scaler.transform(tmp_val_data)

        # Only the learner's fit is timed (not preprocessing or scoring).
        start_time = time.time()
        learner = LearnerType(**LearnerParams)
        learner.fit(tmp_train_data, tmp_train_labels)
        train_time = time.time() - start_time

        # Scorers take (y_true, y_pred) in that order.
        train_score = score_function(tmp_train_labels,
                                     learner.predict(tmp_train_data))
        val_score = score_function(tmp_val_labels,
                                   learner.predict(tmp_val_data))

        if verbose:
            print("Validator ", i, " finished in ", train_time,
                  "s with (train, val) scores of ", (train_score, val_score))

        results.append((learner, scaler, train_score, val_score))

    if verbose:
        print("All validations complete.")

    return results
   
def perf_over_iterations_nonincremental(LearnerType, hyperparams, 
                                        ProcessorType, ProcessorArgs,
                                        train_examples, test_examples,
                                        iterations_array):
    """Measure accuracy as a function of ``max_iter`` by retraining each time.

    For every entry of ``iterations_array`` a fresh learner is trained through
    ``train_with_crossval`` with ``max_iter`` overridden to that entry, then
    scored on the test examples.

    Args:
        LearnerType: estimator class accepting a ``max_iter`` keyword.
        hyperparams: dict of learner keyword arguments; it is copied, not
            mutated, before ``max_iter`` is overridden.
        ProcessorType: preprocessor class handed to ``train_with_crossval``.
        ProcessorArgs: dict of keyword arguments for ``ProcessorType``.
        train_examples: ``(features, labels)`` pair used for training.
        test_examples: ``(features, labels)`` pair used for test accuracy.
        iterations_array: sequence of iteration budgets to evaluate.

    Returns:
        ``(iterations, train_accuracies, test_accuracies)`` as numpy arrays of
        equal length.
    """
    score = metrics.accuracy_score

    print("WARNING... You are running the non-incremental evaluator, " \
         + "which will probably be very slow. You know what you're getting into, right?")

    train_x, train_y = train_examples
    test_x, test_y = test_examples

    iterations_nparray = np.array(iterations_array)
    train_accuracy_array = np.zeros(len(iterations_array))
    test_accuracy_array = np.zeros(len(iterations_array))

    # train_with_crossval always holds out a validation part, so give it the
    # smallest useful one (1%) and run a single split.
    holdout_fraction = .01
    num_splits = 1

    for index, num_iters in enumerate(iterations_array):

        hyperparams_with_iter = hyperparams.copy()
        hyperparams_with_iter["max_iter"] = num_iters
        learner, scaler, train_acc, _ = train_with_crossval(LearnerType,
                                                            hyperparams_with_iter,
                                                            ProcessorType, ProcessorArgs,
                                                            train_x,
                                                            train_y,
                                                            holdout_fraction,
                                                            num_splits)[0]

        train_accuracy_array[index] = train_acc
        # Scorers take (y_true, y_pred) in that order.
        test_accuracy_array[index] = score(test_y,
                                           learner.predict(scaler.transform(test_x)))

    return (iterations_nparray, train_accuracy_array, test_accuracy_array)
  
def perf_over_iterations_boost(learner, scaler,
                               train_examples, test_examples,
                              verbose=True):
    """Score an already-fitted boosted learner after each boosting stage.

    Uses ``learner.staged_predict`` on the scaled train and test features and
    records, per stage, the overall accuracy and the per-class accuracies.

    Returns ``(stage_indices, train_accuracies, test_accuracies)`` as numpy
    arrays; the per-class accuracies are only printed (when ``verbose``).
    """
    scaled_train_x = scaler.transform(train_examples[0])
    scaled_test_x = scaler.transform(test_examples[0])

    def score_every_stage(scaled_features, truth):
        # One pass over the staged predictions, collecting both metrics.
        overall, per_class = [], []
        for stage_pred in learner.staged_predict(scaled_features):
            overall.append(sklearn.metrics.accuracy_score(stage_pred, truth))
            per_class.append(get_confusion_and_class_accs(truth, stage_pred)[1])
        return overall, per_class

    train_accs, train_claccs = score_every_stage(scaled_train_x, train_examples[1])
    test_accs, test_claccs = score_every_stage(scaled_test_x, test_examples[1])

    if verbose:
        for heading, class_accs in (("Train class accuracies over iterations", train_claccs),
                                    ("Test class accuracies over iterations", test_claccs)):
            print(heading)
            print(np.array(class_accs))

    stage_indices = np.arange(len(train_accs))
    return stage_indices, np.array(train_accs), np.array(test_accs)
  
def flatten_data_array(X):
    """Flatten every datum of ``X`` into a single row.

    Given an array ``X`` where ``X[i]`` is the i-th data tensor, return a 2-D
    array whose i-th row holds the flattened contents of ``X[i]``. A 1-D input
    (one scalar per datum) becomes a single-column 2-D array.
    """
    num_datapoints = X.shape[0]
    # np.prod of an empty shape tuple is the *float* 1.0, which reshape
    # rejects, so force a plain int for the 1-D case.
    datum_size = int(np.prod(X.shape[1:]))
    return X.reshape([num_datapoints, datum_size])
  
def augmented_split(*args, test_size=None, shuffle=True, random_state=None):
    """``train_test_split`` that also accepts a test fraction of exactly 0.

    Args:
        *args: arrays to split, as for ``train_test_split``.
        test_size: fraction in ``[0, 1)`` to put in the second half of each
            pair. ``None`` defers to ``train_test_split``'s own default.
        shuffle: forwarded to ``train_test_split`` (ignored when
            ``test_size == 0``, where the inputs are returned untouched).
        random_state: forwarded to ``train_test_split``.

    Returns:
        A list ``[a_train, a_test, b_train, b_test, ...]``. When
        ``test_size == 0`` every ``*_train`` entry is the original argument
        and every ``*_test`` entry is ``None``.

    Raises:
        RuntimeError: if ``test_size`` is negative or >= 1.
    """
    if test_size is None:
        # Comparing None with a number raises TypeError; let the underlying
        # splitter apply its documented default instead.
        return train_test_split(*args,
                                test_size=None,
                                shuffle=shuffle,
                                random_state=random_state)

    if test_size < 0 or test_size >= 1:
        raise RuntimeError("Cant handle test fraction of " + str(test_size))

    if test_size == 0:
        # Nothing to hold out: pair each input with None, returned as a list
        # just like train_test_split's result.
        out = []
        for a in args:
            out.extend((a, None))
        return out

    return train_test_split(*args,
                            test_size=test_size,
                            shuffle=shuffle,
                            random_state=random_state)

def get_confusion_and_class_accs(truth_labels, predictions):
    """Return the row-normalised confusion matrix and per-class accuracies.

    Args:
        truth_labels: true class labels.
        predictions: predicted class labels, aligned with ``truth_labels``.

    Returns:
        ``(percent_confusion, class_percents)``: the confusion matrix with
        each row divided by the number of true examples of that class (as
        ``float16``), and its diagonal, i.e. the fraction of each class that
        was predicted correctly.
    """
    confusion_mat = sklearn.metrics.confusion_matrix(truth_labels, predictions)

    # Each row sums to the number of true examples of that class. Using the
    # row sums (rather than counting unique truth labels separately) keeps the
    # divisor aligned with the matrix even when the predictions contain a
    # class that never occurs in truth_labels.
    row_totals = confusion_mat.sum(axis=1, keepdims=True)
    # Such a class has an all-zero row; divide by 1 so it stays 0 instead of
    # becoming NaN.
    safe_totals = np.where(row_totals == 0, 1, row_totals)

    percent_confusion = (confusion_mat / safe_totals).astype(np.float16)
    class_percents = np.diagonal(percent_confusion)

    return percent_confusion, class_percents

## **DATASET SELECTION**: 
Only run one of the cells below. The first will load the CIFAR-10 dataset. The second will load the IMDB sentiment analysis dataset. If both are run, only the most recently-run dataset will be used.


In [None]:
# CIFAR DATASET
from skimage.color import rgb2gray

dataset = CIFAR

# Configurable flags
CIFAR_GRAYSCALE = True

if not CIFAR_GRAYSCALE:
    print("NOT GRAYSCALING PICTURES!")

# The official train/test partitions are pooled here on purpose: the combined
# data is re-split into train and test sets in a later cell, before training.
cifar_train, cifar_test = keras.datasets.cifar10.load_data()
x_train, y_train = cifar_train
x_test, y_test = cifar_test

x_all = np.concatenate([x_train, x_test])
labels = np.concatenate([y_train, y_test]).ravel()

# Preprocessing: when the flag is set, convert the pictures to grayscale
# (greatly reducing dimensionality/complexity) and cast them to NP_DATA_TYPE.
if CIFAR_GRAYSCALE:
    gray_images = rgb2gray(x_all)
    x_all = gray_images.astype(NP_DATA_TYPE, copy=False)

# One flat feature vector per picture.
x_all = flatten_data_array(x_all)

print("Full Data, Label shapes = ", x_all.shape, ", ", labels.shape)

In [None]:
# IMDB DATASET
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import nltk
import os.path

# Mark IMDB as the active dataset for the learner-configuration cells.
dataset = IMDB

# Changeable parameter
# Passed to CountVectorizer as max_features.
MAX_WORD_FEATURES = 10000
# Cache files for the vectorised reviews and their labels. The one-hot matrix
# is looked up and loaded as ONEHOTS_FILENAME + ".npz".
ONEHOTS_FILENAME = "imdb-onehots.gz"
LABELS_FILENAME = "imdb-labels.gz"

# Resources shared by every clean_text call. They are built lazily on the
# first call so the stop-word list is read and the stemmer constructed only
# once, rather than once per review.
_CLEAN_TEXT_RESOURCES = {}

def clean_text(raw_review):
    """Convert a raw review into a cleaned string of stemmed words.

    Steps: strip HTML, replace every non-letter with a space, lower-case and
    split into words, drop English stop words, Porter-stem what remains, and
    join the stems back into one space-separated string.
    """
    # Import modules
    from bs4 import BeautifulSoup
    import re

    if not _CLEAN_TEXT_RESOURCES:
        from nltk.corpus import stopwords
        from nltk.stem.porter import PorterStemmer
        # Build both before storing either, so a failure (e.g. stop words not
        # downloaded yet) leaves the cache empty and the next call retries.
        stop_set = set(stopwords.words("english"))  # set => fast membership tests
        stemmer = PorterStemmer()
        _CLEAN_TEXT_RESOURCES["stops"] = stop_set
        _CLEAN_TEXT_RESOURCES["porter"] = stemmer

    stops = _CLEAN_TEXT_RESOURCES["stops"]
    porter = _CLEAN_TEXT_RESOURCES["porter"]

    review_text = BeautifulSoup(raw_review, 'html.parser').get_text() # Remove HTML
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) # Remove non-letters
    words = letters_only.lower().split() # Convert to lower case, split into individual words
    meaningful_words = [w for w in words if w not in stops] # Remove stop words
    stemmed_words = [porter.stem(w) for w in meaningful_words] # Reduce word to stem of word
    return " ".join(stemmed_words) # Join the words back into one string separated by space

def apply_cleaning_function_to_series(X):
    """Run ``clean_text`` over every element of ``X`` and return the results as a list.

    Prints a start message and the elapsed time in minutes.
    """
    print('Cleaning data')
    began_at = time.time()
    cleaned_X = [clean_text(raw_element) for raw_element in X]
    elapsed_minutes = (time.time() - began_at)/60
    print ('Finished in ', str(elapsed_minutes), " minutes")
    return cleaned_X

# The cache consists of TWO files (one-hot matrix + labels); rebuild unless
# both are present, otherwise the load branch would fail on the missing one.
onehots_cache_path = ONEHOTS_FILENAME + ".npz"
cache_is_complete = (os.path.isfile(onehots_cache_path)
                     and os.path.isfile(LABELS_FILENAME))

if not cache_is_complete:
    nltk.download('stopwords')
    print("one-hots not created yet: cleaning and saving to file")
    print("Expect this to take about 10-15 minutes")
    data = pd.read_csv('https://gitlab.com/michaelallen1966/00_python_snippets_and_recipes/raw/master/machine_learning/data/IMDb.csv')
    
    x_cleaned = apply_cleaning_function_to_series(data["review"])
    labels = np.array(data["sentiment"]).ravel()
    
    # Free up memory!
    data = None
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 ngram_range=(1,1),
                                 max_features=MAX_WORD_FEATURES)
    vectorizer.fit(x_cleaned)
    x_all = vectorizer.transform(x_cleaned)
    x_cleaned = None
    # The matrix ends up in ONEHOTS_FILENAME + ".npz", which is the path the
    # cache check and the load branch use.
    scipy.sparse.save_npz(ONEHOTS_FILENAME, x_all)
    np.savetxt(LABELS_FILENAME, labels)
else:
    print("loading one-hots from file")
    start_time = time.time()
    x_all = scipy.sparse.load_npz(onehots_cache_path)
    # NOTE(review): np.loadtxt returns floats by default, so labels loaded from
    # the cache may have a different dtype than freshly built ones - confirm
    # this does not matter downstream.
    labels = np.loadtxt(LABELS_FILENAME)
    end_time = time.time()
    print("Finished loading one-hots in ", (end_time - start_time)/60, " minutes")

print("Full Data, Label shapes = ", x_all.shape, ", ", labels.shape)

# Colab helpers for moving the cache files between sessions:
# files.download(ONEHOTS_FILENAME + ".npz")
# files.download(LABELS_FILENAME)
# files.upload()

## **Choosing Learner**
Here I define various types of learners and sets of hyperparameters which I would like to test them with. Run whichever setup corresponds to the learner you would like to test. Each also defines parameters such as train/val/test fractions and how the data should be normalized

Each cell defines a variable named hyperparam_configs which is a list of dictionaries containing hyperparameter names and corresponding values (the creation location is based on which dataset is selected, so make sure you're looking at the right spot!) If you would like to test a single set of hyperparameters, just comment out/delete all but one element of the list

In [None]:
# SCIKIT DECISION TREE

from sklearn.tree import DecisionTreeClassifier as NoPruningDecisionTree

# Learner selection flags read by the evaluation cells further down.
lsource = SKLEARN
itermode = ITER_NONE
LearnerType = NoPruningDecisionTree

if dataset == CIFAR:

    # Dataset fractions and number of random train/val splits per config.
    FRACTION_OF_DATASET = 1
    TEST_OVER_TOTAL = 0.2
    VAL_OVER_TRAIN = 0.2
    NUM_VALIDATIONS = 5

    # Preprocessor fitted on the training part of every split.
    ProcessorArgs = dict(with_mean=True, with_std=True)
    ProcessorType = sklearn.preprocessing.StandardScaler

    # Training-set fractions for the "performance over dataset size" cell.
    PERF_TRAINSIZE_ARRAY = [.02, .1, .2, .3, .4, .5, .6, .7, .8]

    # Candidate hyperparameter sets; each is passed as **kwargs to LearnerType.
    hyperparam_configs = [
        dict(criterion="entropy", max_depth=5, min_samples_split=4, max_leaf_nodes=None),
        dict(criterion="entropy", max_depth=None, min_samples_split=2, max_leaf_nodes=100),
    ]

elif dataset == IMDB:

    # Dataset fractions and number of random train/val splits per config.
    FRACTION_OF_DATASET = 1
    TEST_OVER_TOTAL = .2
    VAL_OVER_TRAIN = .2
    NUM_VALIDATIONS = 10

    # Preprocessor fitted on the training part of every split.
    ProcessorArgs = dict(use_idf=True, sublinear_tf=True)
    ProcessorType = sklearn.feature_extraction.text.TfidfTransformer

    # Training-set fractions for the "performance over dataset size" cell.
    PERF_TRAINSIZE_ARRAY = [.02, .1, .2, .3, .4, .5, .6, .7, .8]

    # Candidate hyperparameter sets; each is passed as **kwargs to LearnerType.
    hyperparam_configs = [
        dict(criterion="entropy", max_depth=1, min_samples_split=2, max_leaf_nodes=None),
        dict(criterion="entropy", max_depth=5, min_samples_split=4, max_leaf_nodes=None),
        dict(criterion="entropy", max_depth=None, min_samples_split=2, max_leaf_nodes=100),
        dict(criterion="entropy", max_depth=None, min_samples_split=2, max_leaf_nodes=None),
        dict(criterion="entropy", max_depth=6, min_samples_split=50, max_leaf_nodes=None),
    ]

In [None]:
# SUPPORT VECTOR MACHINES

from sklearn.svm import SVC as SupportVectorClassifier

# Learner selection flags read by the evaluation cells further down.
# ITER_RETRAIN: the iteration curve is produced by retraining with a
# different max_iter for every entry of PERF_ITERATIONS_ARRAY.
LearnerType = SupportVectorClassifier
itermode = ITER_RETRAIN
lsource = SKLEARN


if dataset == CIFAR:

    # Dataset fractions and number of random train/val splits per config.
    FRACTION_OF_DATASET = 1
    TEST_OVER_TOTAL = 0.5
    VAL_OVER_TRAIN = 0.25
    NUM_VALIDATIONS = 1

    # Preprocessor fitted on the training part of every split.
    ProcessorType = sklearn.preprocessing.StandardScaler
    ProcessorArgs = {"with_mean": True, "with_std": True}
    
    # max_iter values / training-set fractions for the performance cells.
    PERF_ITERATIONS_ARRAY = [0, 500, 1000, 2000, 3000]
    PERF_TRAINSIZE_ARRAY = [.1, .2, .4, .6, .8]
    
    # Candidate hyperparameter sets; each is passed as **kwargs to LearnerType.
    hyperparam_configs = [
        {"C": 1.0, "kernel": "rbf", "degree": 0, "shrinking": True, "max_iter": 2000},
    ]
    
elif dataset == IMDB:

  # TODO Run this and see whether same kind of results hold
  # TODO used to be .8
    FRACTION_OF_DATASET = 1
    TEST_OVER_TOTAL = .2
    VAL_OVER_TRAIN = .2
    NUM_VALIDATIONS = 5
    
    # Candidate hyperparameter sets; each is passed as **kwargs to LearnerType.
    hyperparam_configs = [
        {"C": 1.0, "kernel": "linear", "degree": 0, "shrinking": True, "max_iter": 2000},
    ]
  
    # Preprocessor fitted on the training part of every split.
    ProcessorType = sklearn.feature_extraction.text.TfidfTransformer
    ProcessorArgs = {"use_idf": True, "sublinear_tf": True}

    # max_iter values / training-set fractions for the performance cells.
    # Every training fraction must lie strictly between 0 and 1, because the
    # dataset-size cell splits with test_size = 1 - fraction.
    PERF_ITERATIONS_ARRAY = [1, 50, 100, 200, 500, 750, 1000, 2000, 3000]
    PERF_TRAINSIZE_ARRAY = [.02, .1, .2, .3, .4, .5, .6, .7, .8]


In [None]:
# K NEAREST NEIGHBOURS

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics.pairwise import cosine_distances as cosdist

# Learner selection flags read by the evaluation cells further down.
lsource = SKLEARN
itermode = ITER_NONE
LearnerType = KNeighborsClassifier

if dataset == CIFAR:

    # Dataset fractions and number of random train/val splits per config.
    FRACTION_OF_DATASET = .2
    TEST_OVER_TOTAL = 0.2
    VAL_OVER_TRAIN = 0.2
    NUM_VALIDATIONS = 5

    # Preprocessor fitted on the training part of every split.
    ProcessorArgs = dict(with_mean=True, with_std=True)
    ProcessorType = sklearn.preprocessing.StandardScaler

    # Training-set fractions for the "performance over dataset size" cell.
    PERF_TRAINSIZE_ARRAY = [.02, .1, .2, .4, .6, .8]

    # Candidate hyperparameter sets; commented-out entries are kept as a
    # record of earlier runs.
    hyperparam_configs = [
        # RUN 0
        # {"n_neighbors": 1, "p": 2, "metric": "minkowski", "weights": "uniform"},
        # {"n_neighbors": 5, "p": 2, "metric": "minkowski", "weights": "uniform"},
        # {"n_neighbors": 3, "p": 1, "metric": "minkowski", "weights": "uniform"},

        # RUN 1
        # {"n_neighbors": 3, "p": 1, "metric": "minkowski", "weights": "distance"},
        # {"n_neighbors": 7, "p": 2, "metric": "minkowski", "weights": "distance"}, # Takes an hour to run for some reason?
        dict(n_neighbors=20, p=1, metric="minkowski", weights="distance"),
        # {"n_neighbors": 8, "p": 5, "metric": "minkowski", "weights": "uniform"}, # ALSO takes an hour!?!
    ]

elif dataset == IMDB:

    # Dataset fractions and number of random train/val splits per config.
    FRACTION_OF_DATASET = .15
    TEST_OVER_TOTAL = .2
    VAL_OVER_TRAIN = .2
    NUM_VALIDATIONS = 5

    # Preprocessor fitted on the training part of every split.
    ProcessorArgs = dict(use_idf=True, sublinear_tf=True)
    ProcessorType = sklearn.feature_extraction.text.TfidfTransformer

    # Training-set fractions for the "performance over dataset size" cell.
    PERF_TRAINSIZE_ARRAY = [.02, .1, .2, .3, .4, .5, .6, .7, .8]

    # Candidate hyperparameter sets; commented-out entries are kept as a
    # record of earlier runs.
    hyperparam_configs = [
        # {"n_neighbors": 3, "p": 1, "metric": "minkowski", "weights": "distance"},
        # {"n_neighbors": 10, "p": 1, "metric": "minkowski", "weights": "distance"},
        # {"n_neighbors": 15, "p": 1, "metric": "minkowski", "weights": "distance"},
        dict(n_neighbors=1, p=2, metric="minkowski", weights="uniform"),
        dict(n_neighbors=5, p=2, metric="minkowski", weights="uniform"),
        # {"n_neighbors": 3, "p": 1, "metric": "minkowski", "weights": "uniform"},
    ]

In [None]:
# BOOSTING

from sklearn.tree import DecisionTreeClassifier as NoPruningDecisionTree
from sklearn.ensemble import AdaBoostClassifier

# Learner selection flags read by the evaluation cells further down.
# ITER_BOOST: the iteration curve is read from the fitted ensemble's staged
# predictions instead of retraining.
lsource = SKLEARN
itermode = ITER_BOOST
LearnerType = AdaBoostClassifier

if dataset == CIFAR:

    # Dataset fractions and number of random train/val splits per config.
    FRACTION_OF_DATASET = 1
    TEST_OVER_TOTAL = 0.2
    VAL_OVER_TRAIN = 0.2
    NUM_VALIDATIONS = 1

    # Preprocessor fitted on the training part of every split.
    ProcessorArgs = dict(with_mean=True, with_std=True)
    ProcessorType = sklearn.preprocessing.StandardScaler

    # Tree that the ensemble boosts.
    weak_learner = NoPruningDecisionTree(
        criterion="entropy",
        max_depth=5,
        min_samples_split=4,
    )

    # Candidate hyperparameter sets; commented-out entries are kept as a
    # record of earlier runs.
    hyperparam_configs = [
        dict(base_estimator=weak_learner, n_estimators=20, learning_rate=1),
        # {"base_estimator": weak_learner, "n_estimators": 5, "learning_rate": 2},
        # {"base_estimator": weak_learner, "n_estimators": 5, "learning_rate": 5},
        # {"base_estimator": None, "n_estimators": 50, "learning_rate": 1},
        # {"base_estimator": None, "n_estimators": 5, "learning_rate": .5},

        # {"base_estimator": weak_learner, "n_estimators": 50, "learning_rate": 1}, # Default params
        # {"base_estimator": weak_learner, "n_estimators": 25, "learning_rate": 2},
        # {"base_estimator": weak_learner, "n_estimators": 5, "learning_rate": 5},
        # {"base_estimator": weak_learner, "n_estimators": 5, "learning_rate": .5},
    ]

    # Training-set fractions for the dataset-size cell; no iteration budgets
    # are needed in ITER_BOOST mode.
    PERF_TRAINSIZE_ARRAY = [.02, .1, .2, .3, .4, .5, .6, .7, .8]
    PERF_ITERATIONS_ARRAY = None

elif dataset == IMDB:

    # Dataset fractions and number of random train/val splits per config.
    FRACTION_OF_DATASET = 1
    TEST_OVER_TOTAL = .2
    VAL_OVER_TRAIN = .2
    NUM_VALIDATIONS = 1

    # Preprocessor fitted on the training part of every split.
    ProcessorArgs = dict(use_idf=True, sublinear_tf=True)
    ProcessorType = sklearn.feature_extraction.text.TfidfTransformer

    # Training-set fractions for the dataset-size cell; no iteration budgets
    # are needed in ITER_BOOST mode.
    PERF_TRAINSIZE_ARRAY = [.02, .1, .2, .3, .4, .5, .6, .7, .8]
    PERF_ITERATIONS_ARRAY = None

    # Tree that the ensemble boosts.
    weak_learner = NoPruningDecisionTree(
        criterion="entropy",
        max_depth=None,
        min_samples_split=2,
        max_leaf_nodes=100,
    )

    # Candidate hyperparameter sets; commented-out entries are kept as a
    # record of earlier runs.
    hyperparam_configs = [
        dict(base_estimator=weak_learner, n_estimators=30, learning_rate=1),
        # {"base_estimator": weak_learner, "n_estimators": 10, "learning_rate": 2},
        # {"base_estimator": None, "n_estimators": 5, "learning_rate": 1},
        # {"base_estimator": None, "n_estimators": 5, "learning_rate": .5},
        # {"base_estimator": None, "n_estimators": 5, "learning_rate": 2},
    ]

Now to split our data into testing and training sets so that we can do some learning!

In [None]:
# Split data into train/test sets.
# First keep only FRACTION_OF_DATASET of the full data (the remainder is
# discarded), then carve the test set out of what is left.
reduced_parts = augmented_split(x_all, labels, test_size=1-FRACTION_OF_DATASET)
x_reduced, _, labels_reduced, _ = reduced_parts
print("Data and label shapes are: ", x_reduced.shape, labels_reduced.shape)

(train_data, test_data,
 train_labels, test_labels) = train_test_split(x_reduced, labels_reduced,
                                               test_size=TEST_OVER_TOTAL)

And finally I attempt to actually learn something :|

In [None]:
# Run summary. Each label names the ratio that is actually printed
# (VAL_OVER_TRAIN is the validation fraction taken out of the training set).
print(LearnerType.__name__ + "\n-----------------------------------")
print("Fraction of Dataset: ", FRACTION_OF_DATASET)
print("Test over total", TEST_OVER_TOTAL)
print("Val over train", VAL_OVER_TRAIN)
print("Preprocessor type, args", ProcessorType.__name__," ", ProcessorArgs)
print("Num validations: ", NUM_VALIDATIONS)

# One entry per hyperparameter configuration:
#   learners[i]   - list of (learner, preprocessor) pairs, one per validation run
#   accuracies[i] - (mean train accuracy, mean validation accuracy)
learners = [None] * len(hyperparam_configs)
accuracies = [None] * len(hyperparam_configs)

for index, hyperparams in enumerate(hyperparam_configs):
  
    print("\n========================================================")
    print(LearnerType.__name__ + " " + str(index))
    print("Hyperparams ", hyperparams)

    start_time = time.time()
    results = train_with_crossval(LearnerType=LearnerType, 
                                  LearnerParams=hyperparams, 
                                  ProcessorType=ProcessorType,
                                  ProcessorArgs=ProcessorArgs,
                                  training_data=train_data, 
                                  training_labels=train_labels,
                                  val_fraction=VAL_OVER_TRAIN, 
                                  num_evals=NUM_VALIDATIONS,
                                  verbose=True)
    end_time = time.time()
    print("--------------------------------------------------------")
    print("Total time spent training is " + str(end_time - start_time) + " seconds")
    
    # Each result is (learner, preprocessor, train_accuracy, val_accuracy).
    learners_and_transformers = [r[:2] for r in results]
    train_accuracies = np.array([r[2] for r in results])
    val_accuracies = np.array([r[3] for r in results])
    
    # Mean and spread of the scores across the validation runs.
    mean_train_accuracy, mean_train_stdv = np.mean(train_accuracies), np.std(train_accuracies)
    mean_val_accuracy, mean_val_stdv = np.mean(val_accuracies), np.std(val_accuracies)
    
    learners[index] = learners_and_transformers
    accuracies[index] = (mean_train_accuracy, mean_val_accuracy)
    stdvs = (mean_train_stdv, mean_val_stdv)
    
    print("Standard deviation of (train-acc, val-acc) = ", stdvs)
    print("~~~~~~~> (train-acc, val-acc) = ", accuracies[index])

**Test Set Evaluation:** Here we choose the set of hyperparameters which has the best validation accuracy, and evaluate it on the test set

In [None]:
if lsource == SKLEARN:
    # best_hyperparams = hyperparam_configs[0]

    # Pick the configuration with the highest mean validation accuracy
    # (accuracies holds (mean train, mean validation) pairs).
    val_scores = [mean_val for (_mean_train, mean_val) in accuracies]
    best_learner_index = np.argmax(val_scores)
    best_hyperparams = hyperparam_configs[best_learner_index]

    # Every configuration keeps one (learner, scaler) pair per validation
    # run; the pair from the first run is the one evaluated here.
    best_learner_and_scaler = learners[best_learner_index][0]
    best_learner = best_learner_and_scaler[0]
    best_scaler = best_learner_and_scaler[1]

    winner_name = str(LearnerType.__name__) + str(best_learner_index)
    print("The best-performing series of learners was " + winner_name)
    print("Hyperparameters for this learner were ", best_hyperparams)

    # The test data goes through the scaler fitted during training.
    scaled_test_data = best_scaler.transform(test_data)
    test_predictions = best_learner.predict(scaled_test_data)
    test_accuracy = metrics.accuracy_score(test_predictions, test_labels)
    print("On the test set, accuracy is: ", test_accuracy)

    confusion, class_accs = get_confusion_and_class_accs(test_labels, test_predictions)

    # Affects how numpy arrays are printed from here on.
    np.set_printoptions(precision=3)
    print("Confusion Matrix: ")
    print(confusion)
    print("Class Accuracies: ", class_accs)

**Performance over dataset size** : Taking several *fractions of the static training set* as training sets, train the learner (using the best hyperparams) on each set and report performance on the training/test sets

In [None]:
if lsource == SKLEARN:
    num_fractions = len(PERF_TRAINSIZE_ARRAY)
    perf_fracs = np.array(PERF_TRAINSIZE_ARRAY)
    perf_trainfracs_accs = np.zeros(num_fractions)
    perf_testfracs_accs = np.zeros(num_fractions)

    for index, train_fraction in enumerate(PERF_TRAINSIZE_ARRAY):

        start_time = time.time()

        # Keep a random `train_fraction` share of the training set; the rest
        # of the split is thrown away.
        subset_parts = train_test_split(train_data, train_labels,
                                        test_size=1-train_fraction)
        perf_trainfracs_data, _, perf_trainfracs_labels, __ = subset_parts

        # A single run with a 1% validation hold-out, using the best
        # hyperparameters found earlier.
        single_run = train_with_crossval(LearnerType,
                                         best_hyperparams,
                                         ProcessorType,
                                         ProcessorArgs,
                                         perf_trainfracs_data,
                                         perf_trainfracs_labels,
                                         .01, 1, verbose=False)
        learner, scaler, train_acc, _ = single_run[0]

        perf_trainfracs_accs[index] = train_acc
        test_predictions = learner.predict(scaler.transform(test_data))
        test_acc = metrics.accuracy_score(test_predictions, test_labels)
        perf_testfracs_accs[index] = test_acc

        end_time = time.time()
        elapsed_mins = (end_time - start_time)/60
        print("\n===============================================================")
        print("Fraction " + str(train_fraction) +": " + str(index) + " finished in " + str(elapsed_mins) + " mins")
        print("Train/Test Accuracies: ", (train_acc, perf_testfracs_accs[index]))
        confusion, class_accs = get_confusion_and_class_accs(test_labels, test_predictions)
        print("Class accuracies: ", class_accs)

    for accuracy_curve in (perf_trainfracs_accs, perf_testfracs_accs):
        print(accuracy_curve)

In [None]:
if lsource == SKLEARN:
    # Graphing performance over training-set size...
    print(perf_fracs)
    print(perf_trainfracs_accs)
    print(perf_testfracs_accs)

    plt.title("Accuracy vs Training Size")
    # Both axes hold fractions in [0, 1], not percentages.
    plt.xlabel('Fraction of training data used')
    plt.ylabel('Accuracy (fraction correct)')
    plt.plot(perf_fracs, perf_trainfracs_accs, label="Train data") 
    plt.plot(perf_fracs, perf_testfracs_accs, label="Test data")
    # Anchor the y-axis at 0 (set after plotting so the autoscaled top is kept).
    plt.ylim(ymin=0)
    # Without this call the line labels above are never shown.
    plt.legend()

    plt.show()

**Performance over time (iterations)**: Using *the same training/test sets every time*, calculate the performance on the train/test data as a function of the number of iterations.

In [None]:
if lsource == SKLEARN:
    # best_hyperparams = hyperparam_configs[0]

    # Reset the results first so that a learner without iterations neither
    # hits a NameError below nor silently reuses curves left over from an
    # earlier run with a different learner.
    perf_iters = perf_trainits_accs = perf_testits_accs = None

    start_time = time.time()
    if itermode == ITER_RETRAIN:
        # Training function (might take a lot of time): retrains the learner
        # once per entry of PERF_ITERATIONS_ARRAY.
        perf_iters, perf_trainits_accs, \
                    perf_testits_accs = perf_over_iterations_nonincremental(LearnerType=LearnerType,
                                                                            hyperparams=best_hyperparams,
                                                                            ProcessorType=ProcessorType,
                                                                            ProcessorArgs=ProcessorArgs,
                                                                            train_examples=(train_data, train_labels),
                                                                            test_examples=(test_data, test_labels),
                                                                            iterations_array=PERF_ITERATIONS_ARRAY,)
    elif itermode == ITER_NONE:
        print("Whaddaya doing!? Can't iterate over this learner ya goof")

    elif itermode == ITER_BOOST:
        # Reuses the already-fitted best learner and scores each boosting stage.
        perf_iters, perf_trainits_accs, \
                    perf_testits_accs = perf_over_iterations_boost(learner=best_learner,
                                                                  scaler=best_scaler,
                                                                  train_examples=(train_data, train_labels),
                                                                  test_examples=(test_data, test_labels))

    # Only report when an iteration curve was actually computed.
    if perf_iters is not None:
        end_time = time.time()
        print("Iteration eval finished in ", str((end_time - start_time)/60), " mins")
        print("Train accs", perf_trainits_accs)
        print("Test accs", perf_testits_accs)

In [None]:
if lsource == SKLEARN:
    if itermode in (ITER_RETRAIN, ITER_BOOST):
        # Graphing performance over iterations...
        plt.title("Accuracy vs Number Iterations")
        plt.xlabel('Number of iterations')
        # Accuracies are fractions in [0, 1], not percentages.
        plt.ylabel('Accuracy (fraction correct)')
        # plt.ylim(ymin=0)
        plt.plot(perf_iters, perf_trainits_accs, label="Train data")
        plt.plot(perf_iters, perf_testits_accs, label="Test data")
        # Without this call the line labels above are never shown.
        plt.legend()
        plt.show()
    else:
        # No iteration curve exists for this learner, so there is nothing
        # valid to plot (any leftover arrays would belong to another learner).
        print("Skipping iteration plot: the selected learner has no iteration curve")