# Active Learning with SUSY-AI Pool
### Include packages

In [None]:
import copy

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
sns.set()

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

### Set Active Learning configuration parameters

In [None]:
# Size of the initial training set (taken from the shuffled pool that remains
# after the test set has been split off)
size_init = 10000
# Step size: number of pool points moved into the training set per step
size_iter = 2500
# Number of active learning iterations, i.e. repetitions of the whole
# experiment; every repetition starts from freshly created data sets
n_iterations = 1
# Test size (passed as test_size to train_test_split)
size_test = 100000
# Stopping threshold: with -1 a run continues while the pool still holds at
# least size_iter points; with any other value it continues while the pool
# holds at least size_max points
size_max = -1

### Set other configuration variables

In [None]:
# Location of the data pool
# File that holds the data pool; it is read with np.load
data_location = "alldata_full.npy"
# Output log files (CSV): one for the active learning runs and one for the
# random sampling runs
output_location_active = "active.csv"
output_location_random = "random.csv"

---
## Load data and logbook

In [None]:
# Load the data pool and cast it to floating point. The builtin `float` is
# used as dtype because the `np.float` alias was deprecated in NumPy 1.20 and
# removed in NumPy 1.24 (both map to 64-bit floats).
source = np.load(data_location).astype(float)
# Columns 1 up to and including 20 are the input features
X = source[:, 1:21]
# The sixth column counted from the end is the target
y = source[:, -6]

In [None]:
# Standardise every feature column in place: subtract the column mean, then
# divide by the column standard deviation.
# NOTE(review): a constant column has zero standard deviation and would turn
# into NaN here - confirm that no feature is constant.
# NOTE(review): the statistics are taken over all rows, i.e. before
# create_sets splits off the test set - confirm this is intended.
X -= np.mean(X, axis=0)
X /= np.std(X, axis=0)

In [None]:
# Both logbooks start with the same CSV header line. Each file is opened for
# writing (truncating any previous content), receives the header and is
# flushed immediately. The handles stay open for the rest of the notebook.
csv_header = "iteration,trainsize,score,accuracy,auc,brier,f1,precision,recall\n"

log_active = open(output_location_active, 'w')
log_active.write(csv_header)
log_active.flush()

log_random = open(output_location_random, 'w')
log_random.write(csv_header)
log_random.flush()

---
## Functions
### Train classifier and create metrics

In [None]:
def log_test_results(log, prediction):
    """Write the items of `prediction` as one comma-separated line to `log`.

    Every item is converted with str(). The line is terminated with a
    newline and the log is flushed afterwards.
    """
    row = ','.join([str(value) for value in prediction])
    log.write(row)
    log.write("\n")
    log.flush()

In [None]:
def train_and_test(X_train, y_train, X_test, y_test, do_log=False):
    """Train a fresh random forest and evaluate it on the test set.

    Parameters:
        X_train, y_train: features and labels the classifier is fitted on.
        X_test, y_test: features and labels the classifier is evaluated on.
        do_log: accepted for backward compatibility; it is not used.

    Returns:
        A tuple (est, performance) with the fitted classifier and a dict
        holding the keys "score", "acc", "auc", "brier", "f1", "prec" and
        "recall".

    The ROC AUC and the Brier score are computed from the predicted
    probabilities. Feeding them hard class labels reduces the ROC curve to a
    single operating point and makes the Brier score equal to the error rate.
    """
    # Create and train algorithm
    est = RandomForestClassifier()
    est.fit(X_train, y_train)
    # Hard class predictions for the threshold-based metrics
    y_pred = est.predict(X_test)
    # Probability of the class in the second column of predict_proba.
    # Assumes binary labels whose positive class sorts last (e.g. 0/1).
    y_prob = est.predict_proba(X_test)[:, 1]
    # Calculate performance
    score = est.score(X_test, y_test)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    auc = metrics.roc_auc_score(y_test, y_prob)
    # Passed positionally: the keyword name of this argument differs between
    # scikit-learn versions
    brier = metrics.brier_score_loss(y_test, y_prob)
    fone = metrics.f1_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    # Return performance metrics
    return (est, {
        "score": score,
        "acc": accuracy,
        "auc": auc,
        "brier": brier,
        "f1": fone,
        "prec": precision,
        "recall": recall
    })

### Create initial data sets for each iteration

In [None]:
def create_sets(X, y, size_init, size_test):
    """Split the data into an initial training set, a pool and a test set.

    train_test_split first separates the test set from the rest. The rest
    comes back shuffled, so its first `size_init` rows form the initial
    training set and everything after that forms the pool.

    Returns:
        (X_train, y_train, X_pool, y_pool, X_test, y_test)
    """
    X_rest, X_test, y_rest, y_test = train_test_split(X, y, test_size=size_test)
    # Row selectors for the initial training set and for the remaining pool
    head = slice(None, size_init)
    tail = slice(size_init, None)
    return (
        X_rest[head, :],
        y_rest[head],
        X_rest[tail, :],
        y_rest[tail],
        X_test,
        y_test,
    )

### Log results

In [None]:
def log_results(log, iteration, size, performance):
    """Append one CSV row with the metrics of a single training round.

    The row holds the iteration number, the training set size and then the
    "score", "acc", "auc", "brier", "f1", "prec" and "recall" entries of
    `performance`, in that order. The log is flushed afterwards.
    """
    metric_keys = ("score", "acc", "auc", "brier", "f1", "prec", "recall")
    metric_values = [performance[key] for key in metric_keys]
    row_template = "{},{},{},{},{},{},{},{},{}\n"
    log.write(row_template.format(iteration, size, *metric_values))
    log.flush()

In [None]:
def log_uncertainties(log, uncertainties):
    """Write `uncertainties` as one comma-separated line and flush the log."""
    fields = [str(value) for value in uncertainties]
    line = ','.join(fields) + "\n"
    log.write(line)
    log.flush()

In [None]:
def log_label_predtruth(log, prediction, truth):
    """Write predictions followed by true labels as one comma-separated line.

    All items of `prediction` come first, then a comma, then all items of
    `truth`. The line ends with a newline and the log is flushed afterwards.
    """
    predicted_part = ','.join([str(item) for item in prediction])
    log.write(predicted_part + ',')
    truth_part = ','.join([str(item) for item in truth])
    log.write(truth_part)
    log.write("\n")
    log.flush()

### Increase data size

In [None]:
def increment_dataset_random(X_train, y_train, X_pool, y_pool, size_iter):
    """Move `size_iter` randomly chosen pool points into the training set.

    Returns:
        (X_train, y_train, X_pool, y_pool) after the move.
    """
    # Draw one uniform random key per pool point; ordering the points by
    # these keys gives a random order of the pool
    n_pool = X_pool.shape[0]
    random_keys = np.random.rand(n_pool)
    order = np.argsort(random_keys)[::-1]
    # Move the first size_iter points of that order to the training set
    grown = increment_dataset(order, X_train, y_train, X_pool, y_pool, size_iter)
    # The selected labels (last element) are not needed by the caller
    return grown[:4]

def increment_dataset_active(est, X_train, y_train, X_pool, y_pool, size_iter, log_uncertainty):
    """Move the `size_iter` most uncertain pool points into the training set.

    The uncertainty of a pool point is 0.5 minus the distance between 0.5
    and the second column of `est.predict_proba`, so it is largest when that
    probability equals 0.5. `log_uncertainty` is accepted but not used.

    Returns:
        (X_train, y_train, X_pool, y_pool) after the move.
    """
    # Predicted probability from the second predict_proba column
    proba = est.predict_proba(X_pool)[:, 1]
    distance_to_half = np.abs(proba - 0.5)
    uncertainty = 0.5 - distance_to_half
    # Reversed ascending argsort: most uncertain points come first
    ranking = np.argsort(uncertainty)[::-1]
    # Move the top size_iter points of that ranking to the training set
    grown = increment_dataset(ranking, X_train, y_train, X_pool, y_pool, size_iter)
    # The selected labels (last element) are not needed by the caller
    return grown[:4]

def increment_dataset(indices, X_train, y_train, X_pool, y_pool, size_iter):
    """Move the first `size_iter` pool points, in `indices` order, to the training set.

    `indices` gives the order in which the pool rows are considered. The
    rows named by its first `size_iter` entries are appended to the training
    data; the rows named by the remaining entries form the new pool, kept in
    `indices` order.

    Returns:
        (X_train, y_train, X_pool, y_pool, y_selected), where y_selected
        holds the labels of the points that were moved.
    """
    # Split the ordering into the part that is selected and the part that stays
    chosen = indices[:size_iter]
    remaining = indices[size_iter:]
    # Points that leave the pool
    X_selected = X_pool[chosen, :]
    y_selected = y_pool[chosen]
    # Append them to the training data
    X_train = np.vstack((X_train, X_selected))
    y_train = np.hstack((y_train, y_selected))
    # What is left of the pool, in the order given by indices
    X_pool = X_pool[remaining, :]
    y_pool = y_pool[remaining]
    return (X_train, y_train, X_pool, y_pool, y_selected)

---
## Main program

In [None]:
# Repeat the whole experiment n_iterations times. Every repetition draws new
# data sets and then grows a random-sampling and an active-learning training
# set side by side, logging the test performance of both after every step.
for iteration in range(n_iterations):
    print("ITERATION {}".format(iteration))
    # Create initial data sets
    # Create data sets for active learning
    X_active, y_active, X_active_pool, y_active_pool, X_test, y_test = create_sets(X, y, size_init, size_test)
    # Copy initial states of active learning to create initial states for random sampling,
    # so that both strategies start from the same training set and pool
    X_random, y_random = copy.deepcopy(X_active), copy.deepcopy(y_active)
    X_random_pool, y_random_pool = copy.deepcopy(X_active_pool), copy.deepcopy(y_active_pool)

    # Determine stopping criterion
    # NOTE(review): with size_max set, the size of the *pool* is compared with
    # size_max. If size_max is meant as a maximum training set size, this
    # comparison needs to be revisited - confirm the intended meaning.
    if size_max == -1:
        continue_run = X_active_pool.shape[0] >= size_iter
    else:
        continue_run = X_active_pool.shape[0] >= size_max

    # Run for as long as the stopping criterion allows
    while continue_run:
        """ Random Sampling """
        # Get performance of trained estimator
        _, performance = train_and_test(X_random, y_random, X_test, y_test, False)
        # Log results
        log_results(log_random, iteration, X_random.shape[0], performance)
        # Increment training data by adding data from the pool
        X_random, y_random, X_random_pool, y_random_pool = increment_dataset_random(X_random, y_random, X_random_pool, y_random_pool, size_iter)
        # Store result for printing
        result_random = performance["acc"]

        """ Active Learning """
        # Get performance of trained estimator; the estimator itself is kept
        # because it is needed to rank the pool points below
        estimator, performance = train_and_test(X_active, y_active, X_test, y_test, True)
        # Log results
        log_results(log_active, iteration, X_active.shape[0], performance)
        # Print results: training set size, random sampling accuracy, active learning accuracy
        print("  {:<7}{:<10}{:<10}".format(X_active.shape[0], round(result_random, 5), round(performance["acc"],5)))
        # Increment training data by adding data from the pool
        X_active, y_active, X_active_pool, y_active_pool = increment_dataset_active(estimator, X_active, y_active, X_active_pool, y_active_pool, size_iter, None)

        # Determine stopping criterion (same rule as before the loop; only the
        # active learning pool is checked)
        if size_max == -1:
            continue_run = X_active_pool.shape[0] >= size_iter
        else:
            continue_run = X_active_pool.shape[0] >= size_max

---
## Load results

In [None]:
# Read both logbooks back from disk as DataFrames.
# NOTE(review): the name `random` would shadow the standard-library module of
# the same name if that module were ever imported in this notebook.
active = pd.read_csv(output_location_active)
random = pd.read_csv(output_location_random)

---
## Plot results

In [None]:
# Initialise arrays. Iterations are numbered from 0 in the logs, so the
# highest iteration number plus one is the number of repetitions. Every
# repetition is expected to have logged the same number of rows.
# Note: this replaces the configured n_iterations with the value found in the log.
n_iterations = int(active['iteration'].max()) + 1
n_per_iteration = active.shape[0] // n_iterations
al = np.zeros((n_per_iteration, n_iterations))
rs = np.zeros((n_per_iteration, n_iterations))

# Fill arrays: one column per repetition, one row per logged training step
for i in range(n_iterations):
    print(active[active['iteration']==i]['accuracy'].shape)
    al[:,i] = active[active['iteration']==i]['accuracy']
    rs[:,i] = random[random['iteration']==i]['accuracy']

# Get x axis: the training set sizes logged for the first repetition
x = active[active['iteration'] == 0]['trainsize']

# Plot lines and bands: the line is the mean accuracy over the repetitions,
# the band spans the minimum and maximum accuracy.
# plt.clf() is deliberately not called before plt.figure(): without an open
# figure it would first create an empty default figure, which then shows up
# as a blank plot next to the real one.
plt.figure(figsize=(16,10))
for label,arr in zip(('Active learning', 'Random sampling'),(al, rs)):
    plt.plot(x,np.mean(arr, axis=1), label=label)
    band_min = np.amin(arr, axis=1)
    band_max = np.amax(arr, axis=1)
    plt.fill_between(x,band_min, band_max, alpha=0.3)
plt.legend()
# Fixed x range; adjust the upper limit when the data set size changes
plt.xlim([0,309000])
plt.xlabel("Train size")
plt.ylabel("Accuracy")
plt.show()