# Infinite pool QBC

### Import packages
**Keras**

In [None]:
from keras.models import Sequential, load_model
from keras.layers.core import Dense, Activation, Lambda
from keras.layers import Dropout
from keras.callbacks import History, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras.utils import to_categorical
from keras import regularizers
from keras import initializers
import keras.backend as K

**Data Science**

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import sklearn.metrics as metrics
import sklearn
print(sklearn.__version__)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import seaborn as sns
sns.set()

**Other**

In [None]:
from copy import deepcopy
from math import floor
import time

---
### Functions
**Plot Projection**

In [None]:
def plot_projection(data, classification, x=0, y=1, bins=100):
    """Show the fraction of allowed points in a 2D projection of `data`.

    The points are binned in the plane spanned by columns `x` and `y`; every
    bin is coloured by allowed / (allowed + excluded), where a point counts
    as allowed when its `classification` equals 1.0 and as excluded when it
    equals 0.0. The smallest and largest bin value are printed as a tuple.

    Parameters
    ----------
    data : 2D array, one row per point.
    classification : 1D array of 0.0/1.0 labels, aligned with the rows of `data`.
    x, y : column indices used for the horizontal and vertical axis.
    bins : number of bin *edges* per axis (so there are `bins - 1` bins).
    """
    # Calculate ranges and bin edges
    xmin, xmax = np.amin(data[:, x]), np.amax(data[:, x])
    ymin, ymax = np.amin(data[:, y]), np.amax(data[:, y])
    xbins = np.linspace(xmin, xmax, bins)
    ybins = np.linspace(ymin, ymax, bins)

    # Calculate one histogram per class
    is_allowed = classification == 1.0
    is_excluded = classification == 0.0
    allowed, _, _ = np.histogram2d(data[is_allowed, x], data[is_allowed, y], [xbins, ybins])
    excluded, _, _ = np.histogram2d(data[is_excluded, x], data[is_excluded, y], [xbins, ybins])

    # Calculate map. Bins without any point evaluate to 0/0; they are kept as
    # NaN (drawn as blank cells) but the floating point warning is silenced.
    with np.errstate(divide="ignore", invalid="ignore"):
        mapping = allowed / (allowed + excluded)
    # histogram2d puts x on the first axis; transpose and flip so that x runs
    # along the columns and y increases upwards in the image.
    mapping = np.flipud(mapping.T)

    # Plot
    f, a = plt.subplots(1, 1, figsize=(8, 8))
    a.matshow(mapping, extent=(xmin, xmax, ymin, ymax), cmap="seismic_r")
    # NaN-aware extrema: a single empty bin would otherwise turn the printed
    # range into (nan, nan).
    if np.isnan(mapping).all():
        print("({}, {})".format(np.nan, np.nan))
    else:
        print("({}, {})".format(np.nanmin(mapping), np.nanmax(mapping)))
    plt.show()

**Normalisation**

In [None]:
# Per-feature lower (`mins`) and upper (`maxs`) bounds, 19 entries each.
# They define the range from which generate_data draws samples and the
# centre/scale used by normalize and undo_normalize.
mins = np.array([-3999.99, -4000.0, 200.0, 90.879999, 91.419997, 90.02, 90.250001, 200.02999, 102.65, 200.00984, 264.01999, 200.0199, 100.06994, -7998.919, -3999.97, -3999.97, -4381.715, 1364.5547, 2.6724445])
maxs = np.array([3999.95, 3999.83, 3999.98, 4000.0, 3999.98, 3999.99, 4000.0, 3999.9899, 3999.9999, 3999.98, 3999.9999, 3999.97, 4000.0, 7993.83, 3999.95, 3999.94, 4132.4793, 37569812.0, 66.371989])

def normalize(data):
    """Centre and scale `data` feature-wise with the global `mins`/`maxs`.

    Every feature is shifted by the midpoint of its [min, max] range and
    divided by (max - min)/sqrt(12), i.e. the standard deviation of a uniform
    distribution over that range. A new array is returned; `data` itself is
    not modified.
    """
    centre = (maxs + mins)/2
    spread = (maxs - mins)/np.sqrt(12)
    return (data - centre)/spread

def undo_normalize(data):
    """Map normalised `data` back to the original feature ranges.

    Inverse of `normalize`: multiplies by (max - min)/sqrt(12) and adds the
    midpoint of each feature range, both taken from the global `mins`/`maxs`.
    A new array is returned; `data` itself is not modified.
    """
    centre = (maxs + mins)/2.0
    spread = (maxs - mins)/np.sqrt(12)
    return data*spread + centre

**Oracle**

In [None]:
# Pre-trained Keras model that serves as the labelling oracle
oracle_model = load_model("susyai.hdf5")


def oracle(data):
    """Label every row of `data` with the oracle model.

    Returns a float array holding 1.0 where column 1 of the model prediction
    is larger than 0.5 and 0.0 everywhere else.
    """
    scores = oracle_model.predict(data)
    above_threshold = scores[:, 1] > 0.5
    # Multiplying by 1.0 turns the boolean mask into floats
    return 1.0*above_threshold

**Generate data**

In [None]:
def generate_data(N, normalized=True):
    """Draw `N` random points inside the feature bounds and label them.

    Points are sampled uniformly between the global `mins` and `maxs`
    (one column per entry of `mins`), normalised with `normalize` and
    labelled by `oracle`.

    Parameters
    ----------
    N : number of points to generate.
    normalized : when True (default) the normalised coordinates are returned,
        otherwise the coordinates in the original feature ranges.

    Returns
    -------
    (X, y) : the coordinates, shape (N, len(mins)), and the float labels.
    """
    # Number of features follows the bounds instead of a hard-coded 19
    X = np.random.rand(N, len(mins))
    X *= (maxs - mins)
    X += mins
    # The oracle always receives normalised input, whatever is returned
    Xnormed = normalize(X)
    # `np.float` was only an alias of the builtin `float` and has been removed
    # from NumPy (1.24+); the builtin gives the same float64 result.
    y = oracle(Xnormed).astype(float)
    if normalized:
        return (Xnormed, y)
    return (X, y)

**Train Model**

In [None]:
def train_model(Xtrain, ytrain):
    """Fit a fresh 200-tree random forest (n_jobs=-1) and return it."""
    # fit() returns the estimator itself, so the call can be chained
    return RandomForestClassifier(n_estimators=200, n_jobs=-1).fit(Xtrain, ytrain)

**Test Model**

In [None]:
def test_model(model, Xtest):
    """Predict on `Xtest` and derive an uncertainty score per point.

    Returns a tuple (prediction, info): `prediction` is column 1 of
    `model.predict_proba(Xtest)` and `info` is 1 - 2*|prediction - 0.5|,
    which is 1 for a prediction of exactly 0.5 and 0 for a prediction of
    0 or 1.
    """
    positive_probability = model.predict_proba(Xtest)[:, 1]
    distance_from_half = np.abs(positive_probability - 0.5)
    uncertainty = 1 - 2*distance_from_half
    return (positive_probability, uncertainty)

**Show model uncertainty results**

In [None]:
def uncertainty_results(X, truth, prediction, info_measure, x=0, y=2):
    """Print array shapes and draw a 2x2 overview of the model results.

    The four scatter plots share the projection on columns `x` and `y` of
    `X` and are coloured by, respectively, the truth, the prediction, the
    absolute difference between the two and the uncertainty measure.

    Parameters
    ----------
    X : 2D array of points, one row per point.
    truth : true labels of the points.
    prediction : model predictions for the points.
    info_measure : uncertainty value per point.
    x, y : column indices of `X` used for the horizontal and vertical axis.
    """
    # Report the shapes of the arguments themselves (not of notebook globals)
    print("{:<20}{}".format("truth.shape", truth.shape))
    print("{:<20}{}".format("pred.shape", prediction.shape))
    print("{:<20}{}".format("info.shape", info_measure.shape))

    # Flatten once so that a (N,) array combined with a (N,1) array cannot
    # broadcast into an N x N matrix in the difference panel.
    truth_flat = truth.ravel()
    prediction_flat = prediction.ravel()

    f, a = plt.subplots(2, 2, figsize=(16, 16))
    a[0, 0].scatter(X[:, x], X[:, y], c=truth_flat, cmap="seismic_r")
    a[0, 0].set_title("Truth")
    a[0, 1].scatter(X[:, x], X[:, y], c=prediction_flat, cmap="seismic_r")
    a[0, 1].set_title("Prediction")
    a[1, 0].scatter(X[:, x], X[:, y], c=np.abs(truth_flat - prediction_flat), cmap="Reds")
    a[1, 0].set_title("Difference")
    a[1, 1].scatter(X[:, x], X[:, y], c=info_measure, cmap="Purples")
    a[1, 1].set_title("Uncertainty")
    plt.show()

---
## Full grid search on step size and candidate pool size

In [None]:
# Configuration of the grid search over step size and candidate pool size.
size_start = 10000                        # Size of the initial training set of every run
list_iter = [500, 2500, 5000, 7500]       # Step sizes: number of points requested from the sampler per step
list_sample = [1e3, 5e3, 1e4, 5e4, 1e5]   # Candidate pool sizes (cast to int in the loop); pools smaller than the step size are skipped
size_max = 100000                         # A run stops once the training set has at least this many points
size_test = 1000000                       # Size of the test set generated for every run
niterations = 7                           # Number of repeated runs per (step size, pool size) combination

In [None]:
# Open the grid-search log in write mode (an existing file is truncated) and
# write the CSV header. Every write is flushed immediately so the file stays
# readable while the search is still running.
# NOTE(review): the handle is not closed anywhere in the visible code --
# consider calling log_AL.close() once the grid search has finished.
log_AL = open("log_active_learning_grid.csv", "w")
log_AL.write("stepsize,samplesize,iteration,size,bce,acc\n")
log_AL.flush()

def log_result(log, size_iter, size_sample, iteration, size, bce, acc):
    """Write one comma-separated result row to `log` and flush it.

    The values are written in the order size_iter, size_sample, iteration,
    size, bce, acc, followed by a newline.
    """
    fields = (size_iter, size_sample, iteration, size, bce, acc)
    row = "{},{},{},{},{},{}\n".format(*fields)
    log.write(row)
    log.flush()

In [None]:
# Grid search: for every (step size, candidate pool size) combination the
# active learning procedure is repeated `niterations` times. Each repetition
# starts from a fresh random training set and a fresh test set, and keeps
# training / sampling until the training set reaches `size_max` points.
# NOTE(review): `model_performance` and `active_sampling` are not defined in
# this part of the notebook; they must be available before this cell runs.
import os

# np.savetxt does not create missing directories; make sure the output
# folder exists before hours of training are spent.
os.makedirs("arrays", exist_ok=True)

for size_iter in list_iter:
    for size_sample in list_sample:
        size_sample = int(size_sample)
        # The candidate pool has to be at least as large as the step size
        if size_sample < size_iter:
            continue
        for iteration in range(niterations):
            print("STEP SIZE: {}".format(size_iter))
            print("SAMPLE SIZE: {}".format(size_sample))
            print("ITERATION {}".format(iteration))

            # Data set creation
            Xtrain_AL, ytrain_AL = generate_data(size_start)
            # Copy of the initial set; not used further in this loop
            Xtrain_RS, ytrain_RS = deepcopy(Xtrain_AL), deepcopy(ytrain_AL)
            Xtest, ytest = generate_data(size_test)

            while len(Xtrain_AL) < size_max:

                # --- ACTIVE LEARNING ---
                # Train model for active learning
                model = train_model(Xtrain_AL, ytrain_AL)
                # Test model performance
                performance_AL = model_performance(model, Xtest, ytest)
                # Training set size this performance belongs to; recorded
                # before new points are appended so that the log file and the
                # screen output always agree.
                size_trained = len(Xtrain_AL)
                # Store performance
                log_result(log_AL, size_iter, size_sample, iteration, size_trained, performance_AL["bce"], performance_AL["acc"])
                # Active sampling of new points
                Xnew, ynew = active_sampling(model, size_sample, size_iter, random_fraction=0.0)
                # Append new points to active learning
                Xtrain_AL = np.vstack((Xtrain_AL, Xnew))
                ytrain_AL = np.hstack((ytrain_AL, ynew))

                # --- LOG AND OUTPUT RESULTS ---
                # Screen
                print("stepsize: {:<5}    samplesize: {:5}    iteration: {:<3}  size: {:<5}    al-bce: {:<7}    al-acc: {:<7}".format(
                    size_iter,
                    size_sample,
                    iteration,
                    size_trained,
                    round(performance_AL["bce"],5),
                    round(performance_AL["acc"],5)
                ))
            # Output data sets to file. The sample size is part of the file
            # name: without it every pool size of the same step size and
            # iteration overwrote the same file.
            al = np.hstack((Xtrain_AL, ytrain_AL.reshape(-1,1)))
            np.savetxt("arrays/step_{}_sample_{}_{}.csv".format(size_iter, size_sample, iteration), al, delimiter=",")

---
## Plots

In [None]:
# Use an 8-colour HLS palette for all following plots (`l` is the lightness
# argument of seaborn's hls_palette).
sns.set(palette=sns.hls_palette(8, l=.4))

### Grid search

In [None]:
# Load the grid-search log
raw = pd.read_csv("log_active_learning_grid.csv")


def _sorted_levels(column):
    """Return the distinct values of raw[column] as a sorted pandas Index."""
    return raw[column].value_counts().keys().sort_values()


# Distinct step sizes, sample sizes and iteration numbers found in the log.
# NOTE(review): `niterations` is rebound here from the integer set in the
# grid-search configuration to an Index of iteration numbers.
stepsizes = _sorted_levels('stepsize')
samplesizes = _sorted_levels('samplesize')
niterations = _sorted_levels('iteration')

***Heatmap***

In [None]:
# Prepare dataframes for the heatmap.
# results[i, j, k] holds the last logged accuracy of step size i, sample
# size j and the k-th iteration number; combinations without any logged row
# stay NaN.
# The third axis follows the iterations found in the log instead of a
# hard-coded 7, and is indexed by position so that non-contiguous iteration
# numbers cannot fall outside the array.
results = np.full((len(stepsizes), len(samplesizes), len(niterations)), np.nan)

# Fill arrays
for i, step in enumerate(stepsizes):
    for j, sample in enumerate(samplesizes):
        for k_pos, k in enumerate(niterations):
            selection = raw[(raw['stepsize']==step) & (raw['samplesize']==sample) & (raw['iteration']== k)]['acc']
            # Skipped grid points have no rows; test for that explicitly
            # rather than swallowing every exception.
            if not selection.empty:
                results[i, j, k_pos] = selection.iloc[-1]

print(results)
# Create dataframes: mean and standard deviation over the iterations.
# A combination with a missing iteration yields NaN in both frames.
df_mean = pd.DataFrame(np.mean(results, axis=2), columns=samplesizes, index=stepsizes)
df_diff = pd.DataFrame(np.std(results, axis=2), columns=samplesizes, index=stepsizes)

In [None]:
# Draw `df_mean` as an annotated heatmap and save the figure as PNG.
cmap = sns.cubehelix_palette(start=2.8, rot=.0, reverse=True, as_cmap=True)
heatmap_options = dict(
    vmin=0.887,
    vmax=0.919,
    linewidths=.5,
    cmap=cmap,
    square=True,
    annot=True,
    fmt='f',
    cbar_kws={"label":"accuracy"},
)

plt.figure(figsize=(10,6))
plt.title("Best gained accuracy (mean)")
sns.heatmap(df_mean, **heatmap_options)
plt.xlabel("size_sample")
plt.ylabel("size_select")
plt.savefig("best_gained_accuracy.png", bbox_inches='tight')

In [None]:
# For every step size: plot the accuracy as a function of the training set
# size, one line per sample size. The line is the mean over the iterations
# and the band spans the minimum and maximum over the iterations.
for step_size in stepsizes:
    print("Step size: {}".format(step_size))
    df = raw[raw['stepsize']==step_size]
    # Number of logged rows per sample size. Kept in its own name so that the
    # global `samplesizes` index used by the heatmap cell is not overwritten.
    sample_counts = df['samplesize'].value_counts()
    n_samplesizes = sample_counts.count()
    # Largest row count; assumes every sample size logged the same number of
    # rows -- TODO confirm for interrupted runs.
    n_steps = sample_counts.iloc[0]
    n_iterations = df['iteration'].value_counts().count()

    # Create array: (sample size, training step, iteration)
    accuracies = np.zeros((n_samplesizes, int(n_steps // n_iterations), n_iterations))

    # Fill array
    sizes = sample_counts.keys().sort_values()
    for i, size in enumerate(sizes):
        for iteration in range(n_iterations):
            # One combined mask instead of chained boolean indexing, which
            # makes pandas reindex the second mask and emit a UserWarning.
            mask = (df['samplesize'] == size) & (df['iteration'] == iteration)
            accuracies[i, :, iteration] = df.loc[mask, 'acc'].values

    # Get x axis
    x = df['size'].value_counts().keys().sort_values()
    sns.set(palette=sns.hls_palette(n_samplesizes, l=.4))

    # Plot lines and bands. A new figure is created directly; a preceding
    # plt.clf() would first create (and later show) an empty figure.
    plt.figure(figsize=(16,10))
    for i, size in enumerate(sizes):
        plt.plot(x, np.mean(accuracies[i], axis=1), label='sample size: {}'.format(size))
        band_min = np.amin(accuracies[i], axis=1)
        band_max = np.amax(accuracies[i], axis=1)
        plt.fill_between(x, band_min, band_max, alpha=0.3)
    plt.xlabel("Train size")
    plt.ylabel("Accuracy")
    plt.title("Accuracy development for different sample sizes (step size: {})".format(step_size))
    plt.legend()
    plt.show()