# Infinite-pool QBC (query by committee)

### Import packages
**Keras**

In [None]:
from keras.models import Sequential, load_model
from keras.layers.core import Dense, Activation, Lambda
from keras.layers import Dropout
from keras.callbacks import History, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras.utils import to_categorical
from keras import regularizers
from keras import initializers
import keras.backend as K

**Data Science**

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import sklearn.metrics as metrics
import sklearn
print(sklearn.__version__)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import seaborn as sns
sns.set()

**Other**

In [None]:
from copy import deepcopy
from math import floor
import time

---
### Functions
**Plot Projection**

In [None]:
def plot_projection(data, classification, x=0, y=1, bins=100):
    """Plot the fraction of allowed points in a 2D projection of the data.

    The points are histogrammed in the plane spanned by columns ``x`` and
    ``y`` of ``data``, once for the points labelled 1.0 ("allowed") and once
    for the points labelled 0.0 ("excluded"). The map that is shown is
    ``allowed / (allowed + excluded)`` per bin; its smallest and largest
    finite values are printed.

    Parameters
    ----------
    data : 2D array, one row per point.
    classification : 1D array with one label (1.0 or 0.0) per row of ``data``.
    x, y : column indices of ``data`` used for the horizontal / vertical axis.
    bins : number of bins along each axis.
    """
    # Calculate ranges and bin edges. `bins` bins need `bins + 1` edges.
    xmin, xmax = np.amin(data[:, x]), np.amax(data[:, x])
    ymin, ymax = np.amin(data[:, y]), np.amax(data[:, y])
    xbins = np.linspace(xmin, xmax, bins + 1)
    ybins = np.linspace(ymin, ymax, bins + 1)
    # Calculate two histograms
    is_allowed = classification == 1.0
    is_excluded = classification == 0.0
    allowed, _, _ = np.histogram2d(data[is_allowed, x], data[is_allowed, y], [xbins, ybins])
    excluded, _, _ = np.histogram2d(data[is_excluded, x], data[is_excluded, y], [xbins, ybins])
    # Calculate map. Bins without any point give 0/0; they are kept as NaN
    # on purpose, without emitting a RuntimeWarning for every call.
    with np.errstate(divide="ignore", invalid="ignore"):
        mapping = allowed / (allowed + excluded)
    # histogram2d puts x on the first axis; transpose and flip so that x runs
    # along the columns and y increases upwards in the image.
    mapping = mapping.T
    mapping = np.flipud(mapping)
    # Plot
    f, a = plt.subplots(1, 1, figsize=(8, 8))
    a.matshow(mapping, extent=(xmin, xmax, ymin, ymax), cmap="seismic_r")
    # nanmin/nanmax skip the empty bins, which would otherwise turn both
    # printed values into nan.
    print("({}, {})".format(np.nanmin(mapping), np.nanmax(mapping)))
    plt.show()

**Normalisation**

In [None]:
# Per-feature lower (mins) and upper (maxs) bounds, one entry per input
# feature. generate_data samples uniformly between them and normalize /
# undo_normalize derive their centre and scale from them.
mins = np.array([
    -3999.99,
    -4000.0,
    200.0,
    90.879999,
    91.419997,
    90.02,
    90.250001,
    200.02999,
    102.65,
    200.00984,
    264.01999,
    200.0199,
    100.06994,
    -7998.919,
    -3999.97,
    -3999.97,
    -4381.715,
    1364.5547,
    2.6724445,
])
maxs = np.array([
    3999.95,
    3999.83,
    3999.98,
    4000.0,
    3999.98,
    3999.99,
    4000.0,
    3999.9899,
    3999.9999,
    3999.98,
    3999.9999,
    3999.97,
    4000.0,
    7993.83,
    3999.95,
    3999.94,
    4132.4793,
    37569812.0,
    66.371989,
])

def normalize(data):
    """Centre and scale ``data`` feature-wise using the ``mins``/``maxs`` bounds.

    Each feature is shifted by the midpoint of its range and divided by
    ``(maxs - mins) / sqrt(12)``, which is the standard deviation of a
    uniform distribution over that range. The input is not modified; a new
    array is returned.
    """
    midpoint = (maxs + mins) / 2
    spread = (maxs - mins) / np.sqrt(12)
    return (data - midpoint) / spread

def undo_normalize(data):
    """Map normalised values back to the original feature ranges.

    This is the inverse of ``normalize``: multiply by the per-feature scale
    ``(maxs - mins) / sqrt(12)`` and add the midpoint of the range again.
    The input is not modified; a new array is returned.
    """
    spread = (maxs - mins) / np.sqrt(12)
    midpoint = (maxs + mins) / 2.0
    return data * spread + midpoint

**Oracle**

In [None]:
# Pre-trained Keras model that provides the ground-truth labels
oracle_model = load_model("susyai.hdf5")


def oracle(data):
    """Label ``data`` with the oracle model.

    Takes column 1 of the model output and thresholds it at 0.5, returning
    a float array holding 1.0 where the score exceeds 0.5 and 0.0 elsewhere.
    """
    positive_score = oracle_model.predict(data)[:, 1]
    return 1.0 * (positive_score > 0.5)

**Generate data**

In [None]:
def generate_data(N, normalized=True):
    """Draw ``N`` random points and label them with the oracle.

    Every feature is sampled uniformly between its entry in ``mins`` and
    ``maxs``. The oracle is always evaluated on the normalised points.

    Parameters
    ----------
    N : number of points to generate.
    normalized : if True (default) return the normalised points, otherwise
        return the points in their original ranges.

    Returns
    -------
    (X, y) : the points (one row each) and their float labels.
    """
    # One column per feature; derived from the bounds instead of a literal 19
    # so it cannot drift out of sync with mins/maxs.
    X = np.random.rand(N, mins.size)
    X *= (maxs - mins)
    X += mins
    Xnormed = normalize(X)
    # np.float was only an alias of the builtin float and has been removed
    # from NumPy (1.24); the builtin gives the same dtype on every version.
    y = oracle(Xnormed).astype(float)
    if normalized:
        return (Xnormed, y)
    return (X, y)

**Train Model**

In [None]:
def train_model(Xtrain, ytrain):
    """Fit a fresh 200-tree random forest on the given data and return it.

    ``n_jobs=-1`` is passed to the classifier so it may use all available
    processors.
    """
    # fit() returns the estimator itself, so the fitted forest is returned.
    return RandomForestClassifier(n_estimators=200, n_jobs=-1).fit(Xtrain, ytrain)

**Test Model**

In [None]:
def test_model(model, Xtest):
    """Return the model's class-1 probability and an uncertainty score.

    The uncertainty is ``1 - 2*|p - 0.5|``: it equals 1 where the predicted
    probability ``p`` is exactly 0.5 and drops linearly to 0 at p = 0 or 1.

    Returns
    -------
    (prediction, info) : two arrays with one entry per row of ``Xtest``.
    """
    positive_proba = model.predict_proba(Xtest)[:, 1]
    distance_from_even = np.abs(positive_proba - 0.5)
    uncertainty = 1 - 2 * distance_from_even
    return (positive_proba, uncertainty)

**Show model uncertainty results**

In [None]:
def uncertainty_results(X, truth, prediction, info_measure, x=0, y=2):
    """Print the array shapes and show a 2x2 grid of diagnostic scatter plots.

    All four panels plot column ``x`` of ``X`` against column ``y``. They are
    coloured by the truth, the prediction, the absolute difference between
    the two, and the uncertainty measure respectively.

    Parameters
    ----------
    X : 2D array of points, one row each.
    truth : array of true labels, one per point.
    prediction : array of predicted values, one per point.
    info_measure : array of uncertainty values, one per point.
    x, y : column indices of ``X`` used for the two plot axes.
    """
    # Report the shapes of the arguments themselves, not of unrelated
    # notebook globals.
    print("{:<20}{}".format("truth.shape", truth.shape))
    print("{:<20}{}".format("pred.shape", prediction.shape))
    print("{:<20}{}".format("info.shape", info_measure.shape))

    # Flatten both arrays first so that (N,) and (N, 1) inputs are subtracted
    # element-wise instead of being broadcast to an (N, N) matrix.
    difference = np.abs(truth.ravel() - prediction.ravel())

    f, a = plt.subplots(2, 2, figsize=(16, 16))
    a[0, 0].scatter(X[:, x], X[:, y], c=truth.ravel(), cmap="seismic_r")
    a[0, 0].set_title("Truth")
    a[0, 1].scatter(X[:, x], X[:, y], c=prediction.ravel(), cmap="seismic_r")
    a[0, 1].set_title("Prediction")
    a[1, 0].scatter(X[:, x], X[:, y], c=difference, cmap="Reds")
    a[1, 0].set_title("Difference")
    a[1, 1].scatter(X[:, x], X[:, y], c=info_measure, cmap="Purples")
    a[1, 1].set_title("Uncertainty")
    plt.show()

---
### Active learning
**Configuration**

In [None]:
# Start size of the training set
size_start = 10000
# Number of data points added in each step
size_iter = 2500
# Size of the candidate set that is checked for uncertainty in each step
size_sample = 100000
# Maximum size of the training set; sampling stops once it is reached
size_max = 100000
# Size of validation set
size_val = 1000000
# Size of test set
size_test = 1000000
# Number of iterations (independent repetitions of the whole experiment)
niterations = 7

**Active Sampling function**

In [None]:
def active_sampling(model, Nquery, Nselect, random_fraction=0.5):
    """Pick ``Nselect`` new points and label them with the oracle.

    A share of ``1 - random_fraction`` of the points is selected actively:
    ``Nquery`` random candidates are generated, scored with ``test_model``
    and the candidates with the highest uncertainty are kept. The remaining
    points are drawn at random with ``generate_data``.

    Parameters
    ----------
    model : fitted model, passed on to ``test_model``.
    Nquery : number of random candidates scored for the active part.
    Nselect : total number of points to return; must be positive.
    random_fraction : share of ``Nselect`` that is sampled at random,
        between 0 and 1.

    Returns
    -------
    (selected, prediction) : the chosen points (actively selected rows first)
        and their oracle labels.

    Raises
    ------
    ValueError
        If ``Nselect`` is not positive or ``random_fraction`` lies outside
        the interval [0, 1].
    """
    # Without these checks an empty request reached oracle(None), and an
    # out-of-range fraction silently returned more than Nselect points.
    if Nselect <= 0:
        raise ValueError("Nselect must be positive, got {}".format(Nselect))
    if not 0.0 <= random_fraction <= 1.0:
        raise ValueError(
            "random_fraction must be between 0 and 1, got {}".format(random_fraction)
        )

    batches = []

    # Select actively
    select_active = round((1 - random_fraction) * Nselect)
    if select_active > 0:
        # Get uncertainty measure
        X, _ = generate_data(Nquery)
        _, info = test_model(model, X)
        # Most uncertain candidates first
        keysort = np.argsort(info)[::-1]
        batches.append(X[keysort[:select_active]])

    # Add random
    select_random = Nselect - select_active
    if select_random > 0:
        X, _ = generate_data(select_random)
        batches.append(X)

    # Label and return
    selected = np.vstack(batches)
    prediction = oracle(selected)
    return (selected, prediction)

**Get model performance**

In [None]:
def model_performance(model, Xtest, ytest):
    """Evaluate ``model`` on a test set.

    Parameters
    ----------
    model : fitted classifier that provides ``predict_proba``.
    Xtest : test points, one row each.
    ytest : true labels of the test points.

    Returns
    -------
    dict with the binary cross-entropy under ``"bce"`` and the accuracy at a
    0.5 threshold under ``"acc"``.
    """
    # Use the class-1 probability, as test_model does. predict() on the
    # random forest returns hard class labels, which makes log_loss a
    # clipped 0/1 penalty rather than a cross-entropy.
    proba = model.predict_proba(Xtest)[:, 1]
    bce = log_loss(ytest, proba)
    acc = accuracy_score(ytest, 1.0 * (proba > 0.5))
    return {"bce": bce, "acc": acc}

**Create logbooks**

In [None]:
# Open one logbook per sampling strategy. Mode "w" starts each file empty.
# The handles stay open because the run loop keeps writing rows to them
# through log_result.
log_AL = open("log_active_learning.csv", "w")
log_RS = open("log_random_sampling.csv", "w")

# Both logbooks share the same CSV header; flush so it is on disk right away.
log_AL.write("iteration,size,bce,acc\n")
log_AL.flush()
log_RS.write("iteration,size,bce,acc\n")
log_RS.flush()

def log_result(log, iteration, size, bce, acc):
    """Write one ``iteration,size,bce,acc`` row to ``log`` and flush it.

    ``log`` is any writable text stream, e.g. one of the opened logbooks.
    """
    row = "{},{},{},{}\n".format(iteration, size, bce, acc)
    log.write(row)
    # Flush after every row so partial results are not stuck in the buffer.
    log.flush()

**Run Active Learning**

In [None]:
import os

# np.savetxt does not create missing directories. Make sure the output folder
# exists up front, so a run cannot fail at the very end of an iteration.
os.makedirs("arrays", exist_ok=True)

for iteration in range(niterations):
    print("ITERATION {}".format(iteration))

    # Data set creation: both strategies start from the same labelled points
    Xtrain_AL, ytrain_AL = generate_data(size_start)
    Xtrain_RS, ytrain_RS = deepcopy(Xtrain_AL), deepcopy(ytrain_AL)
    # NOTE(review): Xval/yval are not used anywhere in this loop; kept in case
    # they are read elsewhere -- confirm, since generating them is costly.
    Xval, yval = generate_data(size_val)
    Xtest, ytest = generate_data(size_test)

    while len(Xtrain_AL) < size_max:
        # No K.clear_session() here: the models trained in this loop are
        # scikit-learn forests, so there is no Keras state to free, and
        # resetting the backend could only disturb the preloaded oracle.

        # --- ACTIVE LEARNING ---
        # Train model for active learning
        model = train_model(Xtrain_AL, ytrain_AL)
        # Test model performance
        performance_AL = model_performance(model, Xtest, ytest)
        # Store performance
        log_result(log_AL, iteration, len(Xtrain_AL), performance_AL["bce"], performance_AL["acc"])
        # Active sampling of new points
        Xnew, ynew = active_sampling(model, size_sample, size_iter, random_fraction=0.0)
        # Append new points to active learning
        Xtrain_AL = np.vstack((Xtrain_AL, Xnew))
        ytrain_AL = np.hstack((ytrain_AL, ynew))

        # --- RANDOM SAMPLING ---
        # Train model for random sampling
        model = train_model(Xtrain_RS, ytrain_RS)
        # Test model performance
        performance_RS = model_performance(model, Xtest, ytest)
        # Store performance
        log_result(log_RS, iteration, len(Xtrain_RS), performance_RS["bce"], performance_RS["acc"])
        # Sample new points
        Xnew, ynew = generate_data(size_iter)
        # Append new points to random sampling
        Xtrain_RS = np.vstack((Xtrain_RS, Xnew))
        ytrain_RS = np.hstack((ytrain_RS, ynew))

        # --- LOG AND OUTPUT RESULTS ---
        # Screen. The training set has already grown by size_iter, so subtract
        # it to report the size the models above were trained on.
        print("iteration: {:<5}     size: {:<5}    al-bce: {:<10}    al-acc: {:<10}    rs-bce: {:<10}    rs-acc: {:<10}".format(
            iteration,
            len(Xtrain_AL)-size_iter,
            round(performance_AL["bce"],6),
            round(performance_AL["acc"],6),
            round(performance_RS["bce"],6),
            round(performance_RS["acc"],6)
        ))
    # Output data sets to file: features with the label as the last column
    al = np.hstack((Xtrain_AL, ytrain_AL.reshape(-1,1)))
    rs = np.hstack((Xtrain_RS, ytrain_RS.reshape(-1,1)))
    np.savetxt("arrays/active_{}.csv".format(iteration), al, delimiter=",")
    np.savetxt("arrays/random_{}.csv".format(iteration), rs, delimiter=",")