In [None]:
import numpy as np
from numpy.random import random
from numpy import vstack, hstack
import pandas as pd
from sklearn.datasets import make_blobs, make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from modAL.models import ActiveLearner
from modAL.uncertainty import entropy_sampling, entropy_sampling, margin_sampling
from modAL.batch import uncertainty_batch_sampling
from modAL.uncertainty import classifier_uncertainty, classifier_margin, classifier_entropy
from modAL.utils.selection import multi_argmax
from Models import models, plot, sampling, expgen
from Data.datasets import save_obj, load_obj, data_preprocess
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from mpl_toolkits.mplot3d import Axes3D

## Import experiment data, state space, and ML model

In [None]:
# Import experiment data: the tested points (features) from the initial sampling
# and from each completed active-learning (AL) round.
# The AL-round tables carry an 'uncertainty' column (apparently left over from the
# query step that produced them); it is dropped so that every table exposes the
# same feature columns before concatenation.
df_init = load_obj('8 reagent concentration_initial sampling')  # initial concentration
df_1stAL = load_obj('8 reagent concentration_1stAL_RF').drop(columns=['uncertainty'])  # 1st AL concentration
df_2ndAL = load_obj('8 reagent concentration_2ndAL_RF').drop(columns=['uncertainty'])  # 2nd AL concentration
df_3rdAL = load_obj('8 reagent concentration_3rdAL_RF').drop(columns=['uncertainty'])  # 3rd AL concentration
df_4thAL = load_obj('8 reagent concentration_4thAL_RF').drop(columns=['uncertainty'])  # 4th AL concentration
df = pd.concat([df_init, df_1stAL, df_2ndAL, df_3rdAL, df_4thAL])

# Same tested points, standardized feature values.
df_init_std = load_obj('8 reagent concentration_initial sampling_standardized')  # standardized initial concentration
df_1stAL_std = load_obj('8 reagent concentration_1stAL_standardized_RF').drop(columns=['uncertainty'])  # standardized 1st AL concentration
df_2ndAL_std = load_obj('8 reagent concentration_2ndAL_standardized_RF').drop(columns=['uncertainty'])  # standardized 2nd AL concentration
df_3rdAL_std = load_obj('8 reagent concentration_3rdAL_standardized_RF').drop(columns=['uncertainty'])  # standardized 3rd AL concentration
df_4thAL_std = load_obj('8 reagent concentration_4thAL_standardized_RF').drop(columns=['uncertainty'])  # standardized 4th AL concentration
df_std = pd.concat([df_init_std, df_1stAL_std, df_2ndAL_std, df_3rdAL_std, df_4thAL_std])

# Load the experimental scores (labels) for the same rounds.
df_score_init = pd.read_csv('Data/initial sampling_score.csv')
df_score_1stAL = pd.read_csv('Data/1st AL_score.csv')
df_score_2ndAL = pd.read_csv('Data/2nd AL_score.csv')
df_score_3rdAL = pd.read_csv('Data/3rd AL_score.csv')
df_score_4thAL = pd.read_csv('Data/4th AL_score.csv')
df_score = pd.concat([df_score_init, df_score_1stAL, df_score_2ndAL, df_score_3rdAL, df_score_4thAL])
# Use the 'Index' column as the row index so scores line up with the feature tables.
# rename_axis(None) clears the axis name so the index is unnamed, as before.
df_score = df_score.set_index('Index').rename_axis(None)

# Sanity checks: the three tables must describe the same points in the same order,
# because the model is later fitted on (df_std, df_score) row by row.
# Index.equals is used instead of `(a == b).all()` because the element-wise
# comparison raises when the two indexes differ in length instead of reporting False.
print('The index of feature table matches the index of standardized feature table?', df.index.equals(df_std.index))
print('The index of score table matches the index of feature table?', df.index.equals(df_score.index))
print('Total number of training sets', len(df.index))

In [None]:
# Candidate pool = full concentration state space (raw and standardized versions)
# minus every point that has already been tested.
tested_idx = list(df_std.index)  # row labels of all points tested so far

df_pool = load_obj(
    '8R homogeneous concentration statespace (Pb2, morph, H2O and FAH constrained)'
).drop(index = tested_idx)

df_pool_std = load_obj(
    '8R homogeneous concentration statespace_standardized (Pb2, morph, H2O and FAH constrained)'
).drop(index = tested_idx)

In [None]:
# Load the pre-tuned classifier (hyperparameters already fixed), train it on all
# labelled points collected so far, and persist the trained model.
# PearsonVII_kernel is not referenced directly in this cell; presumably it must be
# importable so that stored models using that kernel can be loaded -- TODO confirm.
from Models.models import PearsonVII_kernel

cf = load_obj('RandomForestClassifier_best')  # choose the ML model
cf.fit(X = df_std, y = df_score)              # standardized features -> scores
save_obj(cf, 'RFClassifier_4AL_trained_01052021')

## Active learning: uncertainty query

### Calculate and visualize prediction uncertainty

In [None]:
# Feature-only view of the standardized pool.
# errors='ignore' makes this cell safe to re-run: on a second execution
# df_pool_std already contains the 'uncertainty' column added below, and passing
# it to the classifier as an extra feature would break the prediction.
pool_features_std = df_pool_std.drop(columns=['uncertainty'], errors='ignore')

# Prediction uncertainty for every point in the pool, kept as an (n, 1) column
# vector because later cells consume `uncernlst` in that shape.
uncernlst = classifier_uncertainty(cf, pool_features_std).reshape((len(pool_features_std), 1))

# Attach the uncertainty to both the raw and the standardized pool
# (the two tables share the same rows, so the same values apply to both).
df_pool_std['uncertainty'] = uncernlst.ravel()
df_pool['uncertainty'] = uncernlst.ravel()

# Average prediction confidence, measured as the mean classification margin
# returned by modAL's classifier_margin over the whole pool.
margins = classifier_margin(cf, pool_features_std)
confidence = np.mean(margins)
print('Current confidence is', confidence)

In [None]:
# Plot grid point ID vs uncertainty
%matplotlib notebook
fig = plt.figure(dpi=100)
ax = fig.add_subplot()
ax.scatter(np.arange(len(df_pool)), df_pool.sort_values(['uncertainty'])['uncertainty'], s = 1, c = 'red')
ax.set_xlabel('Grid points ID')
ax.set_ylabel('Uncertainty')

In [None]:
# Pass the pool uncertainties to the project's plot.uncert_bar helper and keep its
# return value. A later cell indexes it at positions 5, 6 and 7, so it is
# presumably a per-tier count of pool points -- TODO confirm against plot.uncert_bar.
# ylim = 140000 is forwarded to the helper (presumably the y-axis limit of the bar plot).
uncern_tier = plot.uncert_bar(uncernlst = uncernlst, ylim = 140000)

### Query pool based on top prediction uncertainty (>50%)

In [None]:
# Number of query points to select in this active-learning round.
# Used below as the number of k-means clusters and to size the candidate set.
k = 24

In [None]:
# Diverse mini-batch active learning:
#   1. keep the beta*k most uncertain pool points as candidates,
#   2. cluster the candidates into k groups with uncertainty-weighted k-means,
#   3. query, for each cluster centroid, the candidate closest to it.
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors

# beta factor: how many candidates per query point. It is derived from
# uncern_tier[5..7], i.e. the three tiers named in the section title as
# "top prediction uncertainty (>50%)" -- NOTE(review): confirm tier layout in plot.uncert_bar.
beta = (uncern_tier[5]+uncern_tier[6]+uncern_tier[7])//k
if beta < 1:
    # Without this guard nlargest() returns an empty frame and k-means fails
    # further down with a much less helpful message.
    raise ValueError('Fewer than k pool points fall into the top uncertainty tiers; cannot build a mini-batch.')
minbatch = df_pool_std.nlargest(n = beta*k, columns = 'uncertainty') # top k*beta points by uncertainty

# Select features and weights by column name rather than by position, so the
# clustering stays correct whatever the number of feature columns in the pool.
minbatch_features = minbatch.drop(columns=['uncertainty'])
minbatch_weights = minbatch['uncertainty']

# k-means clustering to find k centroid points out of the k*beta candidates,
# weighting each candidate by its uncertainty.
kmeans = KMeans(n_clusters = k, random_state=42)
kmeans.fit(minbatch_features, sample_weight=minbatch_weights)
centers = kmeans.cluster_centers_ # k centroids (not necessarily actual candidate points)

# Snap each centroid to its nearest neighbour among the candidates.
# NOTE(review): two centroids can share the same nearest candidate, in which
# case the query batch contains that point twice.
neigh = NearestNeighbors(n_neighbors=1, algorithm='ball_tree') # a single neighbour per centroid
neigh.fit(minbatch_features.to_numpy())
query_idx = neigh.kneighbors(centers)[1] # positions within `minbatch`, shape (k, 1)

# query_idx holds positions in `minbatch` (the uncertainty-sorted candidates),
# not in the pool, so translate them to row labels before indexing the pool tables.
query_labels = minbatch.index[query_idx.ravel()]
df_pool_query = df_pool.loc[query_labels]
df_pool_std_query = df_pool_std.loc[query_labels]

# Same query points expressed in the volume state space.
df_pool_query_vol = load_obj('8R homogeneous volume statespace (Pb2, morph, H2O and FAH constrained)')
df_pool_query_vol = df_pool_query_vol.loc[query_labels]

# Persist the queried points (raw and standardized) for the 5th AL round.
save_obj(df_pool_query,"8 reagent concentration_5thAL_RF")
save_obj(df_pool_std_query,"8 reagent concentration_5thAL_standardized_RF")

### Plot query points

In [None]:
### Visualize sampling using PCA
from sklearn.decomposition import PCA, KernelPCA

pca = PCA(n_components = 2)
pca.fit(df_pool_std.drop(['uncertainty'],axis=1))
df_pool_std_PCA_2 = pca.transform(df_pool_std.drop(['uncertainty'],axis=1))
df_pool_std_query_3AL_PCA_2 = pca.transform(df_pool_std_query.drop(['uncertainty'],axis=1))

%matplotlib notebook
fig = plt.figure()
ax = fig.add_subplot(111)

# plot whole dataset 
ax.scatter(df_pool_std_PCA_2[:, 0], \
            df_pool_std_PCA_2[:, 1], \
            c = 'gray', \
            s = 1, alpha = 0.5)

# plot the initial sampling
ax.scatter(df_pool_std_query_3AL_PCA_2[:, 0], \
            df_pool_std_query_3AL_PCA_2[:, 1], \
            c = 'red', \
            s = 10, alpha = 1)
plt.xlabel('Component 1')
plt.ylabel('Component 2')

### Generate experiments

In [None]:
# Hand the volume table of the queried points to the project's robot_file_gen_R8
# helper. Presumably this writes the robot input file named by `filename`
# -- TODO confirm against Models/expgen.
from Models.expgen import robot_file_gen_R8
robot_file_gen_R8(data = df_pool_query_vol, filename = '8R_5thAL_robotinput')

In [None]:
# Display the queried points (raw concentrations plus their 'uncertainty' column).
df_pool_query