In [None]:
import numpy as np
from numpy.random import random
from numpy import vstack, hstack
import pandas as pd
from sklearn.datasets import make_blobs, make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from modAL.models import ActiveLearner
from modAL.uncertainty import entropy_sampling, entropy_sampling, margin_sampling
from modAL.batch import uncertainty_batch_sampling
from modAL.uncertainty import classifier_uncertainty, classifier_margin, classifier_entropy
from modAL.utils.selection import multi_argmax
from Models import models, plot, sampling, expgen
from Data.datasets import save_obj, load_obj, data_preprocess
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from mpl_toolkits.mplot3d import Axes3D

### Import experiment data, state space, and ML model

In [None]:
# Load experiment data and the concentration state space.

# Tested points in real concentration units (kept for plotting): initial
# sampling plus the first active-learning (AL) round.
df_init = load_obj('8 reagent concentration_initial sampling')
df_1stAL = load_obj('8 reagent concentration_1stAL_RF').drop(columns=['uncertainty'])
df = pd.concat([df_init, df_1stAL])

# The same tested points, standardized (used to train the ML model).
df_init_std = load_obj('8 reagent concentration_initial sampling_standardized')
df_1stAL_std = load_obj('8 reagent concentration_1stAL_standardized_RF').drop(columns=['uncertainty'])
df_std = pd.concat([df_init_std, df_1stAL_std])

# Scores for the tested points; the CSV "Index" column becomes the row index
# so the score table can be compared against the feature tables.
df_score_init = pd.read_csv('Data/initial sampling_score.csv')
df_score_1stAL = pd.read_csv('Data/1st AL_score.csv')
df_score = pd.concat([df_score_init, df_score_1stAL]).set_index('Index')
df_score.index.name = None  # keep the index unnamed

# Concentration state space: real units and standardized.
df_pool = load_obj('8R homogeneous concentration statespace (Pb2, morph, H2O and FAH constrained)')
df_pool_std = load_obj('8R homogeneous concentration statespace_standardized (Pb2, morph, H2O and FAH constrained)')

# NOTE(review): this check is only printed and compares `df`, not `df_std`
# (the frame actually used for training) -- confirm both share the same order.
index_matches = (df.index == df_score.index).all()
print('The index of score table matches the index of feature table?', index_matches)

# Remove the already-tested points from both versions of the state space.
tested_labels = list(df_std.index)
df_pool = df_pool.drop(index=tested_labels)
df_pool_std = df_pool_std.drop(index=tested_labels)

In [None]:
# Load the pre-tuned classifier (hyperparameters already set) and train it on all tested points.
# NOTE(review): PearsonVII_kernel is not referenced in this cell; presumably it is imported
# so that load_obj can resolve stored models that depend on it -- confirm before removing.
from Models.models import PearsonVII_kernel
cf = load_obj('RandomForestClassifier_best') # choose the ML model
# Fit on the standardized tested points, using the score table as labels.
cf.fit(df_std,df_score)
# Store the fitted model under a dated name.
save_obj(cf,'RFClassifier_2AL_trained_12152020')

### Active learning: uncertainty query

In [None]:
# Calculate prediction uncertainty for all points in the pool.
# Any 'uncertainty' column left over from a previous run of this cell is
# dropped first, so the classifier always receives only the feature columns
# and the cell can be re-run safely.
pool_features_std = df_pool_std.drop(columns=['uncertainty'], errors='ignore')
uncernlst = classifier_uncertainty(cf, pool_features_std).reshape((len(pool_features_std), 1))

# Add the uncertainty value to both the real and the standardized pool.
# A 1-D view is assigned so each frame gets a plain numeric column.
df_pool_std['uncertainty'] = uncernlst.ravel()
df_pool['uncertainty'] = uncernlst.ravel()

# Overall confidence = mean classification margin over the whole pool.
confidence = classifier_margin(cf, pool_features_std).mean()
print('Current confidence is', confidence)

In [None]:
# Plot grid point ID vs uncertainty
%matplotlib notebook
fig = plt.figure(dpi=100)
ax = fig.add_subplot()
ax.scatter(np.arange(len(df_pool)), df_pool.sort_values(['uncertainty'])['uncertainty'], s = 1, c = 'red')
ax.set_xlabel('Grid points ID')
ax.set_ylabel('Uncertainty')
# plt.savefig('Graphs/Grid point ID (1 to 1674) vs uncertainty_1stAL.svg', format = "svg", transparent=True)

In [None]:
# Plot gird point in 3D space, colored by uncertainty
%matplotlib notebook
cm = plt.cm.get_cmap('jet')
fig = plt.figure(figsize=(7, 4.5),dpi=100)
ax = fig.add_subplot(111, projection='3d')
im = ax.scatter(df_pool.sort_values(['uncertainty'])['Pb'],\
                df_pool.sort_values(['uncertainty'])['morph'],\
                df_pool.sort_values(['uncertainty'])['FAH'],\
                c = df_pool.sort_values(['uncertainty'])['uncertainty'],\
                cmap = cm, alpha = 0.5)
cax = fig.add_axes()
fig.colorbar(im, cax=cax, orientation='vertical')
#plt.savefig('Graphs/Grid point 3D with uncertainty_1stAL.svg', format = "svg", transparent=True)
plt.show()

In [None]:
# Plot the uncertainty distribution in 0.1-wide tiers.
# tier0 holds values below 0.1, tier1 holds [0.1, 0.2), ..., tier7 holds
# everything at or above 0.7.
tier_edges = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
tiers = [[] for _ in range(len(tier_edges) + 1)]
for value in uncernlst.ravel():
    # The number of lower edges at or below the value is the tier index.
    tier_index = int(sum(value >= edge for edge in tier_edges))
    tiers[tier_index].append(value)
tier0, tier1, tier2, tier3, tier4, tier5, tier6, tier7 = tiers

groupnumber = [len(tier) for tier in tiers]
tick_positions = list(range(8))
tick_labels = ('<0.1', '0.1-0.2', '0.2-0.3', '0.3-0.4', '0.4-0.5', "0.5-0.6", "0.6-0.7", ">=0.7")

fig, ax = plt.subplots()
ax.bar(tick_positions, groupnumber)
ax.set_ylim(0, 140000)
ax.set_xlabel('uncertainty')
ax.set_ylabel('counts')
ax.set_xticks(tick_positions)
ax.set_xticklabels(tick_labels)
#plt.savefig('Graphs/Grid point uncertainty distribution.svg', format = "svg", transparent=True)
plt.show()

# Count of points in the three highest tiers (uncertainty of 0.5 and above).
high_uncertainty_count = len(tier5) + len(tier6) + len(tier7)
print("number of points with >0.5 uncertainty (_2ndAL): ", high_uncertainty_count)

Query a fixed number of points

In [None]:
# Use diverse mini-batch active learning
k = 24 # number of query points

# beta = how many high-uncertainty (>= 0.5) candidates are available per query
# point. It is clamped to at least 1: with fewer than k such points the integer
# division would give 0, leaving an empty candidate set for the clustering step.
n_high_uncertainty = len(tier5) + len(tier6) + len(tier7)
beta = max(1, n_high_uncertainty // k)

# pick the top k*beta points based on uncertainty
minbatch = df_pool_std.nlargest(n=beta * k, columns='uncertainty')

In [None]:
# Use uncertainty-weighted k-means clustering to find k centroids among the
# k*beta candidate points.
from sklearn.cluster import KMeans

# NOTE(review): the positional slices assume the first six columns are the
# features and column 6 is 'uncertainty' -- confirm against the pool's columns.
candidate_features = minbatch.iloc[:, :6]
candidate_weights = minbatch.iloc[:, 6]

kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(candidate_features, sample_weight=candidate_weights)
# k centroids; these are not necessarily members of the candidate set
centers = kmeans.cluster_centers_

In [None]:
# For every k-means centroid, find the nearest actual candidate point.
from sklearn.neighbors import NearestNeighbors

# One neighbor per centroid, searched among the top k*beta candidates.
neigh = NearestNeighbors(n_neighbors=1, algorithm='ball_tree')
neigh.fit(minbatch.iloc[:, :6].to_numpy())

# kneighbors returns (distances, indices); the indices are row positions
# within minbatch, not pool index labels.
neighbor_distances, query_idx = neigh.kneighbors(centers)

In [None]:
# Display the selected row positions (positions within minbatch) as a flat array.
query_idx.ravel()

In [None]:
# Translate the row positions within minbatch into pool index labels, then
# pull those rows from each representation of the state space.
query_labels = minbatch.index[query_idx.ravel()]

df_pool_query = df_pool.loc[query_labels]          # real concentrations
df_pool_std_query = df_pool_std.loc[query_labels]  # standardized concentrations

# Same query points from the volume state space.
volume_statespace = load_obj('8R homogeneous volume statespace (Pb2, morph, H2O and FAH constrained)')
df_pool_query_vol = volume_statespace.loc[query_labels]

In [None]:
# Store the selected query points (real and standardized concentrations, each still
# carrying the 'uncertainty' column added to the pool earlier).
save_obj(df_pool_query,"8 reagent concentration_2ndAL_RF")
save_obj(df_pool_std_query,"8 reagent concentration_2ndAL_standardized_RF")

### Plot query points

In [None]:
from Models import plot

df_plot = df_pool_query.filter(['Pb','morph','FAH'])
df_plot['crystal score'] = [5]*24

%matplotlib notebook
plot.plot3d2d(point=np.array(df_plot), x_range = [0, 3], y_range = [0, 5], z_range = [0, 16], \
              xy_loc = -6, xz_loc = 2, yz_loc = -2,\
              x_step = 0.5, y_step = 0.5, z_step = 2, elev = 30, azim = -60, name = '8R_1stAL_pb_morph_FAH')

In [None]:
### Visualize sampling using PCA
from sklearn.decomposition import PCA, KernelPCA

pca = PCA(n_components = 2)
pca.fit(df_pool_std.drop(['uncertainty'],axis=1))
df_pool_std_PCA_2 = pca.transform(df_pool_std.drop(['uncertainty'],axis=1))
df_pool_std_query_2AL_PCA_2 = pca.transform(df_pool_std_query.drop(['uncertainty'],axis=1))

%matplotlib notebook
fig = plt.figure()
ax = fig.add_subplot(111)

# plot whole dataset 
ax.scatter(df_pool_std_PCA_2[:, 0], \
            df_pool_std_PCA_2[:, 1], \
            c = 'gray', \
            s = 1, alpha = 0.5)

# plot the initial sampling
ax.scatter(df_pool_std_query_2AL_PCA_2[:, 0], \
            df_pool_std_query_2AL_PCA_2[:, 1], \
            c = 'red', \
            s = 10, alpha = 1)
plt.xlabel('Component 1')
plt.ylabel('Component 2')

### Generate experiments

In [None]:
# Pass the volume-space rows of the selected query points to the robot file generator.
# NOTE(review): presumably this writes a robot input file named after `filename` -- confirm in Models.expgen.
from Models.expgen import robot_file_gen_R8
robot_file_gen_R8(data = df_pool_query_vol, filename = '8R_2ndAL_robotinput')

In [None]:
# Display the volume-space rows of the selected query points.
df_pool_query_vol