In [None]:
import numpy as np
from numpy.random import random
from numpy import vstack, hstack
import pandas as pd
from sklearn.datasets import make_blobs, make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from modAL.models import ActiveLearner
from modAL.uncertainty import entropy_sampling, entropy_sampling, margin_sampling
from modAL.batch import uncertainty_batch_sampling
from modAL.uncertainty import classifier_uncertainty, classifier_margin, classifier_entropy
from modAL.utils.selection import multi_argmax
from Models import models, plot, sampling, expgen
from Data.datasets import save_obj, load_obj, data_preprocess
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from mpl_toolkits.mplot3d import Axes3D
from tqdm import tqdm

## Import experiment data, state space, and ML model

In [None]:
# Labeled experiments gathered so far: 2 initial-sampling rounds + 5 active-learning rounds.
labeled_raw = pd.read_csv('Data/005.morph phase mapping.csv')
# Key the rows by the experiment id held in the 'index' column (the index name stays unset).
labeled_raw.index = labeled_raw['index'].tolist()

# Split the table into the outcome labels (df_score) and the model inputs (df_std).
df_score = labeled_raw.filter(items=['score'], axis=1)
df_std = labeled_raw.drop(columns=['index', 'score'])
print('Total number of labeled training sets', len(df_std))

# Candidate state space: one object with the plain name and one tagged "_standardized".
# NOTE(review): both are loaded through the project helper `load_obj`; their exact schema
# is not visible from this notebook.
df_pool = load_obj('8R homogeneous concentration statespace (Pb2, morph, H2O and FAH constrained)')
df_pool_std = load_obj('8R homogeneous concentration statespace_standardized (Pb2, morph, H2O and FAH constrained)')
print('Total number of unlabeled training sets', len(df_pool_std))

# Previously saved classifier that is refit and evaluated further down.
GPC = load_obj('GPC_best')

## KS algorithm: sampling the experimental space with low information density

### Plot query points

In [None]:
# Query points selected for the 6th round (stored under a "..._standardized_KS" name).
df_query = load_obj("8 reagent concentration_6th_standardized_KS")
query_idx = df_query.index.tolist()

# Experimental outcomes of the 6th round, re-keyed by the 'Index' column of the CSV.
score_6_raw = pd.read_csv('Data/6th KS_score.csv')
score_6_raw.index = score_6_raw['Index'].tolist()
df_score_6 = score_6_raw.drop(columns=['Index'])

# Stored t-SNE coordinates with the trailing 168 rows removed, indexed like df_pool.
# NOTE(review): what those last 168 rows represent is not visible here — confirm before
# changing the slice; the remaining row count must equal len(df_pool).
tsne_coords = load_obj('tSNE_5th_50')[:-168]
df_pool_tSNE = pd.DataFrame(data=tsne_coords, index=df_pool.index, columns=['dim 1', 'dim 2'])

# Embedding rows for the already-labeled experiments and for the 6th-round queries.
df_tSNE = df_pool_tSNE.filter(items=df_std.index, axis=0)
df_query_tSNE = df_pool_tSNE.filter(items=query_idx, axis=0)

In [None]:
# plot the t-SNE for the kS sampling
%matplotlib notebook

fig = plt.figure(figsize = (6,6))
ax = fig.add_subplot()

clist = ['blue','green','red','orange']
slist = [40, 40, 40, 40]
alphalist = [1,1,1,1]

for i in range(4):
    ax.scatter(df_tSNE['dim 1'][df_score['score'] == i+1], \
               df_tSNE['dim 2'][df_score['score'] == i+1], \
               c = clist[i], \
               s = slist[i], alpha = alphalist[i])
    
ax.scatter(df_query_tSNE['dim 1'], df_query_tSNE['dim 2'], \
           facecolors='none', edgecolors='black', linewidths = 1.5, s = 40)

plt.xlabel('t-SNE dim 1')
plt.ylabel('t-SNE dim 2')

In [None]:
%matplotlib notebook

fig = plt.figure(figsize = (10,10))
ax = fig.add_subplot()

clist = ['blue','green','red','orange']

for i in range(4):
    ax.scatter(df_tSNE['dim 1'][df_score['score'] == i+1], \
               df_tSNE['dim 2'][df_score['score'] == i+1], \
               c = clist[i], \
               s = 60, alpha = 1)
    
for i in range(4):   
    ax.scatter(df_query_tSNE['dim 1'][df_score_6['score'] == i+1], \
               df_query_tSNE['dim 2'][df_score_6['score'] == i+1], \
               facecolors='none', edgecolors=clist[i], linewidths = 1.5, s = 60)

plt.xlim(-35,35)
plt.ylim(-35,35)
plt.xlabel('t-SNE dim 1')
plt.ylabel('t-SNE dim 2')
plt.savefig('Graphs/5AL_with KS exp marked_tSNE.svg', format = 'svg', transparent = True)

### Generate experiments

In [None]:
# Write the robot input file for the 6th (KS) round via the project helper `robot_file_gen_R8`.
# NOTE(review): `df_pool_query_vol` is not assigned anywhere in the visible notebook, so this
# cell raises NameError after Restart Kernel -> Run All. Define it (presumably the queried
# pool rows expressed as volumes — confirm) in an earlier cell before calling the generator.
from Models.expgen import robot_file_gen_R8
robot_file_gen_R8(data = df_pool_query_vol, filename = '8R_6thKS_robotinput')

In [None]:
# Display the frame passed to the robot-file generator.
# NOTE(review): `df_pool_query_vol` has no visible definition in this notebook (hidden kernel state).
df_pool_query_vol

### Prediction accuracy of the ML model trained on 5 AL iterations, evaluated on the 6th (KS) experiment

In [None]:
# Refit the loaded GPC on every labeled experiment, then score it on the 6th-round queries.
# NOTE(review): scoring pairs df_query and df_score_6 row by row — confirm both are in the same order.
GPC.fit(df_std, df_score)
pred_accuracy = GPC.score(df_query, df_score_6)

separator = '*' * 20
print(separator)
print('Prediction accuracy is ', pred_accuracy)
print(separator)

In [None]:
# GPC predictions for the 6th-round query points; the bare name on the last line shows them as cell output.
df_query_6_predict = GPC.predict(df_query)
df_query_6_predict

In [None]:
# Measured 6th-round scores flattened to 1-D, for eyeballing against the predictions above.
df_score_6.values.ravel()

In [None]:
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

%matplotlib notebook
confusion_matrix = pd.crosstab(df_score_6.values.ravel(), df_query_6_predict, rownames=['Exp outcomes'], colnames=['Predicted'])

sn.heatmap(confusion_matrix, cmap="YlGnBu", annot=True)
plt.show()
plt.savefig('Graphs/Confusion matrix of KS sampling exp.svg', format = "svg", transparent=True)

In [None]:
# Load the saved random-forest model for comparison with the GPC.
# NOTE(review): `load_obj` is a project helper; if it unpickles, only load trusted files.
RF = load_obj('RandomForestClassifier_best')

In [None]:
# Calculate the prediction accuracy of the random-forest model on the 6th (KS) round

RF.fit(df_std, df_score)
pred_accuracy_RF = RF.score(df_query, df_score_6)

print('*'*20)
# Report the RF score computed above; the original printed `pred_accuracy`, i.e. the GPC
# result from the earlier cell, so the RF accuracy was never actually shown.
print('Prediction accuracy is ', pred_accuracy_RF)
print('*'*20)

In [None]:
# Random-forest predictions for the 6th-round query points, shown as the cell output.
df_query_6_predict_RF = RF.predict(df_query)
df_query_6_predict_RF

In [None]:
# Measured 6th-round scores again, for side-by-side comparison with the RF predictions.
df_score_6.values.ravel()

In [None]:
# GPC predictions (computed in an earlier cell) re-displayed next to the RF output.
df_query_6_predict

In [None]:
# Element-wise agreement between the GPC and RF predictions (True where both models agree).
df_query_6_predict == df_query_6_predict_RF