In [None]:
import numpy as np
from numpy.random import random
from numpy import vstack, hstack
import pandas as pd
from sklearn.datasets import make_blobs, make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from modAL.models import ActiveLearner
from modAL.uncertainty import entropy_sampling, entropy_sampling, margin_sampling
from modAL.batch import uncertainty_batch_sampling
from modAL.uncertainty import classifier_uncertainty, classifier_margin, classifier_entropy
from modAL.utils.selection import multi_argmax
from Models import models, plot, sampling, expgen
from Data.datasets import save_obj, load_obj, data_preprocess
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from mpl_toolkits.mplot3d import Axes3D

In [None]:
# Labelled experiments read from CSV; the file provides an 'index' column and a 'score' column
# next to the remaining columns (presumably the standardized features - confirm).
df_std = pd.read_csv('Data/005.morph phase mapping.csv')
# Use the values of the 'index' column as row labels so rows can be looked up in the pool frames below.
df_std.index = list(df_std['index'])
# Labels kept as a one-column DataFrame that shares df_std's row labels.
df_score = df_std.filter(['score'], axis = 1)
# Whatever is left after dropping 'index' and 'score' is what later cells pass to cf.fit as X.
df_std = df_std.drop(['index', 'score'], axis = 1)

df_SVM_rbf_6thAL = load_obj('8 reagent concentration_6thAL_standardized_SVM_rbf') # proposed by SVM, not tested yet.
df_RF_6thAL = load_obj('8 reagent concentration_6thAL_standardized_RF') # proposed by RF, not tested yet.

# Candidate pool, stored twice under matching names: a "standardized" object and a plain one.
df_pool_std = load_obj('8R homogeneous concentration statespace_standardized (Pb2, morph, H2O and FAH constrained)')
df_pool = load_obj('8R homogeneous concentration statespace (Pb2, morph, H2O and FAH constrained)')

# Pool rows whose labels appear in df_std.index.
# NOTE(review): filter(items, axis=0) keeps only labels that exist in the pool - confirm none are missing.
df = df_pool.filter(df_std.index, axis = 0)

# 2-D t-SNE coordinates labelled with the pool's index.
df_pool_tSNE = pd.DataFrame(columns = ['dim 1', 'dim 2'], index = df_pool.index, data = (load_obj('tSNE_5th_50'))[:-168]) # the last 168 rows of 'tSNE_5th_50' are not pool points; they are the 5th AL experiments appended to the embedding.
# t-SNE coordinates of the labelled experiments.
df_tSNE = df_pool_tSNE.filter(df_std.index, axis=0)

# t-SNE coordinates of the not-yet-tested proposals from each model.
df_SVM_rbf_tSNE = df_pool_tSNE.filter(df_SVM_rbf_6thAL.index, axis=0) 
df_RF_tSNE = df_pool_tSNE.filter(df_RF_6thAL.index, axis=0) 

# Row-count boundaries used by later cells: df_std[:iteration[k]] is the data available at
# stage k, and rows iteration[k]:iteration[k+1] are the batch that follows it.
iteration = [48, 72, 96, 120, 144, 168]
# load the stored models (of these, only cf is used by live code in the cells shown here)
cf = load_obj('RandomForestClassifier_best') 
kNN = load_obj('kNN_best')
GPC = load_obj('GPC_best')
xgboost = load_obj('xgboost_best')
SVM_rbf = load_obj('SVM_rbf_best')
SVM_PUFK = load_obj('SVM_best')

# load reproducibility experiment
df_rep = load_obj("8 reagent concentration_repeat24_after4AL")
df_rep_score = pd.read_csv('Data/rep_score.csv')
df_rep_tSNE = df_pool_tSNE.filter(df_rep.index, axis=0)
# Pool index labels that the reproducibility plot marks with an 'x' (runs that failed to reproduce).
index_irreproducible = [769693, 763381, 794228, 721604]

In [None]:
# Display the repr of the classifier loaded from 'RandomForestClassifier_best' for inspection.
cf

In [None]:
# Calculate the uncertainty over the whole pool for every stage of active learning.
# For stage k the classifier is refit on the first iteration[k] labelled rows and then
# evaluated on every pool point; each entry of uncernlst is a column vector of shape
# (len(df_pool_std), 1).
# NOTE: cf is refit in place, so after this cell it holds the model of the last stage.
uncernlst = []
for k in range(len(iteration)):
    # The labels are passed as the 1-D 'score' Series: handing scikit-learn the one-column
    # DataFrame df_score makes fit() emit a DataConversionWarning about a column-vector y.
    cf.fit(df_std[:iteration[k]], df_score['score'][:iteration[k]])
    uncernlst.append(classifier_uncertainty(cf, df_pool_std).reshape((len(df_pool_std), 1)))

# Same computation for the RBF SVM, kept disabled:
# uncernlst_SVM_rbf = []
# for k in range(6):
#     SVM_rbf.fit(df_std[:iteration[k]], df_score[:iteration[k]])
#     uncernlst_SVM_rbf.append(classifier_uncertainty(SVM_rbf,df_pool_std).reshape((len(df_pool_std),1)))

In [None]:
# plot the t-SNE for every iterations of active learning
%matplotlib notebook
for k in range(6):
    fig = plt.figure(figsize = (10,10))
    ax = fig.add_subplot()

    clist = ['blue','green','red','orange']
    
    for i in range(4):
        ax.scatter(df_tSNE[:iteration[k]]['dim 1'][df_score['score'][:iteration[k]] == i+1],\
                   df_tSNE[:iteration[k]]['dim 2'][df_score['score'][:iteration[k]] == i+1],\
                   c = clist[i], s = 60, alpha = 1)
        
    plt.xlim(-35,35)
    plt.ylim(-35,35)
    plt.xlabel('t-SNE dim 1')
    plt.ylabel('t-SNE dim 2')
    plt.savefig('Graphs/'+str(k)+'AL_tSNE.svg', format = 'svg', transparent = True)

In [None]:
# plot the t-SNE for reproducibility experiment
%matplotlib notebook
k = 5
fig = plt.figure(figsize = (10,10))
ax = fig.add_subplot()

clist = ['blue','green','red','orange']

# all 5th AL runs
for i in range(4):
    ax.scatter(df_tSNE[:iteration[k]]['dim 1'][df_score['score'][:iteration[k]] == i+1],\
               df_tSNE[:iteration[k]]['dim 2'][df_score['score'][:iteration[k]] == i+1],\
               c = clist[i], s = 60, alpha = 1)
    
# reproducibility run
ax.scatter(df_rep_tSNE['dim 1'],\
           df_rep_tSNE['dim 2'],\
           facecolors='none', edgecolors='black', linewidths = 1, s = 150, alpha = 1, marker = 'o')

# the ones that we fail to reproduce
ax.scatter(df_pool_tSNE.filter(index_irreproducible, axis=0)['dim 1'],\
           df_pool_tSNE.filter(index_irreproducible, axis=0)['dim 2'],\
           c='black', s = 300, alpha = 0.5, marker = 'x')

plt.xlim(-35,35)
plt.ylim(-35,35)
plt.xlabel('t-SNE dim 1')
plt.ylabel('t-SNE dim 2')
plt.savefig('Graphs/5AL_mark reproducibility exp_tSNE.svg', format = 'svg', transparent = True)

In [None]:
# plot the t-SNE for every iteration of active learning, plus the next proposed sampling.
# Stage k shows the first iteration[k] experiments coloured by score (1..4) and, in gray,
# the batch that follows them (rows iteration[k]:iteration[k+1]).

# The score palette is defined in this cell so the colours do not depend on which other
# cell last assigned clist (another cell stores a six-colour per-iteration palette under
# the same name).
clist = ['blue','green','red','orange']

for k in range(5):

    fig = plt.figure(figsize = (10,10))
    ax = fig.add_subplot()

    for i in range(4):
        ax.scatter(df_tSNE[:iteration[k]]['dim 1'][df_score['score'][:iteration[k]] == i+1],\
                   df_tSNE[:iteration[k]]['dim 2'][df_score['score'][:iteration[k]] == i+1],\
                   c = clist[i], s = 60, alpha = 1)

    # next batch, not yet coloured by outcome
    ax.scatter(df_tSNE[iteration[k]:iteration[k+1]]['dim 1'],\
               df_tSNE[iteration[k]:iteration[k+1]]['dim 2'],\
               c = 'gray', s = 60, alpha = 1)

    plt.xlim(-35,35)
    plt.ylim(-35,35)
    plt.xlabel('t-SNE dim 1')
    plt.ylabel('t-SNE dim 2')

    plt.savefig('Graphs/'+str(k)+'AL+next_tSNE.svg', format = 'svg', dpi = 1000, transparent = True)

In [None]:
# One t-SNE scatter per active-learning batch: only rows iteration[k]:iteration[k+1]
# are drawn, coloured by their score (1..4).
clist = ['blue', 'green', 'red', 'orange']
for k in range(5):

    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot()

    # Experiments and scores belonging to this batch only.
    tsne_batch = df_tSNE[iteration[k]:iteration[k+1]]
    score_batch = df_score['score'][iteration[k]:iteration[k+1]]

    for i in range(4):
        has_score = score_batch == i + 1
        ax.scatter(tsne_batch['dim 1'][has_score],
                   tsne_batch['dim 2'][has_score],
                   c=clist[i], s=60, alpha=1)

    ax.set(xlim=(-35, 35), ylim=(-35, 35),
           xlabel='t-SNE dim 1', ylabel='t-SNE dim 2')

    plt.savefig('Graphs/' + str(k+1) + 'AL_outcomes_tSNE.svg', format='svg', dpi=1000, transparent=True)

In [None]:
# Row-count boundaries of the labelled data (same values the data-loading cell assigns);
# re-assigned here so the plotting cells below can be run after only this cell.
iteration = [48, 72, 96, 120, 144, 168]

In [None]:
# plot the t-SNE for every iterations of active learning with different color and shapes for different iterations.
%matplotlib notebook
clist = ['black','purple','blue','green','orange','red']
mlist = ['o', 'p', '^', 'D']
fig = plt.figure(figsize = (10,10))
ax = fig.add_subplot()

for i in range(4):
    ax.scatter(df_tSNE[:iteration[0]]['dim 1'][df_score['score'][:iteration[0]] == i+1],\
               df_tSNE[:iteration[0]]['dim 2'][df_score['score'][:iteration[0]] == i+1],\
               c = clist[0], marker = mlist[0], s = 60, alpha = 0.5)
    
for k in range(5):
    for i in range(4):
        ax.scatter(df_tSNE[iteration[k]:iteration[k+1]]['dim 1'],\
                   df_tSNE[iteration[k]:iteration[k+1]]['dim 2'],\
                   c = clist[k+1], marker = mlist[0], s = 60, alpha = 0.5)
        
plt.xlabel('t-SNE dim 1')
plt.ylabel('t-SNE dim 2')
plt.xlim(-35,35)
plt.ylim(-35,35)
plt.savefig('Graphs/All AL_tSNE_v1.svg', format = 'svg', dpi = 1000, transparent = True)
plt.show()

In [None]:
# Sanity check: shape of the first stage's pool-uncertainty array.
uncernlst[0].shape

In [None]:
%matplotlib inline
# Visualize the pool in tSNE


fig = plt.figure(figsize = (10,8))
ax = fig.add_subplot()
plt.style.context('seaborn-white')

im = ax.scatter(df_pool_tSNE['dim 1'], \
                df_pool_tSNE['dim 2'], \
                alpha = 0.1, c = 'gray', cmap='rainbow', s = 5)
fig.colorbar(im)

plt.xlim(-35,35)
plt.ylim(-35,35)
plt.xlabel('t-SNE dim 1')
plt.ylabel('t-SNE dim 2')

plt.savefig('Graphs/pool_tSNE.png', format = 'png', dpi = 1000, transparent = True)
plt.show()

In [None]:
%matplotlib inline
# Visualize the classification uncertainty for AL iterations

for k in range(6):
    
    fig = plt.figure(figsize = (10,8))
    ax = fig.add_subplot()
    plt.style.context('seaborn-white')
    
    im = ax.scatter(df_pool_tSNE['dim 1'], \
                    df_pool_tSNE['dim 2'], \
                    alpha = 0.2, c = list(uncernlst[k].ravel()), cmap='rainbow', s = 5)
    fig.colorbar(im)
    
    plt.xlim(-35,35)
    plt.ylim(-35,35)
    plt.xlabel('t-SNE dim 1')
    plt.ylabel('t-SNE dim 2')

    plt.savefig('Graphs/'+str(k)+'AL_uncertainty_tSNE.png', format = 'png', dpi = 1000, transparent = True)
    plt.show()

In [None]:
%matplotlib inline
# Visualize the classification uncertainty for AL iterations + AL sampling generated using the uncertainty map.

for k in range(5):
    
    fig = plt.figure(figsize = (10,8))
    ax = fig.add_subplot()
    plt.style.context('seaborn-white')
    
    im = ax.scatter(df_pool_tSNE['dim 1'], \
                    df_pool_tSNE['dim 2'], \
                    alpha = 0.2, c = list(uncernlst[k].ravel()), cmap='rainbow', s = 5)
    
    ax.scatter(df_tSNE[iteration[k]:iteration[k+1]]['dim 1'],\
               df_tSNE[iteration[k]:iteration[k+1]]['dim 2'],\
               facecolors='none', edgecolors='black', linewidths = 2, s = 50, alpha = 1)
        
    fig.colorbar(im)
    
    plt.xlim(-35,35)
    plt.ylim(-35,35)
    plt.xlabel('t-SNE dim 1')
    plt.ylabel('t-SNE dim 2')

    plt.savefig('Graphs/'+str(k)+'AL_uncertainty_next sampling_tSNE.png', format = 'png', dpi = 1000, transparent = True)
    plt.show()

In [None]:
# plot the t-SNE for 1st proposed active learning experiments for RF and SVM_RBF after initial sampling
%matplotlib notebook

fig = plt.figure(figsize = (6,6))
ax = fig.add_subplot()

clist = ['blue','green','red','orange']

k = 5


ax.scatter(df_RF_tSNE['dim 1'],\
           df_RF_tSNE['dim 2'],\
           c = 'black', s = 40, alpha = 1, label = 'Proposed by Random Forest') 

ax.scatter(df_SVM_rbf_tSNE['dim 1'],\
           df_SVM_rbf_tSNE['dim 2'],\
           s = 40, alpha = 1, facecolors='none', edgecolors='black', linewidths = 1.5,\
           label = 'Proposed by SVM_RBF')
ax.legend()
plt.xlim(-35,35)
plt.ylim(-35,35)
plt.xlabel('t-SNE dim 1')
plt.ylabel('t-SNE dim 2')
plt.savefig('Graphs/6th AL_tSNE_RFvsSVM_RBF.png', format = 'png', transparent = True)

In [None]:
# Store the per-stage pool uncertainties (the uncernlst list) under this object name via the project's save_obj helper.
# NOTE(review): presumably overwrites any object already saved under the same name - confirm.
save_obj(uncernlst, '8R homogeneous concentration statespace_uncertainty_RF (Pb2, morph, H2O and FAH constrained)')

In [None]:
# Initial-sampling frame, loaded in full.
df0 = load_obj('8 reagent concentration_initial sampling_standardized')

# For AL rounds 1-5 (RF proposals) only the 'uncertainty' column is kept.
df1, df2, df3, df4, df5 = [
    load_obj(name).filter(['uncertainty'], axis = 1)
    for name in (
        '8 reagent concentration_1stAL_standardized_RF',
        '8 reagent concentration_2ndAL_standardized_RF',
        '8 reagent concentration_3rdAL_standardized_RF',
        '8 reagent concentration_4thAL_standardized_RF',
        '8 reagent concentration_5thAL_standardized_RF',
    )
]

In [None]:
# Print the (min, max) of the 'uncertainty' column for each of AL rounds 1-5, in order.
uncertainty_ranges = [
    (min(frame['uncertainty']), max(frame['uncertainty']))
    for frame in (df1, df2, df3, df4, df5)
]
print(*uncertainty_ranges)