## Calculate CV accuracies for different RF and SVM_PUFK models, and choose the best model for AL

In [None]:
import numpy as np
from numpy.random import random
from numpy import vstack, hstack
import pandas as pd
from sklearn.datasets import make_blobs, make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process.kernels import WhiteKernel, RBF, Matern
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.gaussian_process import GaussianProcessClassifier, GaussianProcessRegressor
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split, GridSearchCV
from modAL.models import ActiveLearner
from modAL.uncertainty import entropy_sampling, entropy_sampling, margin_sampling
from modAL.batch import uncertainty_batch_sampling
from modAL.uncertainty import classifier_uncertainty, classifier_margin, classifier_entropy
from modAL.utils.selection import multi_argmax
from Models import models, plot, sampling
from Data.datasets import save_obj, load_obj, data_preprocess
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from mpl_toolkits.mplot3d import Axes3D

In [None]:
# Load the feature table of the initial 48 experiments and give every
# reagent column an explicit "_std" suffix.
std_column_names = {
    'Pb': 'Pb_std',
    "morph": "morph_std",
    'DMSO': 'DMSO_std',
    'GBL': 'GBL_std',
    'FAH': 'FAH_std',
    'H2O': 'H2O_std',
}
df = load_obj('8 reagent concentration_initial sampling_standardized')
df = df.rename(columns=std_column_names)

# Load the score table, move its 'Index' column into the row index and drop
# the now-redundant column so only the score data remains.
df_score = pd.read_csv('Data/initial sampling_score.csv')
df_score.index = df_score['Index'].tolist()
df_score = df_score.drop(columns=['Index'])

# Sanity check: features and scores must be aligned row by row.
indices_match = (df.index == df_score.index).all()
print('The index of score table matches the index of feature table?', indices_match)

In [None]:
# Cross-validation set-up.
# Work on copies so the loaded tables stay untouched.
X, y = df.copy(), df_score.copy()

# Both splitters hold out 20% of the rows and share the same seed.
holdout_options = {'test_size': 0.2, 'random_state': 42}

# 5 stratified shuffle splits, used by every grid search / CV score below.
cv = StratifiedShuffleSplit(n_splits=5, **holdout_options)

# One fixed (non-stratified) train/test split for the hold-out reports.
X_train, X_test, y_train, y_test = train_test_split(X, y, **holdout_options)

### Fit data to Random Forest model and use grid search to find best hyperparameters.

In [None]:
# Hyperparameter search space for the Random Forest grid search.
bootstrap = [True, False]
# 'auto' was removed from this list: for RandomForestClassifier it was only an
# alias of 'sqrt', so keeping both fitted every combination twice, and recent
# scikit-learn releases reject 'auto' altogether.
max_features = ['sqrt']
n_estimators = [100, 300, 500, 800, 1200]
max_depth = [2, 5, 8, 15, 25, 30]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 4, 8]

# Parameter name -> candidate values, in the form GridSearchCV expects.
param_grid_RFC = dict(
    bootstrap=bootstrap,
    max_features=max_features,
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

In [None]:
# Exhaustive grid search over param_grid_RFC, scored on the shared `cv`
# splitter and parallelised over all available cores (n_jobs=-1).
rfc_base = RandomForestClassifier(criterion='entropy', class_weight='balanced', random_state=42)
grid_RFC = GridSearchCV(rfc_base, param_grid=param_grid_RFC, cv=cv, n_jobs=-1)

# Pass the labels as a flat 1-D array, exactly as the SVM grid searches do.
# Handing over the one-column score table makes scikit-learn emit a
# column-vector DataConversionWarning on every single fit.
grid_RFC.fit(X, np.array(y).ravel())
print("Best hyperparameters: ", grid_RFC.best_params_)
# best_score_ is the mean cross-validated score of the best candidate.
print("Test accuracy from grid search is", grid_RFC.best_score_)
save_obj(grid_RFC, 'RandomForestClassifier_gridcv')

# Keep the winning estimator under its own name and store it separately.
RFC = grid_RFC.best_estimator_
save_obj(RFC, 'RandomForestClassifier_best')

In [None]:
# Hold-out evaluation of the selected forest: refit on the training split
# only, then report on the untouched test split.
# NOTE(review): RFC is the same object as grid_RFC.best_estimator_, so this
# refit also changes what that attribute holds in memory.
RFC.fit(X_train, y_train)

print("Metric Report")
rfc_test_pred = RFC.predict(X_test)
print(classification_report(y_test, rfc_test_pred))

rfc_test_acc = RFC.score(X_test, y_test)
print("Test accuracy is", rfc_test_acc)
# Shown again for a side-by-side comparison with the hold-out number.
print("Test accuracy from grid search is", grid_RFC.best_score_)

In [None]:
from sklearn.model_selection import cross_val_score

# Score the selected forest once per split of the shared `cv` splitter.
scores = cross_val_score(estimator=RFC, X=X, y=y, cv=cv)

In [None]:
# Display the individual cross-validation scores as the cell output.
scores

In [None]:
# Mean of the cross-validation scores (shown as the cell output).
scores.mean()

In [None]:
# Spread (population standard deviation) of the cross-validation scores.
scores.std()

### Fit data to SVM PUFK model and use grid search to find best hyperparameters.

In [None]:
from scipy.spatial.distance import pdist, cdist, squareform
from Models.models import PearsonVII_kernel

# Only the regularisation strength C is searched: 7 values, one per decade
# from 1e-3 to 1e3.
C_range = np.logspace(-3, 3, 7)
param_grid_SVM = {'C': C_range}

# SVC driven by the project's PearsonVII_kernel callable; probability=True so
# the fitted model exposes class probabilities.
pufk_svc = SVC(
    kernel=PearsonVII_kernel,
    cache_size=6000,
    decision_function_shape='ovr',
    probability=True,
    class_weight='balanced',
)
grid_SVM = GridSearchCV(pufk_svc, param_grid=param_grid_SVM, cv=cv)

# The labels are flattened to a 1-D array before fitting.
labels_1d = np.array(y).ravel()
grid_SVM.fit(X, labels_1d)
print("Best hyperparameters: ", grid_SVM.best_params_)
print("Test accuracy from grid search is", grid_SVM.best_score_)

# Store both the full search object and the winning estimator.
save_obj(grid_SVM, 'SVM_gridcv')
SVM = grid_SVM.best_estimator_
save_obj(SVM, 'SVM_best')

In [None]:
# Hold-out evaluation of the selected PUFK SVM: refit on the training split,
# then report on the test split.
SVM.fit(X_train, y_train)

print("Metric Report")
svm_test_pred = SVM.predict(X_test)
print(classification_report(y_test, svm_test_pred))

svm_test_acc = SVM.score(X_test, y_test)
print("Test accuracy is", svm_test_acc)
# Shown again for a side-by-side comparison with the hold-out number.
print("Test accuracy from grid search is", grid_SVM.best_score_)

In [None]:
# Read back the object stored under 'SVM_best'; the returned value is shown
# as the cell output.
load_obj('SVM_best')

#### Conclusion: Random Forest performs better, so I will use Random Forest for active learning.
#### Other reasons: 1. RF is better suited to multi-class classification. 2. It is less prone to overfitting.
#### 3. It gives prediction uncertainty intrinsically.

## After the 5th AL, the question arose whether Random Forest is really the best model for active learning or for this dataset, so I am coming back here to generate some other models.

### Fit data to SVM RBF model and use grid search to find best hyperparameters.

In [None]:
# Search C and gamma jointly: 7 values each, one per decade from 1e-3 to 1e3.
C_range = np.logspace(-3, 3, 7)
gamma_range = np.logspace(-3, 3, 7)
param_grid_SVM_rbf = {'C': C_range, 'gamma': gamma_range}

# Same SVC configuration as the PUFK search, but with the built-in RBF kernel.
rbf_svc = SVC(
    kernel='rbf',
    cache_size=6000,
    decision_function_shape='ovr',
    probability=True,
    class_weight='balanced',
)
grid_SVM_rbf = GridSearchCV(rbf_svc, param_grid=param_grid_SVM_rbf, cv=cv)

# The labels are flattened to a 1-D array before fitting.
labels_1d = np.array(y).ravel()
grid_SVM_rbf.fit(X, labels_1d)
print("Best hyperparameters: ", grid_SVM_rbf.best_params_)
print("Test accuracy from grid search is", grid_SVM_rbf.best_score_)

# Store both the full search object and the winning estimator.
save_obj(grid_SVM_rbf, 'SVM_rbf_gridcv')
SVM_rbf = grid_SVM_rbf.best_estimator_
save_obj(SVM_rbf, 'SVM_rbf_best')

In [None]:
# Hold-out evaluation of the selected RBF SVM: refit on the training split,
# then report on the test split.
SVM_rbf.fit(X_train, y_train)

print("Metric Report")
svm_rbf_test_pred = SVM_rbf.predict(X_test)
print(classification_report(y_test, svm_rbf_test_pred))

svm_rbf_test_acc = SVM_rbf.score(X_test, y_test)
print("Test accuracy is", svm_rbf_test_acc)
# Shown again for a side-by-side comparison with the hold-out number.
print("Test accuracy from grid search is", grid_SVM_rbf.best_score_)

### Fit data to xgboost model.

In [None]:
import xgboost as xgb

# Gradient-boosted trees with fixed settings (no hyperparameter search here).
xboost = xgb.XGBClassifier(
    booster='gbtree',
    n_estimators=100,
    base_score=0.5,
)
# NOTE: the model is stored before it is fitted, so the saved object is the
# unfitted configuration.
save_obj(xboost, 'xgboost_best')

# Hold-out evaluation: fit on the training split, report on the test split.
xboost.fit(X_train, y_train)
print("Metric Report")
xboost_test_pred = xboost.predict(X_test)
print(classification_report(y_test, xboost_test_pred))
xboost_test_acc = xboost.score(X_test, y_test)
print("Test accuracy is", xboost_test_acc)

In [None]:
# Read back the object stored under 'xgboost_best'; the returned value is
# shown as the cell output.
load_obj('xgboost_best')

### Fit data to GaussianProcessClassifier.

In [None]:
from sklearn.gaussian_process import GaussianProcessClassifier

# Gaussian process classifier with a constant-scaled RBF kernel; both the
# scale and the length scale start at 1.0.
rbf_kernel = 1.0 * RBF(length_scale=1.0)
GPC = GaussianProcessClassifier(kernel=rbf_kernel)
# NOTE: the model is stored before it is fitted, so the saved object is the
# unfitted configuration.
save_obj(GPC, 'GPC_best')

# Hold-out evaluation: fit on the training split, report on the test split.
GPC.fit(X_train, y_train)
print("Metric Report")
gpc_test_pred = GPC.predict(X_test)
print(classification_report(y_test, gpc_test_pred))
gpc_test_acc = GPC.score(X_test, y_test)
print("Test accuracy is", gpc_test_acc)

In [None]:
# Read back the object stored under 'GPC_best'; the returned value is shown
# as the cell output.
load_obj('GPC_best')

### Fit data to KNeighborsClassifier.

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Nearest-neighbour classifier: a single neighbour, distance weighting and
# the Minkowski metric with p=2 (Euclidean distance).
kNN = KNeighborsClassifier(
    n_neighbors=1,
    weights="distance",
    p=2,
)
# NOTE: the model is stored before it is fitted, so the saved object is the
# unfitted configuration.
save_obj(kNN, 'kNN_best')

# Hold-out evaluation: fit on the training split, report on the test split.
kNN.fit(X_train, y_train)
print("Metric Report")
knn_test_pred = kNN.predict(X_test)
print(classification_report(y_test, knn_test_pred))
knn_test_acc = kNN.score(X_test, y_test)
print("Test accuracy is", knn_test_acc)

In [None]:
# Read back the object stored under 'kNN_best'; the returned value is shown
# as the cell output.
load_obj('kNN_best')

### Another GaussianProcessClassifier (popular, for testing)

In [None]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import WhiteKernel, RBF, Matern

# Constant-scaled Matern kernel (nu=2.5) with unit starting length scale.
matern_part = Matern(length_scale=1.0, nu=2.5)
kernel = 1.0 * matern_part

# The kernel hyperparameter optimiser is restarted 10 extra times.
# NOTE: this rebinds the name GPC, and the model is stored unfitted.
GPC = GaussianProcessClassifier(
    kernel=kernel,
    n_restarts_optimizer=10,
)
save_obj(GPC, 'GPC_matern')