In [None]:
import numpy as np
from numpy.random import random
from numpy import vstack, hstack
import pandas as pd
from sklearn.datasets import make_blobs, make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.model_selection import cross_val_score
from modAL.models import ActiveLearner
from modAL.uncertainty import entropy_sampling, entropy_sampling, margin_sampling
from modAL.batch import uncertainty_batch_sampling
from modAL.uncertainty import classifier_uncertainty, classifier_margin, classifier_entropy
from modAL.utils.selection import multi_argmax
from Models import models, plot, sampling, expgen
from Data.datasets import save_obj, load_obj, data_preprocess
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

### Load saved classifiers and import data from CSV

In [None]:
# Shared splitter: 5 stratified shuffles, 20 % held out, fixed seed.
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# Saved estimators, keyed by the short label used in the output file names.
classifier = {
    'RF': load_obj('RandomForestClassifier_best'),
    'kNN': load_obj('kNN_best'),
    'GPC': load_obj('GPC_best'),
    'xgboost': load_obj('xgboost_best'),
    'SVM_rbf': load_obj('SVM_rbf_best'),
    'SVM_PUFK': load_obj('SVM_best'),
}

# Individual handles to the same estimator objects, used directly by some cells.
cf = classifier['RF']
kNN = classifier['kNN']
GPC = classifier['GPC']
xgboost = classifier['xgboost']
SVM_rbf = classifier['SVM_rbf']
SVM_PUFK = classifier['SVM_PUFK']


def _read_indexed_csv(path):
    """Read a CSV and move its 'index' column into the (unnamed) row index."""
    frame = pd.read_csv(path)
    frame.index = frame['index'].tolist()
    return frame.drop(['index'], axis=1)


# Import datasets
df_3vs4vs1 = _read_indexed_csv('Data/006.morph phase mapping.csv')
df_34vs1 = _read_indexed_csv('Data/006.morph phase mapping_3and4vs1.csv')
df_3vs4 = _read_indexed_csv('Data/006.morph phase mapping_3vs4.csv')

# Row counts used to slice df_3vs4vs1 into progressively larger subsets.
prog = [48, 72, 96, 120, 144, 168, 192]
# Per-iteration index labels for the 3-vs-4 dataset
# (presumably one entry per active-learning iteration, judging by the object name).
prog_3vs4 = load_obj('index of class 3&4 for each AL iter_6KS')

### Cross-validation score over the whole dataset, with 6 classifiers.

In [None]:
# For every classifier, cross-validate on the first prog[i] rows of df_3vs4vs1
# and write one CSV per classifier holding the mean/std of the fold scores.
for j, estimator in classifier.items():

    # One record per subset size. Building the frame once avoids chained
    # assignment (`df['mean'].loc[i] = ...`), which writes to a temporary and
    # leaves the frame empty when pandas Copy-on-Write is active.
    records = []

    for n_rows in prog:
        df_prog = df_3vs4vs1.iloc[0:n_rows, :]
        score = cross_val_score(estimator, df_prog.drop(['score'], axis=1), df_prog['score'], cv=cv)
        records.append({'mean': np.mean(score), 'std': np.std(score)})

    # Rows are indexed 0..len(prog)-1, in the same order as `prog`.
    df_acc = pd.DataFrame(records, columns=['mean', 'std'])
    df_acc.to_csv(j + '_accuracy for 6AL.csv')

In [None]:
# Cross-validate the GPC model on the subset of df_3vs4 selected by each entry
# of prog_3vs4 and report the mean/std of the fold scores.
for i in range(len(prog_3vs4)):
    # Keep only the rows whose index labels are listed in prog_3vs4[i].
    df_prog_3vs4 = df_3vs4.filter(prog_3vs4[i], axis = 0)
    # NOTE: a previous `df_prog_3vs4.filter([], axis = 1)` step was removed here.
    # Filtering on an empty item list drops every column (including 'score'),
    # so the lines below always raised KeyError. All feature columns are used.
    features = df_prog_3vs4.drop(['score'], axis = 1)
    target = df_prog_3vs4['score']
    score = cross_val_score(GPC, features, target, cv=cv)
    print('Prediction score after', i, 'active learning run: ', np.mean(score))
    print('Prediction score std is', np.std(score))
    print('*'*30)

### Feature importance (permutation)

In [None]:
import eli5
from eli5.sklearn import PermutationImportance
# measure feature importance for df_34vs1

# One row of per-feature importances for every (CV split, repeat) pair.
perm_imp_34vs1 = []

# Split features and target once instead of on every fold. The target is kept
# 1-D (a Series) because scikit-learn estimators such as RandomForestClassifier
# warn when they are given a column-vector y.
X_34vs1 = df_34vs1.drop(['score'], axis = 1)
y_34vs1 = df_34vs1['score']

for train_index, test_index in tqdm(cv.split(X_34vs1, y_34vs1), total=cv.get_n_splits()):

    x_train, y_train = X_34vs1.iloc[train_index], y_34vs1.iloc[train_index]
    x_test, y_test = X_34vs1.iloc[test_index], y_34vs1.iloc[test_index]

    # Refit the shared estimator on this split's training rows.
    cf.fit(x_train, y_train)

    # Repeat the permutation test 5 times on the held-out rows of this split.
    for _ in range(5):
        importance = PermutationImportance(cf).fit(x_test, y_test)
        perm_imp_34vs1.append(list(importance.feature_importances_))

In [None]:
# Collapse the collected importance rows into a per-feature mean and std,
# then save them as a two-row table (rows: 'mean', 'std'; columns: features).
perm_imp_34vs1 = np.array(perm_imp_34vs1)
perm_imp_34vs1_mean = np.mean(perm_imp_34vs1, axis = 0)
perm_imp_34vs1_std = np.std(perm_imp_34vs1, axis = 0)

# Derive the labels the same way the feature matrix was built (drop 'score')
# rather than assuming 'score' is the last column of df_34vs1.
feature_names_34vs1 = df_34vs1.drop(['score'], axis = 1).columns

df_feat_perm_34vs1 = pd.DataFrame(index = ['mean','std'],
                                  columns = feature_names_34vs1,
                                  data = [perm_imp_34vs1_mean, perm_imp_34vs1_std])
df_feat_perm_34vs1.to_csv('feature_importance_permutation_34vs1.csv')

In [None]:
# Display the mean/std permutation-importance table computed from df_34vs1.
df_feat_perm_34vs1

In [None]:
import eli5
from eli5.sklearn import PermutationImportance
# measure feature importance for df_3vs4

# One row of per-feature importances for every (CV split, repeat) pair.
perm_imp_3vs4 = []

# Split features and target once instead of on every fold. The target is kept
# 1-D (a Series) because scikit-learn estimators such as RandomForestClassifier
# warn when they are given a column-vector y.
X_3vs4 = df_3vs4.drop(['score'], axis = 1)
y_3vs4 = df_3vs4['score']

for train_index, test_index in tqdm(cv.split(X_3vs4, y_3vs4), total=cv.get_n_splits()):

    x_train, y_train = X_3vs4.iloc[train_index], y_3vs4.iloc[train_index]
    x_test, y_test = X_3vs4.iloc[test_index], y_3vs4.iloc[test_index]

    # Refit the shared estimator on this split's training rows.
    cf.fit(x_train, y_train)

    # Repeat the permutation test 5 times on the held-out rows of this split.
    for _ in range(5):
        importance = PermutationImportance(cf).fit(x_test, y_test)
        perm_imp_3vs4.append(list(importance.feature_importances_))

In [None]:
# Collapse the collected importance rows into a per-feature mean and std,
# then save them as a two-row table (rows: 'mean', 'std'; columns: features).
perm_imp_3vs4 = np.array(perm_imp_3vs4)
perm_imp_3vs4_mean = np.mean(perm_imp_3vs4, axis = 0)
perm_imp_3vs4_std = np.std(perm_imp_3vs4, axis = 0)

# Derive the labels the same way the feature matrix was built (drop 'score')
# rather than assuming 'score' is the last column of df_3vs4.
feature_names_3vs4 = df_3vs4.drop(['score'], axis = 1).columns

df_feat_perm_3vs4 = pd.DataFrame(index = ['mean','std'],
                                 columns = feature_names_3vs4,
                                 data = [perm_imp_3vs4_mean, perm_imp_3vs4_std])
df_feat_perm_3vs4.to_csv('feature_importance_permutation_3vs4.csv')

In [None]:
# Display the mean/std permutation-importance table computed from df_3vs4.
df_feat_perm_3vs4