### Use the active learning algorithm (exactly the same one) from the AL morph phase mapping
### on a synthetic dataset.
### The synthetic dataset should be as similar to the real morph phase mapping data as possible.
### The initial idea is to have: 6 dimensions, 3 classes, an unknown number of clusters, and tunable overlap and noise level.

# In [2]:
import numpy as np
import pandas as pd
from Models import sampling
from Models import AL
from Data.datasets import save_obj, load_obj
from tqdm import tqdm
from modAL.uncertainty import classifier_uncertainty
from modAL.uncertainty import classifier_margin
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split, cross_val_score, KFold
import random
import warnings
warnings.filterwarnings('always')  # "error", "ignore", "always", "default", "module" or "once"

# In [None]:
# Build the family of synthetic datasets: the class proportions are copied
# from the experimental morph phase mapping data, while class separation and
# label noise are varied over a small grid.
from sklearn.datasets import make_classification

# Load the experimental data and move its 'index' column into the row index.
df_exp = pd.read_csv('Data/006.morph phase mapping.csv')
df_exp = df_exp.set_index('index').rename_axis(None)

# Fraction of experimental samples with score 1, 3 and 4; these become the
# class weights of the synthetic data.
class_counts = df_exp['score'].value_counts()
n_exp = len(df_exp)
weights = [class_counts[label] / n_exp for label in (1, 3, 4)]

# Every combination of class separation (overlap) and label-flip rate (noise).
# The tuples are ordered with flip_y as the outer and class_sep as the inner
# loop, exactly as produced by the meshgrid.
class_sep_lst = [0.2, 0.5, 0.8]
flip_y_lst = [0.1, 0.2, 0.5]
sep_grid, flip_grid = np.meshgrid(class_sep_lst, flip_y_lst)
noise_lst = list(zip(sep_grid.ravel(), flip_grid.ravel()))

# One 6-feature, 3-class dataset per (class_sep, flip_y) pair, keyed by that pair.
dataset_dict = {}
for class_sep, flip_y in noise_lst:
    X, y = make_classification(
        n_samples=50000,
        n_features=6,
        n_informative=6,
        n_redundant=0,
        n_repeated=0,
        n_classes=3,
        n_clusters_per_class=1,
        weights=weights,
        class_sep=class_sep,
        flip_y=flip_y,
        random_state=1,
    )
    dataset = pd.DataFrame(X, columns=['a', 'b', 'c', 'd', 'e', 'f'])
    dataset['score'] = y
    dataset_dict[(class_sep, flip_y)] = dataset

save_obj(dataset_dict, 'Artificial data_dataset')

### Calculate AL metrics and performance ###

# In [None]:
# Run active learning (AL) on every synthetic dataset in `dataset_dict` and
# store, per dataset (keyed by its (class_sep, flip_y) tuple):
#   * dataset_labeled_dict    - the samples labeled by active learning
#   * metrics_dict            - the AL metrics after every labeling step
#   * intrin_err_dict         - MLP cross-validation error on the full dataset
#   * labeled_intrin_err_dict - MLP cross-validation error on the AL-labeled subset
from sklearn.neural_network import MLPClassifier


def _mlp_intrinsic_error(X, y, hidden_units, cv):
    """Cross-validate one-hidden-layer MLPs of several sizes on (X, y).

    For every size in ``hidden_units`` an MLPClassifier is cross-validated
    with ``cv`` and the per-fold error is taken as ``1 - accuracy``.

    Returns a DataFrame with the columns 'hidden_unit', 'error_mean' and
    'error_std' (one row per hidden-layer size).
    """
    error_mean = []
    error_std = []
    for num_unit in hidden_units:
        MLP = MLPClassifier(hidden_layer_sizes = (num_unit,), solver='lbfgs', activation = 'relu')
        error = 1 - cross_val_score(MLP, X, y, scoring='accuracy', cv=cv)
        error_mean.append(np.mean(error))
        error_std.append(np.std(error))
    return pd.DataFrame({'hidden_unit': list(hidden_units),
                         'error_mean': error_mean,
                         'error_std': error_std})


dataset_labeled_dict = {}
metrics_dict = {}
intrin_err_dict = {}
labeled_intrin_err_dict = {}
classifier = 'GPC_matern' # SVM_best, RandomForestClassifier_best, GPC_best, GPC_matern

cf = load_obj(classifier)
cv = KFold(shuffle = True, n_splits=5, random_state=42)
#cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# For the initial sampling we have a single sample of one class, so cross
# validation can end up with a training fold that lacks this class; that does
# not work for SVM or GPC. The initial step is therefore scored as 0 for them.
no_init_cv = ('SVM_best', 'GPC_best', 'GPC_matern')

# Labeling schedule: k initial samples, then `iteration_num` batches of
# `number_periter` samples. iteration_step[i] is the number of labeled samples
# available at step i (k, k + 5, ..., k + iteration_num * 5).
k = 20
iteration_num = 18
number_periter = 5
iteration_step = list(np.arange(k,k+(1+iteration_num)*number_periter,number_periter))

# Distinct hidden-layer sizes (log-spaced, truncated to int) used for the
# intrinsic error estimate; identical for every dataset, so computed once.
hidden_unit = sorted({int(x) for x in np.logspace(start = 0.5, stop = 2, num = 20)})

for noise in tqdm(dataset_dict):
    df = dataset_dict[noise]
    df_X = df.drop(['score'], axis = 1)
    df_y = df.filter(['score'], axis = 1)
    X_pool = np.array(df_X)
    y_pool = np.array(df_y).ravel()

    ## select initial training data using Kennard-Stone algorithm
    init_idx, _ = sampling.kennardstonealgorithm(df_X, k)
    print('finished initial sampling')

    # use the sample_idx to select initial sampling from df_X
    df_X_init = df_X.iloc[init_idx]
    df_y_init = df_y.iloc[init_idx]

    ############### Active learning ##############################
    df_X_labeled = df_X_init.copy()
    df_y_labeled = df_y_init.copy()

    for iteration in tqdm(range(iteration_num)):
        # everything that has not been labeled yet forms the query pool
        df_X_unlabeled = df_X.drop(df_X_labeled.index, axis = 'index')
        df_X_AL = AL.minibatch_AL(pool = df_X_unlabeled, X_label = df_X_labeled, y_label = df_y_labeled, \
                                  model = cf, numb_periter = number_periter)
        # the returned frame carries an extra 'uncertainty' column that is not a feature
        df_X_AL = df_X_AL.drop(['uncertainty'], axis = 1)
        df_X_labeled = pd.concat([df_X_labeled, df_X_AL])
        # look up the labels in the same order as the labeled samples
        df_y_labeled = df_y.filter(df_X_labeled.index, axis = 'index')

    df_labeled = pd.concat([df_X_labeled, df_y_labeled], axis = 1)
    dataset_labeled_dict[noise] = df_labeled # save the AL labeled dataframe
    print('finished active learning')
    ################################################################

    ############### Active learning metrics measurement ##############################
    # Labeled samples in acquisition order; the first iteration_step[i] rows
    # are the training set of step i.
    X_lab = np.array(df_X_labeled)
    y_lab = np.array(df_y_labeled).ravel()

    avg_uncert = []        # overall (average) uncertainty on the whole dataset
    pred_confid = []       # average prediction confidence (margin) on the whole dataset
    acc_pool = []          # prediction accuracy on the whole dataset
    select_accuracy = []   # accuracy on the batch selected next
    contra_info = []       # contradictory information of the batch selected next
    incor_confid_lst = []  # margins of the misclassified samples of each batch
    cross_mean = []
    cross_std = []

    for i, n_labeled in enumerate(iteration_step):
        X_train = X_lab[:n_labeled]
        y_train = y_lab[:n_labeled]

        # Cross validation score on the labeled samples
        if classifier in no_init_cv and i == 0:
            cross_mean.append(0)
            cross_std.append(0)
        else:
            score = cross_val_score(cf, X_train, y_train, cv=cv)
            cross_mean.append(np.mean(score))
            cross_std.append(np.std(score))

        # One fit per step serves every metric below (each metric previously
        # refitted the same model on the same data).
        cf.fit(X_train, y_train)

        avg_uncert.append(np.mean(classifier_uncertainty(cf, X_pool)))
        avg_confid = np.mean(classifier_margin(cf, X_pool))
        pred_confid.append(avg_confid)
        acc_pool.append(cf.score(X_pool, y_pool))

        # The remaining two metrics judge the batch that AL selected next, so
        # they do not exist for the final step.
        if i + 1 < len(iteration_step):
            X_next = X_lab[n_labeled:iteration_step[i+1]]
            y_next = y_lab[n_labeled:iteration_step[i+1]]

            select_accuracy.append(cf.score(X_next, y_next))

            incor_boolen = cf.predict(X_next) != y_next
            if incor_boolen.any():
                # margins of the misclassified samples, relative to the average margin
                incor_confid = classifier_margin(cf, X_next[incor_boolen])
                incor_confid_lst.append(incor_confid)
                contra_info.append(np.sum(incor_confid)/avg_confid)
            else:
                contra_info.append(0)
    print('finished active learning metrics')

    df_acc = pd.DataFrame({'cross_mean': cross_mean, 'cross_std': cross_std})

    # summerize all AL metrics
    # 'Selected Accuracy' has no value for the last step and 'Contradictory
    # Information' none for the first one, hence the NaN padding. The columns
    # are built directly because chained assignment such as
    # frame[col].iloc[...] = values is not guaranteed to modify the frame.
    AL_metrics = pd.DataFrame({'Overal Uncertainty': avg_uncert,
                               'Prediction Cofindence': pred_confid,
                               'Selected Accuracy': select_accuracy + [np.nan],
                               'Contradictory Information': [np.nan] + contra_info,
                               'Prediction Accuracy': acc_pool})
    AL_metrics = pd.concat([AL_metrics, df_acc], axis = 1)

    metrics_dict[noise] = AL_metrics # save the AL metrics dataframe
    ###################################################################################

    ############### Calculate the intrinsic error rate of whole datasets###############
    df_intrin_error = _mlp_intrinsic_error(X_pool, y_pool, hidden_unit, cv)
    intrin_err_dict[noise] = df_intrin_error
    print('finished the intrinsic error rate')
    ###################################################################################

    ############### Calculate the intrinsic error rate of AL datasets #################
    df_labeled_intrin_error = _mlp_intrinsic_error(X_lab, y_lab, hidden_unit, cv)
    labeled_intrin_err_dict[noise] = df_labeled_intrin_error
    print('finished the intrinsic error rate of AL datasets')
    ###################################################################################

############################ save the calculations ###########################################
# NOTE(review): the file names say 'circle_2D' although the datasets built in
# this notebook are 6-dimensional; the names are kept unchanged so that
# existing loaders keep working - confirm before renaming.
save_obj(dataset_labeled_dict, 'Artificial data_AL labeled dataset_circle_2D_GPC_matern')
save_obj(metrics_dict, 'Artificial data_AL metrics_circle_2D_GPC_matern')
save_obj(intrin_err_dict, 'Artificial data_intrinsic error_circle_2D_GPC_matern')
save_obj(labeled_intrin_err_dict, 'Artificial data_labeled_intrinsic error_circle_2D_GPC_matern')

# In [None]:
# metrics_dict is keyed by (class_sep, flip_y) tuples, so there is no key 0;
# export the metrics of the first dataset in the dictionary instead.
first_noise = next(iter(metrics_dict))
metrics_dict[first_noise].to_csv('metrics_circle_2D_0_GPC_matern.csv')

### Calculate F1, recall, and precision in each iteration of active learning ###

# In [None]:
# For every synthetic dataset in `dataset_dict`, rerun active learning and
# record F1, precision and recall after each labeling step:
#   * model_metrics_lst      - cross-validated on the labeled samples (mean/std)
#   * model_metrics_pool_lst - evaluated on the whole dataset
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import cross_validate

dataset_labeled_dict = {}
model_metrics_lst = {}
model_metrics_pool_lst = {}

classifier = 'GPC_matern' # SVM_best, RandomForestClassifier_best, GPC_best, GPC_matern

cf = load_obj(classifier)
cv = KFold(shuffle = True, n_splits=5, random_state=42)
#cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# The synthetic datasets have three classes. The plain 'f1' / 'precision' /
# 'recall' scorers and the default average='binary' of the metric functions
# only accept binary targets, so macro averaging over the classes is used.
average = 'macro'
cv_scoring = {'f1': 'f1_macro', 'precision': 'precision_macro', 'recall': 'recall_macro'}
pool_scorers = {'f1': f1_score, 'precision': precision_score, 'recall': recall_score}

# For the initial sampling we have a single sample of one class, so cross
# validation can end up with a training fold that lacks this class; that does
# not work for SVM or GPC. The initial step is therefore scored as 0 for them.
no_init_cv = ('SVM_best', 'GPC_best', 'GPC_matern')

# Labeling schedule: k initial samples, then `iteration_num` batches of
# `number_periter` samples. iteration_step[i] is the number of labeled samples
# available at step i.
k = 20
iteration_num = 18
number_periter = 5
iteration_step = list(np.arange(k,k+(1+iteration_num)*number_periter,number_periter))

for noise in tqdm(dataset_dict):
    df = dataset_dict[noise]
    df_X = df.drop(['score'], axis = 1)
    df_y = df.filter(['score'], axis = 1)
    X_pool = np.array(df_X)
    y_pool = np.array(df_y).ravel()

    ## select initial training data using Kennard-Stone algorithm
    init_idx, _ = sampling.kennardstonealgorithm(df_X, k)
    print('finished initial sampling')

    # use the sample_idx to select initial sampling from df_X
    df_X_init = df_X.iloc[init_idx]
    df_y_init = df_y.iloc[init_idx]

    ############### Active learning ##############################
    df_X_labeled = df_X_init.copy()
    df_y_labeled = df_y_init.copy()

    for iteration in range(iteration_num):
        # everything that has not been labeled yet forms the query pool
        df_X_unlabeled = df_X.drop(df_X_labeled.index, axis = 'index')
        df_X_AL = AL.minibatch_AL(pool = df_X_unlabeled, X_label = df_X_labeled, y_label = df_y_labeled, \
                                  model = cf, numb_periter = number_periter)
        # the returned frame carries an extra 'uncertainty' column that is not a feature
        df_X_AL = df_X_AL.drop(['uncertainty'], axis = 1)
        df_X_labeled = pd.concat([df_X_labeled, df_X_AL])
        # look up the labels in the same order as the labeled samples
        df_y_labeled = df_y.filter(df_X_labeled.index, axis = 'index')

    df_labeled = pd.concat([df_X_labeled, df_y_labeled], axis = 1)
    dataset_labeled_dict[noise] = df_labeled # save the AL labeled dataframe
    print('finished active learning')
    ################################################################

    ############### Active learning metrics measurement ##############################
    # Labeled samples in acquisition order; the first iteration_step[i] rows
    # are the training set of step i.
    X_lab = np.array(df_X_labeled)
    y_lab = np.array(df_y_labeled).ravel()

    # Calculate cross validataion score (f1, recall, precision)
    cv_rows = []
    for i, n_labeled in enumerate(iteration_step):
        if classifier in no_init_cv and i == 0:
            fold_scores = {name: 0 for name in cv_scoring}
        else:
            # a single cross-validation pass yields all three metrics
            cv_result = cross_validate(cf, X_lab[:n_labeled], y_lab[:n_labeled], \
                                       cv=cv, scoring=cv_scoring)
            fold_scores = {name: cv_result['test_' + name] for name in cv_scoring}

        # Stored for every step, including the skipped initial one (as 0),
        # matching how the accuracy table treats that step.
        row = {}
        for name, scores in fold_scores.items():
            row[name + '_mean'] = np.mean(scores)
            row[name + '_std'] = np.std(scores)
        cv_rows.append(row)

    df_metric = pd.DataFrame(cv_rows, \
                             columns = ['f1_mean', 'f1_std', 'precision_mean', 'precision_std', 'recall_mean', 'recall_std'])
    model_metrics_lst[noise] = df_metric
    print('finished cross validataion score')

    # Calculate metrics of the whole dataset
    pool_rows = []
    for n_labeled in iteration_step:
        cf.fit(X_lab[:n_labeled], y_lab[:n_labeled])
        # predict on the array form, the same form the model was fitted on
        df_y_predict = cf.predict(X_pool)
        pool_rows.append({name: scorer(y_pool, df_y_predict, average = average)
                          for name, scorer in pool_scorers.items()})

    df_metric_pool = pd.DataFrame(pool_rows, columns = ['f1', 'precision', 'recall'])
    print('finished prediction metrics of the whole dataset')

    model_metrics_pool_lst[noise] = df_metric_pool
    ###################################################################################


############################ save the calculations ###########################################
save_obj(model_metrics_lst, 'Artificial data_AL labeled_CV metrics_circle_6D_GPC_matern')
save_obj(model_metrics_pool_lst, 'Artificial data_pool_true metrics_circle_6D_GPC_matern')

### tSNE conversion ###

# In [None]:
# Project every saved synthetic dataset to two dimensions with t-SNE and
# store the embedding together with the class labels.
from sklearn.manifold import TSNE

dataset_dict = load_obj('Artificial data_dataset')
dataset_tSNE_dict = {}

for noise in tqdm(dataset_dict.keys()):
    df = dataset_dict[noise]
    features = df.drop(['score'], axis = 1)
    labels = df.filter(['score'], axis = 1)

    # 2-D embedding of the six feature columns
    embedder = TSNE(n_components=2, perplexity = 50, random_state = 42)
    coords = embedder.fit_transform(np.array(features))

    # put the label column next to the two embedding columns
    embedded = pd.DataFrame(coords, columns = ['dim 1', 'dim 2'])
    dataset_tSNE_dict[noise] = pd.concat([embedded, labels], axis = 1)

save_obj(dataset_tSNE_dict, 'Artificial data_tSNE dataset')