# Training Machine Learning Models from scratch using Active Learning on Epistemonikos dataset 

In [None]:
%load_ext autoreload
%autoreload 2

from utils import * 

import json 
import numpy as np
import random 
import re
import math
import pandas as pd
import pickle 
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import pickle 
from libact.models import LogisticRegression
from libact.models.svm import SVM
from libact.models import SklearnProbaAdapter
from libact.labelers import IdealLabeler
from libact.query_strategies import RandomSampling, UncertaintySampling
from libact.base.dataset import Dataset, import_libsvm_sparse
from libact.models.sklearn_adapter import SklearnAdapter
from sklearn.neural_network import MLPClassifier

# Fixed seed so repeated runs of the notebook are comparable.
# `random` drives the shuffling of (vector, label) pairs, while pandas'
# `DataFrame.sample` falls back to NumPy's global generator when no
# `random_state` is given, so both generators are seeded here.
seed = 100
random.seed(seed)
np.random.seed(seed)

## load document embeddings pre-trained/pre-calculated for this dataset
- Choose desired embedding BERT, BioBERT, Word2Vec, GloVe or TF-IDF.
- Comment/Uncomment the desired embedding dictionary for active learning.
- We assume the .json files have been downloaded to the same folder as the script.

In [None]:
# Document representation to load; one of
# 'GLOVE', 'W2VEC', 'BERT', 'BioBERT' or 'TF-IDF'.
chosen_representation = "TF-IDF"

# Absolute path where the dataset was downloaded and unzipped.
DATASET_DIR = ""

# `dict_embeddings` is later indexed by document id to obtain that document's
# vector; `embedding_dim` is presumably the vector size (confirm in utils).
dict_embeddings, embedding_dim = return_embeddings_episte(
    chosen_representation,
    DATASET_DIR,
)

## metaparameters for active learning setup 

In [None]:
# Size of the initial labeled pool handed to `dataset_preprocesing`.
n_labeled = 5

# Total number of documents the oracle is asked to label per question.
quota = 100

# Number of documents asked to the oracle on each iteration (a tenth of the quota).
batch = quota // 10

# Bounds on the number of documents per question for the Epistemonikos
# dataset; both bounds are applied as strict inequalities further below.
min_docs = 5
max_docs = 2200


## load epistemonikos dataset
- This dataset contains only the relevant documents for each medical question.
- For the active learning framework, we take the relevant documents of each medical question and sample non-relevant documents from those linked to other questions.
- We assume the Epistemonikos dataset has been downloaded in the same folder as the script. 

In [None]:
import os

# Build the path with os.path.join: with the default DATASET_DIR = '' the
# previous '{}/datasets/...' template pointed at '/datasets/...' in the
# filesystem root instead of a path relative to the working directory.
dataset_path = os.path.join(DATASET_DIR, 'datasets', 'Epistemonikos_dataset.csv')
matrix_doc_df = pd.read_csv(dataset_path, sep=';')
list_documents = list(dict_embeddings.keys())

# Keep only the rows whose document has a pre-computed embedding, so every
# later lookup in dict_embeddings succeeds.
matrix_doc_df = matrix_doc_df[matrix_doc_df.document.isin(list_documents)]


## start active learning iterations 

In [None]:
import time 

# Identifiers of the classifiers to evaluate; each one is resolved to a model
# object by `return_model`.
models = ['RF', 'LR', 'MLP', 'SVM_linear', 'SVM_rbf']

# Keep only the medical questions whose number of documents lies strictly
# between min_docs and max_docs. A single grouped count (in order of first
# appearance) replaces two full scans of the frame per question.
docs_per_matrix = matrix_doc_df.groupby('matrix', sort=False).size()
matrices = [m for m, n_docs in docs_per_matrix.items() if min_docs < n_docs < max_docs]

for model in models:

    # NOTE(review): one model object is reused for every question, so the
    # pickle written below reflects the last question only -- confirm intent.
    machine_learning_model = return_model(model)

    # Per question: ground-truth labels and predicted probabilities collected
    # at every 10th query.
    final_results = {m: {'label': [], 'prediction': []} for m in matrices}

    for m in matrices:

        belongs_to_question = matrix_doc_df.matrix == m
        relevants = matrix_doc_df.loc[belongs_to_question]

        # Negative examples: documents of other questions, 20 per relevant
        # document, with their relevance forced to 0.
        non_relevants = (
            matrix_doc_df.loc[~belongs_to_question]
            .sample(n=len(relevants) * 20)
            .assign(relevance=0)
        )

        matrix_concat = pd.concat([relevants, non_relevants])

        # Document vectors and their labels, in the same row order.
        vectors = [dict_embeddings[doc_id] for doc_id in matrix_concat.document]
        labels = [int(x) for x in matrix_concat.relevance]

        # Shuffle vectors and labels *together*. The shuffled vectors must be
        # the ones passed on; previously they were bound to a misspelled name,
        # which paired the unshuffled vectors with shuffled labels.
        paired = list(zip(vectors, labels))
        random.shuffle(paired)
        shuffled_vectors, labels = zip(*paired)
        vectors = list(shuffled_vectors)

        # Get the dataset with unlabeled observations (None) and the fully
        # labeled one used as ground truth.
        X, y, X_test, y_test, ds_unlabeled, fully_labeled_ds = dataset_preprocesing(vectors, labels, n_labeled)

        # Ideal labeler that pulls ground-truth labels.
        lbr = IdealLabeler(fully_labeled_ds)

        # Active learning strategy (RandomSampling(ds_unlabeled) is the
        # alternative baseline).
        qs = UncertaintySampling(ds_unlabeled, model=machine_learning_model)

        for i in range(quota):

            # Ask the strategy which entry to label, fetch its true label from
            # the oracle, record it and retrain on the updated pool.
            ask_id = qs.make_query()
            X, labels_new = zip(*ds_unlabeled.data)
            lb = lbr.label(X[ask_id])
            ds_unlabeled.update(ask_id, lb)
            machine_learning_model.train(ds_unlabeled)

            # Store results at every 10th query (i = 0, 10, 20, ...). The
            # probabilities are only needed here, so they are only computed here.
            if i % 10 == 0:
                # Plain Python floats/lists keep json.dump working even if the
                # helpers hand back NumPy scalars or arrays.
                y_prob = [float(p[1]) for p in machine_learning_model.predict_proba(X_test)]
                final_results[m]['label'].append(np.asarray(y_test).tolist())
                final_results[m]['prediction'].append(y_prob)

                print(final_results)

    # save model
    print('saving model ...')
    # Context manager so the file handle is closed (and flushed) deterministically.
    with open('{}{}.sav'.format(model, chosen_representation), 'wb') as model_file:
        pickle.dump(machine_learning_model, model_file)

    # save results
    print('saving results...')
    with open('results_{}_{}.json'.format(model, chosen_representation), 'w') as fp:
        json.dump(final_results, fp)
