# Training Machine Learning Models from Scratch Using Active Learning on the HealthCLEF Dataset

In [43]:
%load_ext autoreload
%autoreload 2

import json 
import numpy as np
import random 
import re
import math
import pandas as pd
import pickle 
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import pickle 
from libact.models import LogisticRegression
from libact.models.svm import SVM
from libact.models import SklearnProbaAdapter
from libact.labelers import IdealLabeler
from libact.query_strategies import RandomSampling, UncertaintySampling
from libact.base.dataset import Dataset, import_libsvm_sparse
from libact.models.sklearn_adapter import SklearnAdapter
from sklearn.neural_network import MLPClassifier

from utils import * 

# Seed every global RNG this notebook relies on so that runs are repeatable:
# - `random` drives the shuffle of (vector, label) pairs before each topic run;
# - numpy's global generator is the fallback source of randomness for
#   scikit-learn estimators created without an explicit `random_state`.
seed = 100
random.seed(seed)
np.random.seed(seed)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load the document embeddings pre-trained/pre-calculated for this dataset
- Set `DATASET_DIR` to the path where all embeddings and the dataset were downloaded.
- Choose the desired embedding: BERT, BioBERT, Word2Vec, GloVe or TF-IDF.
- Comment/uncomment the desired embedding dictionary for active learning.

In [45]:
# Which document representation to use for the whole experiment:
# one of 'GLOVE', 'W2VEC', 'BERT', 'BioBERT' or 'TF-IDF'.
chosen_representation = "GLOVE"

# Absolute path of the folder where the dataset was downloaded and unzipped.
DATASET_DIR = ""

# The helper returns the embeddings dictionary together with its dimension.
embedding_bundle = return_embeddings_clef(chosen_representation, DATASET_DIR)
dict_embeddings, embedding_dim = embedding_bundle

In [None]:
# Display the keys of the embeddings dictionary (the document ids that have
# an embedding) as a quick sanity check that loading worked.
dict_embeddings.keys()

## metaparameters for active learning setup 

In [47]:
# Labeling budget of the active-learning run.
quota = 100  # total number of documents the oracle is asked to label
n_labeled = 5  # size of the initial labeled set
batch = int(quota / 10)  # documents requested from the oracle per iteration

## load clef dataset
This dataset relates topic ids (each one a medical question) to the documents that are relevant or non-relevant for that topic id. <br>
We assume the HealthCLEF dataset has been downloaded and unzipped under `DATASET_DIR`; the CSV is read from `DATASET_DIR/datasets/CLEF_dataset.csv`.

In [48]:
# CLEF medical questions: one row per (topic_id, pid) pair, '|'-separated.
clef_csv_path = '{}/datasets/CLEF_dataset.csv'.format(DATASET_DIR)
matrix_doc_df = pd.read_csv(clef_csv_path, sep='|')

# Keep only the rows whose document id has an embedding available.
list_documents = list(dict_embeddings.keys())
has_embedding = matrix_doc_df.pid.isin(list_documents)
matrix_doc_df = matrix_doc_df[has_embedding]

# Distinct topic ids that survive the filter; displayed as the cell output.
matrices = list(matrix_doc_df.topic_id.unique())
matrices

['CD010339',
 'CD011548',
 'CD011549',
 'CD009323',
 'CD009591',
 'CD009519',
 'CD010409',
 'CD009185',
 'CD009944',
 'CD012019',
 'CD008686',
 'CD009372',
 'CD008782',
 'CD010386',
 'CD010632',
 'CD010783',
 'CD011145',
 'CD010633',
 'CD010896',
 'CD010775',
 'CD009786',
 'CD011134',
 'CD010542',
 'CD008691',
 'CD009020',
 'CD007427',
 'CD010023',
 'CD008643',
 'CD008760',
 'CD009647',
 'CD009925',
 'CD011975',
 'CD011984',
 'CD008054',
 'CD007431',
 'CD010173',
 'CD010276',
 'CD007394',
 'CD009135',
 'CD009593',
 'CD010438',
 'CD010705',
 'CD010771',
 'CD008803',
 'CD009551',
 'CD010653',
 'CD008081',
 'CD010772',
 'CD010860']

## start active learning iterations

In [None]:
# Active-learning experiment.
#
# For every classifier name in `models` and every CLEF topic in `matrices`:
#   1. collect the embeddings and relevance labels of the topic's documents,
#   2. shuffle them and build the partially-labeled / fully-labeled datasets,
#   3. query the oracle up to `quota` times, retraining after every answer,
#   4. every 10th iteration (i = 0, 10, 20, ...) store the test labels and the
#      predicted probabilities.
# After all topics are processed the model object and the collected results
# are written to disk, one pair of files per classifier.

# define models
models = ['SVM_linear', 'RF', 'LR', 'MLP', 'SVM_rbf']


for model in models:

    # per-topic results: parallel lists with one element per stored snapshot
    final_results = {m: {'label': [], 'prediction': []} for m in matrices}

    # NOTE: a single model object is created per classifier and reused for
    # every topic, so the object pickled below is in whatever state the last
    # topic's final train() call left it.
    machine_learning_model = return_model(model)

    for m in matrices:

        matriz = matrix_doc_df[matrix_doc_df.topic_id == m]

        # document vectors/embeddings and their relevance labels
        vectors = [dict_embeddings[str(doc_id)] for doc_id in matriz.pid]
        labels = [int(x) for x in matriz.rel]

        # shuffle vectors and labels together so the pairs stay aligned
        c = list(zip(vectors, labels))
        random.shuffle(c)
        vectors, labels = zip(*c)

        # get dataset with observations without tags (None) and all tagged for ground-truth
        X, y, X_test, y_test, ds_unlabeled, fully_labeled_ds = dataset_preprocesing(vectors, labels, n_labeled)

        # the IdealLabeler takes labels from ground truth
        lbr = IdealLabeler(fully_labeled_ds)

        # select active learning strategy (uncertainty sampling or random sampling)
        qs = UncertaintySampling(ds_unlabeled, model=machine_learning_model)
        #qs = RandomSampling(ds_unlabeled)

        # update() only rewrites the label of an entry, never its features, so
        # the feature vectors are extracted once instead of on every query
        pool_features = [entry[0] for entry in ds_unlabeled.data]

        # never ask for more labels than there are unlabeled documents;
        # querying an exhausted pool would abort the run and lose every
        # result collected so far for this classifier
        pool_size = ds_unlabeled.len_unlabeled()
        if pool_size < quota:
            print('topic {}: only {} unlabeled documents (quota is {}), querying all of them'.format(m, pool_size, quota))
        n_queries = min(quota, pool_size)

        for i in range(n_queries):

            ask_id = qs.make_query()

            lb = lbr.label(pool_features[ask_id])
            ds_unlabeled.update(ask_id, lb)
            machine_learning_model.train(ds_unlabeled)

            # store a snapshot on iterations 0, 10, 20, ...; predictions are
            # only computed when they are actually kept
            if i % 10 == 0:
                # plain Python floats / lists: json cannot serialize numpy
                # arrays or non-float64 numpy scalars
                y_prob = [float(p[1]) for p in machine_learning_model.predict_proba(X_test)]
                final_results[m]['label'].append(np.asarray(y_test).tolist())
                final_results[m]['prediction'].append(y_prob)

    # save model
    print('saving model ...')
    # context manager guarantees the pickle file is flushed and closed
    with open('{}_{}.sav'.format(model, chosen_representation), 'wb') as model_file:
        pickle.dump(machine_learning_model, model_file)

    # save results
    print('saving results...')
    with open('results_{}_{}.json'.format(model, chosen_representation), 'w') as fp:
        json.dump(final_results, fp)