# Pre-trained Machine Learning models using Active Learning for HealthCLEF dataset 

In [None]:
%load_ext autoreload
%autoreload 2

import os 
import pandas as pd
import json 
import numpy as np
from shutil import copyfile
import random
import pickle

from utils import *

# Seed Python's built-in `random` module so that repeated runs are reproducible
# (presumably for a shuffle performed inside the utils helpers — confirm).
# NOTE(review): only `random` is seeded here; NumPy's generator is not.
seed = 100
random.seed(seed)

## Choose parameters

In [None]:
# Root directory of the dataset. It is passed to the utils helpers and used
# below to build the path '<DATASET_DIR>/datasets/CLEF_dataset.csv'.
DATASET_DIR = ''  # absolute path where the dataset was downloaded and unzipped

# Identifier of the classifier to evaluate; forwarded to machine_learning_model_clef.
chosen_model = 'LR' # one of 'LR', 'MLP', 'SVM_linear', 'SVM_rbf' or 'RF'

# Identifier of the document representation; forwarded to both utils helpers.
chosen_representation = 'GLOVE' # one of 'GLOVE', 'W2VEC', 'BERT', 'BioBERT' or 'TF-IDF'

# Mapping used below as dict_embeddings[str(pid)] -> document vector.
dict_embeddings = return_embeddings_clef(chosen_representation, DATASET_DIR)

In [None]:
# Evaluate the chosen model on every topic and collect per-topic ranking
# metrics (recall@k, precision@k, average precision, last relevant position).

# Load the relevance judgements and keep only the columns used below.
matrix_doc_df = pd.read_csv('{}/datasets/CLEF_dataset.csv'.format(DATASET_DIR), sep='|')
matrix_doc_df = matrix_doc_df[['topic_id', 'pid', 'rel']]

# Keep only the documents that have an embedding. The embeddings are looked up
# with str(pid) further down, so the membership test compares pids as strings
# too; otherwise an integer-typed pid column would never match the keys.
list_documents = list(dict_embeddings.keys())
matrix_doc_df = matrix_doc_df[matrix_doc_df.pid.astype(str).isin(list_documents)]
matrices = list(matrix_doc_df.topic_id.unique())

# One entry per topic is appended to each of these lists.
recalls10 = []
recalls20 = []
recalls30 = []

precisions10 = []
precisions20 = []
precisions30 = []

avg_precisions = []
last_rels = []


for i, m in enumerate(matrices):

    # Progress indicator, overwritten in place on each iteration.
    print(i, end='\r')

    # Rows (documents and relevance labels) belonging to the current topic.
    matrix = matrix_doc_df[matrix_doc_df.topic_id == m]

    vectors = [dict_embeddings[str(doc_id)] for doc_id in matrix.pid]
    labels = [int(x) for x in matrix.rel]

    # Only the test split is used in this cell; X and y are not read below.
    X, y, X_test, y_test = dataset_preprocesing(vectors, labels)

    machine_learning_model = machine_learning_model_clef(chosen_model, m, chosen_representation, DATASET_DIR)

    # Score of each test document: second column of predict_proba.
    score = [x[1] for x in machine_learning_model.predict_proba(X_test)]

    # Build the ranking once and reuse it for every metric.
    # NOTE(review): Sort comes from utils; it presumably orders the
    # (label, score) pairs by descending score — confirm.
    ranked_rels = [rel for rel, _ in Sort(list(zip(y_test, score)))]

    # NOTE: a topic whose test split contains no label 1 makes this zero and
    # the recall computations below raise ZeroDivisionError.
    n_relevant = y_test.count(1)

    recall10 = ranked_rels[0:10].count(1)/n_relevant
    recall20 = ranked_rels[0:20].count(1)/n_relevant
    recall30 = ranked_rels[0:30].count(1)/n_relevant

    # Precision divides by the cutoff k even when fewer than k documents exist.
    precision10 = ranked_rels[0:10].count(1)/10
    precision20 = ranked_rels[0:20].count(1)/20
    precision30 = ranked_rels[0:30].count(1)/30

    avg_precisions.append(average_precision(ranked_rels))
    last_rels.append(last_rel(ranked_rels))

    recalls10.append(recall10)
    recalls20.append(recall20)
    recalls30.append(recall30)

    precisions10.append(precision10)
    precisions20.append(precision20)
    precisions30.append(precision30)
    

In [None]:
# Print every metric as the arithmetic mean of its per-topic values,
# in the order: recall@k, precision@k, MAP, last relevant position.
for template, per_topic in (
    ('r@10: {}', recalls10),
    ('r@20: {}', recalls20),
    ('r@30: {}', recalls30),
    ('p@10: {}', precisions10),
    ('p@20: {}', precisions20),
    ('p@30: {}', precisions30),
    ('map: {}', avg_precisions),
    ('lastrel: {}', last_rels),
):
    mean_value = sum(per_topic)/len(per_topic)
    print(template.format(mean_value))
