In [1]:
import tempfile
import sys
import subprocess
import shutil
from os.path import join
from elasticsearch import Elasticsearch, helpers
from nir.utils import create_filter_query_function, change_bm25_parameters
from utils import *
import pickle
from mmnrm.dataset import TrainCollectionV2, TrainSnippetsCollectionV2, TestCollectionV2
from mmnrm.evaluation import BioASQ_Evaluator

import json

# Elasticsearch client handed to every `execute_search` call in this notebook.
# NOTE(review): the endpoint is hard-coded; consider reading it from configuration
# or an environment variable so the notebook can run against another cluster.
es = Elasticsearch(["http://193.136.175.98:8125"])

# Index name passed to `execute_search` together with `es`.
index_name = "bioasq_9b"


In [2]:
def load_data(batch_index, training_data_path = "training9b_wDates.json"):
    """Split the training questions into a train part and one held-out 8B batch.

    The question ids found in ``yearly_data/8B1_golden.json`` ...
    ``yearly_data/8B5_golden.json`` are read into one set per file. The set at
    position ``batch_index`` becomes the validation split; every other query
    loaded from ``training_data_path`` goes to the training split.

    Args:
        batch_index: zero-based position of the held-out batch (0 selects ``8B1``).
        training_data_path: file handed to ``load_queries``.

    Returns:
        ``(train_data_queries, train_data_gs, validation_data_queries,
        validation_data_gs)``. ``train_data_gs`` is the gold standard returned by
        ``separate_queries_goldstandard`` with every value wrapped as ``{1: value}``.
        ``validation_data_gs`` is a list of dicts with the keys ``id``, ``query``,
        ``documents`` (last ``/``-separated segment of each entry) and ``limit_date``.
    """
    queries = load_queries(training_data_path, maps=[("body","query")])

    # One set of question ids per golden file, in batch order 1..5.
    batch_ids = []
    for batch_number in range(1, 6):
        with open(f"yearly_data/8B{batch_number}_golden.json","r") as golden_file:
            golden = json.load(golden_file)
        batch_ids.append({question["id"] for question in golden["questions"]})

    # Total number of golden question ids over the five batches.
    print(sum(len(ids) for ids in batch_ids))

    all_query_ids = {query["id"] for query in queries}
    held_out_ids = batch_ids[batch_index]

    train_data = subset_byId(queries, all_query_ids - held_out_ids)
    validation_data = subset_byId(queries, held_out_ids)

    train_data_queries, raw_train_gs = separate_queries_goldstandard(
        train_data, additional_keys=["limit_date"]
    )
    # Wrap each gold-standard value under the key 1.
    train_data_gs = {query_id: {1: value} for query_id, value in raw_train_gs.items()}
    print(len(train_data))

    validation_data_queries, _ = separate_queries_goldstandard(
        validation_data, additional_keys=["limit_date"]
    )

    # Keep only the trailing path segment of every document entry.
    validation_data_gs = [
        {
            "id": entry["id"],
            "query": entry["query"],
            "documents": [doc.split("/")[-1] for doc in entry["documents"]],
            "limit_date": entry["limit_date"],
        }
        for entry in validation_data
    ]

    print(len(validation_data))

    return train_data_queries, train_data_gs, validation_data_queries, validation_data_gs

def load_training_data_WSnippets(batch_index, training_data_path = "training9b_wDates.json"):
    """Load the training questions together with a snippet-level gold standard.

    The question ids of ``yearly_data/8B1_golden.json`` ... ``8B5_golden.json`` are
    read into one set per file; the set at position ``batch_index`` is excluded and
    all remaining queries from ``training_data_path`` form the training split.

    Args:
        batch_index: zero-based position of the excluded batch (0 selects ``8B1``).
        training_data_path: file handed to ``load_queries``.

    Returns:
        ``(train_data_queries, train_data_gs)`` where ``train_data_queries`` is a
        list of dicts with the keys ``id``, ``query`` and ``limit_date``, and
        ``train_data_gs`` maps ``query id -> {document id -> [snippet, ...]}``.
        Documents without any usable snippet are left out, and queries left with
        no document are dropped from both return values.
    """

    def get_snippets_by_docid(snippets, doc_id):
        """Return the snippets of `doc_id` whose end offset exceeds their begin offset."""
        return [
            snippet
            for snippet in snippets
            if snippet["document"].split("/")[-1] == doc_id
            and (snippet["offsetInEndSection"] - snippet["offsetInBeginSection"]) > 0
        ]

    # Named differently from the star-imported `separate_queries_goldstandard`
    # so the module-level helper is not shadowed inside this function.
    def split_queries_and_snippet_gs(queries, additional_keys=None):
        """Build the cleaned query list and the per-document snippet gold standard."""
        kept_keys = ["id", "query"] + list(additional_keys or [])

        clean_queries = []
        gs = {}
        docs_without_snippets = 0
        queries_without_snippets = 0

        for query in queries:
            query_id = query["id"]
            gs[query_id] = {}

            for doc_id in [doc.split("/")[-1] for doc in query["documents"]]:
                doc_snippets = get_snippets_by_docid(query["snippets"], doc_id)
                if doc_snippets:
                    gs[query_id][doc_id] = doc_snippets
                else:
                    docs_without_snippets += 1

            if gs[query_id]:
                clean_queries.append({key: query[key] for key in kept_keys})
            else:
                # No document of this query kept a snippet: remove the query.
                queries_without_snippets += 1
                del gs[query_id]

        print("Num empty queries",queries_without_snippets)
        print("Num docs without snippets",docs_without_snippets)

        return clean_queries, gs

    queries = load_queries(training_data_path, maps=[("body","query")])

    # One set of question ids per golden file, in batch order 1..5.
    batch_ids = []
    for batch_number in range(1, 6):
        with open(f"yearly_data/8B{batch_number}_golden.json","r") as golden_file:
            golden = json.load(golden_file)
        batch_ids.append({question["id"] for question in golden["questions"]})

    # Total number of golden question ids over the five batches.
    print(sum(len(ids) for ids in batch_ids))

    all_query_ids = {query["id"] for query in queries}
    train_ids = all_query_ids - batch_ids[batch_index]

    train_data = subset_byId(queries, train_ids)

    train_data_queries, train_data_gs = split_queries_and_snippet_gs(
        train_data, additional_keys=["limit_date"]
    )

    return train_data_queries, train_data_gs

In [3]:
def build_data_collections(train_data_queries, train_data_gs, validation_data_queries, validation_data_gs, K1, BETA, TOP_N):
    """Run the search for both splits and wrap the results in mmnrm collections.

    Args:
        train_data_queries: queries searched for and stored in the training collection.
        train_data_gs: gold standard given to ``TrainCollectionV2``.
        validation_data_queries: queries searched for and stored in the test collection.
        validation_data_gs: gold standard given to ``BioASQ_Evaluator``.
        K1: forwarded to ``execute_search`` as ``k1``.
        BETA: forwarded to ``execute_search`` as ``b``.
        TOP_N: third positional argument of ``execute_search``
            (presumably the number of documents retrieved per query; confirm in utils).

    Returns:
        ``(t_collection, validation_collection)``, both after ``.batch_size(32)``.
    """
    print(K1,BETA,TOP_N)

    def search(queries):
        # Both splits use the module-level client/index and the same settings.
        return execute_search(es, queries, TOP_N, index_name, k1=K1, b=BETA)

    ## TRAINING DATA
    train_results = search(train_data_queries)
    t_collection = TrainCollectionV2(
        train_data_queries,
        train_data_gs,
        train_results,
        use_relevance_groups=False,
    ).batch_size(32)

    ## VALIDATION DATA
    # The validation search runs only after the training collection is built.
    validation_results = search(validation_data_queries)
    evaluator = BioASQ_Evaluator(validation_data_gs)
    validation_collection = TestCollectionV2(
        validation_data_queries,
        validation_results,
        evaluator,
    ).batch_size(32)

    return t_collection, validation_collection

# Round 1

In [5]:
# make a BM25 search
# Round 1: zero-based batch index 0 holds out the ids of 8B1_golden.json (see load_data).
batch_index = 0


# Each tuple is (k1, beta, top_n) as unpacked by the loop below.
experiments = [(0.9, 0.4, 100), (0.6, 0.4, 250)]

for k1,beta,top_n in experiments:
    
    # load_data is called again on every iteration although its argument does not change.
    train_queries, train_gs, validation_queries, validation_gs = load_data(batch_index)
    
    t_collection, v_collection = build_data_collections(train_queries, 
                                                        train_gs, 
                                                        validation_queries, 
                                                        validation_gs, 
                                                        k1, 
                                                        beta, 
                                                        top_n)
    
    # NOTE(review): unlike the later rounds, these names carry a `_del_` prefix and
    # only encode top_n, so two experiments sharing top_n would overwrite each other.
    t_collection.save(f"_del_training_batch_0{batch_index+1}_{top_n}")
    v_collection.save(f"_del_validation_batch_0{batch_index+1}_{top_n}")

500
3643
100
Setting the k1 and b for BM25
Minimum number of relevance type(0) in the queries of the goldstandard sub set: 169
Mean number of relevance type(0) in the queries of the goldstandard sub set: 242.0714540907467
Minimum number of relevance type(1) in the queries of the goldstandard sub set: 1
Mean number of relevance type(1) in the queries of the goldstandard sub set: 7.928545909253304
Sub Collection size 544079
Number of skipped question, due to lack of true positives 844
Setting the k1 and b for BM25
Running query: 80

# Round 2

In [4]:
# make a BM25 search
# Round 2: zero-based batch index 1 holds out the ids of 8B2_golden.json (see load_data).
batch_index = 1


# Each tuple is (k1, beta, top_n) as unpacked by the loop below.
experiments = [(0.4, 0.4, 250), (0.9, 0.53, 100), (0.3, 0.67, 100)]

for k1,beta,top_n in experiments:
    
    # load_data is called again on every iteration although its argument does not change.
    train_queries, train_gs, validation_queries, validation_gs = load_data(batch_index)
    
    t_collection, v_collection = build_data_collections(train_queries, 
                                                        train_gs, 
                                                        validation_queries, 
                                                        validation_gs, 
                                                        k1, 
                                                        beta, 
                                                        top_n)
    
    # File names encode the one-based batch number and the (k1, beta, top_n) triple.
    t_collection.save(f"training_batch_0{batch_index+1}_{k1}_{beta}_{top_n}")
    v_collection.save(f"validation_batch_0{batch_index+1}_{k1}_{beta}_{top_n}")

500
3643
100
0.4 0.4 250
Setting the k1 and b for BM25
Minimum number of relevance type(0) in the queries of the goldstandard sub set: 161
Mean number of relevance type(0) in the queries of the goldstandard sub set: 242.3090856814261
Minimum number of relevance type(1) in the queries of the goldstandard sub set: 1
Mean number of relevance type(1) in the queries of the goldstandard sub set: 7.690914318573893
Sub Collection size 665612
Number of skipped question, due to lack of true positives 165
Setting the k1 and b for BM25
500ning query: 80
3643
100
0.9 0.53 100
Setting the k1 and b for BM25
Minimum number of relevance type(0) in the queries of the goldstandard sub set: 43
Mean number of relevance type(0) in the queries of the goldstandard sub set: 93.29015240328253
Minimum number of relevance type(1) in the queries of the goldstandard sub set: 1
Mean number of relevance type(1) in the queries of the goldstandard sub set: 6.709847596717467
Sub Collection size 280215
Number of skipped 

# Round 3

In [4]:
# make a BM25 search
# Round 3: zero-based batch index 2 holds out the ids of 8B3_golden.json (see load_data).
batch_index = 2


# Each tuple is (k1, beta, top_n) as unpacked by the loop below.
experiments = [(0.4, 0.14, 250), (0.9, 0.09, 100), (0.5, 0.79, 100)]

for k1,beta,top_n in experiments:
    
    # load_data is called again on every iteration although its argument does not change.
    train_queries, train_gs, validation_queries, validation_gs = load_data(batch_index)
    
    t_collection, v_collection = build_data_collections(train_queries, 
                                                        train_gs, 
                                                        validation_queries, 
                                                        validation_gs, 
                                                        k1, 
                                                        beta, 
                                                        top_n)
    
    # File names encode the one-based batch number and the (k1, beta, top_n) triple.
    t_collection.save(f"training_batch_0{batch_index+1}_{k1}_{beta}_{top_n}")
    v_collection.save(f"validation_batch_0{batch_index+1}_{k1}_{beta}_{top_n}")

500
3643
100
0.4 0.14 250
Setting the k1 and b for BM25
The inquery limit_date will be used
Minimum number of relevance type(0) in the queries of the goldstandard sub set: 160
Mean number of relevance type(0) in the queries of the goldstandard sub set: 242.31793400286944
Minimum number of relevance type(1) in the queries of the goldstandard sub set: 1
Mean number of relevance type(1) in the queries of the goldstandard sub set: 7.682065997130559
Sub Collection size 660164
Number of skipped question, due to lack of true positives 158
Setting the k1 and b for BM25
The inquery limit_date will be used
500ning query: 80
3643
100
0.9 0.09 100
Setting the k1 and b for BM25
The inquery limit_date will be used
Minimum number of relevance type(0) in the queries of the goldstandard sub set: 44
Mean number of relevance type(0) in the queries of the goldstandard sub set: 93.27347659699736
Minimum number of relevance type(1) in the queries of the goldstandard sub set: 1
Mean number of relevance type(

# Round 4

In [4]:
# make a BM25 search
# Round 4: zero-based batch index 3 holds out the ids of 8B4_golden.json (see load_data).
batch_index = 3


# Each tuple is (k1, beta, top_n) as unpacked by the loop below.
experiments = [(0.6, 0.18, 250), (0.9, 0.26, 100), (0.6, 0.69, 100)]

for k1,beta,top_n in experiments:
    
    # load_data is called again on every iteration although its argument does not change.
    train_queries, train_gs, validation_queries, validation_gs = load_data(batch_index)
    
    t_collection, v_collection = build_data_collections(train_queries, 
                                                        train_gs, 
                                                        validation_queries, 
                                                        validation_gs, 
                                                        k1, 
                                                        beta, 
                                                        top_n)
    
    # File names encode the one-based batch number and the (k1, beta, top_n) triple.
    t_collection.save(f"training_batch_0{batch_index+1}_{k1}_{beta}_{top_n}")
    v_collection.save(f"validation_batch_0{batch_index+1}_{k1}_{beta}_{top_n}")

500
3643
100
0.6 0.18 250
Setting the k1 and b for BM25
The inquery limit_date will be used
Minimum number of relevance type(0) in the queries of the goldstandard sub set: 161
Mean number of relevance type(0) in the queries of the goldstandard sub set: 242.33486107132626
Minimum number of relevance type(1) in the queries of the goldstandard sub set: 1
Mean number of relevance type(1) in the queries of the goldstandard sub set: 7.665138928673732
Sub Collection size 660263
Number of skipped question, due to lack of true positives 152
Setting the k1 and b for BM25
The inquery limit_date will be used
500ning query: 80
3643
100
0.9 0.26 100
Setting the k1 and b for BM25
The inquery limit_date will be used
Minimum number of relevance type(0) in the queries of the goldstandard sub set: 43
Mean number of relevance type(0) in the queries of the goldstandard sub set: 93.24625330590655
Minimum number of relevance type(1) in the queries of the goldstandard sub set: 1
Mean number of relevance type(

# Round 5

In [5]:
# make a BM25 search
# Round 5: zero-based batch index 4 holds out the ids of 8B5_golden.json (see load_data).
batch_index = 4


# Each tuple is (k1, beta, top_n) as unpacked by the loop below.
experiments = [(0.6, 0.51, 250), (0.5, 0.15, 100), (0.4, 0.44, 100)]

for k1,beta,top_n in experiments:
    
    # load_data is called again on every iteration although its argument does not change.
    train_queries, train_gs, validation_queries, validation_gs = load_data(batch_index)
    
    t_collection, v_collection = build_data_collections(train_queries, 
                                                        train_gs, 
                                                        validation_queries, 
                                                        validation_gs, 
                                                        k1, 
                                                        beta, 
                                                        top_n)
    
    # File names encode the one-based batch number and the (k1, beta, top_n) triple.
    t_collection.save(f"training_batch_0{batch_index+1}_{k1}_{beta}_{top_n}")
    v_collection.save(f"validation_batch_0{batch_index+1}_{k1}_{beta}_{top_n}")

500
3643
100
0.6 0.51 250
Setting the k1 and b for BM25
The inquery limit_date will be used
Minimum number of relevance type(0) in the queries of the goldstandard sub set: 164
Mean number of relevance type(0) in the queries of the goldstandard sub set: 242.29238258877433
Minimum number of relevance type(1) in the queries of the goldstandard sub set: 1
Mean number of relevance type(1) in the queries of the goldstandard sub set: 7.707617411225659
Sub Collection size 668526
Number of skipped question, due to lack of true positives 151
Setting the k1 and b for BM25
The inquery limit_date will be used
500ning query: 80
3643
100
0.5 0.15 100
Setting the k1 and b for BM25
The inquery limit_date will be used
Minimum number of relevance type(0) in the queries of the goldstandard sub set: 44
Mean number of relevance type(0) in the queries of the goldstandard sub set: 93.12040035325288
Minimum number of relevance type(1) in the queries of the goldstandard sub set: 1
Mean number of relevance type(

In [4]:
# snippets training data
batch_index = 4 # zero-based index: excludes the ids of 8B5_golden.json, i.e. Round 5 (not round 4)

# Only a training split is produced here; the function returns (queries, snippet gold standard).
train_queries, train_gs = load_training_data_WSnippets(batch_index)




500
Num empty queries 0
Num docs without snippets 6869


In [5]:
# Same (k1, beta, top_n) triple as the first experiment of the Round 5 cell.
K1,BETA,TOP_N = (0.6, 0.51, 250)

# Uses train_queries / train_gs produced by the load_training_data_WSnippets cell.
query_results = execute_search(es, train_queries, TOP_N, index_name, k1=K1, b=BETA)
print("build")
# Snippet-level training collection built with use_soft_label=False.
t_collection = TrainSnippetsCollectionV2(train_queries, 
                               train_gs, 
                               query_results,
                               use_soft_label=False,
                               use_relevance_groups=False)\
                        .batch_size(32)

Setting the k1 and b for BM25
The inquery limit_date will be used
buildng query: 3640
index to remove length 183
Minimum number of relevance type(0) in the queries of the goldstandard sub set: 192
Mean number of relevance type(0) in the queries of the goldstandard sub set: 243.62687861271675
Minimum number of relevance type(1) in the queries of the goldstandard sub set: 1
Mean number of relevance type(1) in the queries of the goldstandard sub set: 6.373121387283237
Sub Collection size 663270
Number of skipped question, due to lack of true positives 183


In [6]:
# Persist the snippet collection; the name encodes the one-based batch number and K1/BETA/TOP_N.
t_collection.save(f"joint_training_hardlabel_batch_0{batch_index+1}_{K1}_{BETA}_{TOP_N}")


## Random code 

In [3]:
# NOTE(review): `validation_data_queries` and `validation_data` are never assigned at
# top level in the cells visible here (they are only locals of load_data), so this
# cell depends on leftover kernel state and fails on Restart & Run All.
TOP_N = 25

query_results = execute_search(es, validation_data_queries, TOP_N, index_name, k1=0.6, b=0.4)

# Same reshaping as in load_data: keep only the last path segment of each document entry.
validation_data_gs = list(map(lambda x:{"id":x["id"], 
                                        "query":x["query"], 
                                        "documents":[y.split("/")[-1] for y in x["documents"]],
                                        "limit_date":x["limit_date"]},
                                       validation_data))

evaluator = BioASQ_Evaluator(validation_data_gs)

validation_collection = TestCollectionV2(validation_data_queries, 
                                           query_results,
                                           evaluator)\
                                    .batch_size(32)

# The batch number "01" is hard-coded in the file name; only TOP_N is interpolated.
validation_collection.save("validation_data_batch_01_"+str(TOP_N))

Setting the k1 and b for BM25
Running query: 80

In [None]:
# NOTE(review): this unexecuted cell is a verbatim copy of the preceding TOP_N = 25 cell
# and writes to the same file name; it is a candidate for removal.
# NOTE(review): `validation_data_queries` and `validation_data` are never assigned at
# top level in the cells visible here, so the cell depends on leftover kernel state.
TOP_N = 25

query_results = execute_search(es, validation_data_queries, TOP_N, index_name, k1=0.6, b=0.4)

validation_data_gs = list(map(lambda x:{"id":x["id"], 
                                        "query":x["query"], 
                                        "documents":[y.split("/")[-1] for y in x["documents"]],
                                        "limit_date":x["limit_date"]},
                                       validation_data))

evaluator = BioASQ_Evaluator(validation_data_gs)

validation_collection = TestCollectionV2(validation_data_queries, 
                                           query_results,
                                           evaluator)\
                                    .batch_size(32)

validation_collection.save("validation_data_batch_01_"+str(TOP_N))

In [3]:
# NOTE(review): `train_data_queries` and `train_data_gs` are never assigned at top level
# in the cells visible here (they are only locals of load_data / build_data_collections),
# so this cell depends on leftover kernel state.
TOP_N = 50

# Rebinds the name to its first 32 entries; the full query list is no longer reachable
# through this name afterwards.
train_data_queries = train_data_queries[:32]

query_results = execute_search(es, train_data_queries, TOP_N, index_name, k1=0.6, b=0.4)

t_collection = TrainCollectionV2(train_data_queries, 
                               train_data_gs, 
                               query_results, 
                               use_relevance_groups=False)\
                        .batch_size(32)

# The batch number "01" is hard-coded in the file name; only TOP_N is interpolated.
t_collection.save("training_batch_01_"+str(TOP_N))

Setting the k1 and b for BM25
Minimum number of relevance type(0) in the queries of the goldstandard sub set: 32
Mean number of relevance type(0) in the queries of the goldstandard sub set: 43.758620689655174
Minimum number of relevance type(1) in the queries of the goldstandard sub set: 1
Mean number of relevance type(1) in the queries of the goldstandard sub set: 6.241379310344827
Sub Collection size 1450
Number of skipped question, due to lack of true positives 3
