# Test retrieval from BM25
This notebook uses BM25 to retrieve a set of candidate documents that can be passed to downstream models or used directly for snippet retrieval.

## Imports

In [1]:
%load_ext autoreload
%autoreload 2

import copy
import logging
from pprint import pprint

# Notebook-level logger with its own threshold set to INFO.
# NOTE(review): no handler / basicConfig is set up in this notebook, so whether
# INFO records are actually displayed depends on the kernel's logging setup — confirm.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

from src.cemb_bm25 import centroid_retrieval
from src.elastic_search_utils.elastic_utils import load_json, save_json

In [2]:
# Report the interpreter this kernel is actually running on.
# A shell `which python` only shows the first `python` found on the subshell's
# PATH, which is not guaranteed to be the kernel's interpreter.
import sys

print(sys.executable)

/datasets/anaconda3/envs/tf2.8/bin/python


## Params

In [3]:
# JSON file with the BM25 results for the test set; it is read with load_json
# further down and accessed through its 'questions' entry.
BM25_QUESTIONS = (
    '/datasets/johan_tests_original_format'
    '/test_docs_10b-testset4.json'
)

In [4]:
# Folder holding the previously trained artefacts.
LOADING_FOLDER = '/datasets/johan_tests_original_format/merged_training_docs'

# One `.bin` model path per text field (abstract, title, question); the three
# file names differ only in the field name.
# NOTE(review): these three paths are not referenced by any other visible cell.
LOADING_ABSTRACT_W2V_PATH, LOADING_TITLE_W2V_PATH, LOADING_QUESTION_W2V_PATH = (
    f'{LOADING_FOLDER}/Bio_Word2Vec_doc_{field}_model_10b_train.bin'
    for field in ('abstract', 'title', 'question')
)

## Output paths

In [5]:
# Folder that receives the three JSON files written at the end of the notebook.
SAVING_FOLDER = '/datasets/johan_tests_original_format/merged_training_docs'

# One output file per variant of the question set; the file names differ only
# in the variant tag (original / tokens / entity).
SAVING_ORIGINAL_PATH, SAVING_TOKENS_PATH, SAVING_ENTITY_PATH = (
    f'{SAVING_FOLDER}/test_{variant}_10b-testset4.json'
    for variant in ('original', 'tokens', 'entity')
)

## Loading the BM25 questions

In [6]:
# Load the BM25 test-set JSON; later cells read its 'questions' entry.
questions = load_json(BM25_QUESTIONS)

## Extracting unique document info
Only documents with an abstract are kept.

Two dictionaries are produced: one cleaned and tokenized, the other only cleaned.

In [7]:
# Collect document info across all loaded questions.
# NOTE(review): per the section text only documents with an abstract are kept —
# confirm inside extract_unique_doc_info.
unique_docs = centroid_retrieval.extract_unique_doc_info(
    questions=questions['questions']
)

Extracting unique doc info: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [00:00<00:00, 34345.13it/s]


## Extracting tokens for valid documents

In [8]:
# Tokenized variant of the unique documents.
# n_jobs=16 is forwarded to the helper; presumably the number of parallel
# joblib workers — TODO confirm against docs_to_tokens.
tokenized_unique_docs = centroid_retrieval.docs_to_tokens(
    unique_docs=unique_docs,
    n_jobs=16
)

[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:    4.7s
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    4.7s
[Parallel(n_jobs=16)]: Done  29 tasks      | elapsed:    4.8s
[Parallel(n_jobs=16)]: Batch computation too fast (0.1810s.) Setting batch_size=2.
[Parallel(n_jobs=16)]: Done  40 tasks      | elapsed:    4.8s
[Parallel(n_jobs=16)]: Done  53 tasks      | elapsed:    4.8s
[Parallel(n_jobs=16)]: Done  66 tasks      | elapsed:    4.8s
[Parallel(n_jobs=16)]: Batch computation too fast (0.0286s.) Setting batch_size=4.
[Parallel(n_jobs=16)]: Done  91 tasks      | elapsed:    4.8s
[Parallel(n_jobs=16)]: Done 120 tasks      | elapsed:    4.8s
[Parallel(n_jobs=16)]: Batch computation too fast (0.0346s.) Setting batch_size=8.
[Parallel(n_jobs=16)]: Done 160 tasks      | elapsed:    4.9s
[Parallel(n_jobs=16)]: Done 225 tasks      | elapsed:    4.9s
[Parallel(n_jobs=16)]: Batch computation too fast (0.0

## Extracting entities for valid documents

In [9]:
# Entity-extracted variant of the same unique documents (input is `unique_docs`,
# not the tokenized version).
# n_jobs=16 is forwarded to the helper; presumably the number of parallel
# joblib workers — TODO confirm against docs_to_entities.
entitized_unique_docs = centroid_retrieval.docs_to_entities(
    unique_docs=unique_docs,
    n_jobs=16
)

[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Batch computation too fast (0.0692s.) Setting batch_size=2.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.3s
[Parallel(n_jobs=16)]: Done  29 tasks      | elapsed:    0.4s
[Parallel(n_jobs=16)]: Done  48 tasks      | elapsed:    0.8s
[Parallel(n_jobs=16)]: Done  74 tasks      | elapsed:    1.1s
[Parallel(n_jobs=16)]: Done 100 tasks      | elapsed:    1.4s
[Parallel(n_jobs=16)]: Done 130 tasks      | elapsed:    1.7s
[Parallel(n_jobs=16)]: Done 160 tasks      | elapsed:    2.0s
[Parallel(n_jobs=16)]: Done 194 tasks      | elapsed:    2.4s
[Parallel(n_jobs=16)]: Done 228 tasks      | elapsed:    2.9s
[Parallel(n_jobs=16)]: Done 266 tasks      | elapsed:    3.4s
[Parallel(n_jobs=16)]: Done 304 tasks      | elapsed:    3.9s
[Parallel(n_jobs=16)]: Done 346 tasks      | elapsed:    4.4s
[Parallel(n_jobs=16)]: Done 388 ta

## Tokenize question body and replace documents with tokenized documents

In [10]:
# Build the tokenized variant of the question set from the raw questions and
# the tokenized documents. The helper returns two values: the processed
# questions and `question_solving_doc_ids`.
tokenized_questions, question_solving_doc_ids = centroid_retrieval.select_questions_useful_documents(
    questions=questions['questions'],
    unique_docs=tokenized_unique_docs
)

Selecting useful documents: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [00:00<00:00, 4331.47it/s]


In [11]:
# Wrap the processed questions under a top-level 'questions' key, the same
# layout the raw `questions` object is accessed with (`questions['questions']`).
# Guarded so that re-running this cell does not nest the wrapper a second time.
if not (isinstance(tokenized_questions, dict) and set(tokenized_questions) == {'questions'}):
    tokenized_questions = {'questions': tokenized_questions}

## (Graph entity dict) Tokenize question body and replace documents with entity extracted documents

In [12]:
# Build the entity ("graph") variant of the question set from the raw questions
# and the entity-extracted documents.
# The second return value gets its own name so it does not overwrite the
# `question_solving_doc_ids` produced for the tokenized documents, which a
# later cell passes together with `tokenized_unique_docs`.
graph_questions, graph_question_solving_doc_ids = centroid_retrieval.select_questions_useful_documents(
    questions=questions['questions'],
    unique_docs=entitized_unique_docs
)

Selecting useful documents: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [00:00<00:00, 4224.44it/s]


In [13]:
# Wrap the processed questions under a top-level 'questions' key, the same
# layout the raw `questions` object is accessed with (`questions['questions']`).
# Guarded so that re-running this cell does not nest the wrapper a second time.
if not (isinstance(graph_questions, dict) and set(graph_questions) == {'questions'}):
    graph_questions = {'questions': graph_questions}

## Extracting unique tokenized abstracts and titles for word2vec

In [14]:
# Split the tokenized documents into two mappings, one for abstracts and one
# for titles (both are read through `.values()` in the cells below).
# NOTE(review): how `question_solving_doc_ids` filters the documents is decided
# inside the helper — confirm there.
unique_abstract_tokens, unique_title_tokens = centroid_retrieval.extract_unique_titles_and_abstracts(
    tokenized_unique_docs=tokenized_unique_docs,
    question_solving_doc_ids=question_solving_doc_ids
)

In [15]:
# Sanity check: number of entries in the abstract and title mappings.
(
    len(unique_abstract_tokens),
    len(unique_title_tokens),
)

(7098, 7098)

In [16]:
# Sanity check: length of the first entry in each mapping.
# next(iter(...)) reads that single value without first materializing every
# value of the mapping into a temporary list.
(
    len(next(iter(unique_abstract_tokens.values()))),
    len(next(iter(unique_title_tokens.values()))),
)

(165, 11)

## Extracting unique tokenized questions for word2vec

In [17]:
# Extract the unique questions from the wrapped tokenized question set.
# The result is used as a mapping below (`len(...)`, `.values()`).
unique_question_tokens = centroid_retrieval.extract_unique_questions(
    tokenized_questions['questions']
)

In [18]:
# Sanity check: number of entries in the unique-questions mapping.
len(unique_question_tokens)

90

In [19]:
# Sanity check: length of the first entry in the unique-questions mapping.
# next(iter(...)) reads that single value without first materializing every
# value of the mapping into a temporary list.
len(next(iter(unique_question_tokens.values())))

10

## Sorting scores in descending order
### Building an artificial `question_scores` dict

In [20]:
# Two-level lookup: question id -> {document id -> score}, taken from the
# documents attached to each tokenized question.
question_scores = dict(
    (
        question['id'],
        dict((doc['id'], doc['score']) for doc in question['documents']),
    )
    for question in tokenized_questions['questions']
)

### Original dict (for Andres model)

In [21]:
# Apply `question_scores` to the raw (original-format) questions.
# The return value is discarded, so the helper presumably updates
# `questions['questions']` in place — TODO confirm.
centroid_retrieval.update_question_scores_from_raw_data(
    raw_questions=questions['questions'],
    question_scores=question_scores
)

Updating dictionary with centroid scores: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [00:00<00:00, 15588.99it/s]


### Tokenized dict

In [22]:
# Apply `question_scores` to the tokenized questions.
# The return value is discarded, so the helper presumably updates
# `tokenized_questions['questions']` in place — TODO confirm.
centroid_retrieval.update_question_scores(
    raw_questions=tokenized_questions['questions'],
    question_scores=question_scores
)

Updating dictionary with centroid scores: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [00:00<00:00, 26201.66it/s]


### Graph dict (only tokens and entities)

In [23]:
# Apply the same `question_scores` (built from the tokenized questions) to the
# entity/graph questions.
# The return value is discarded, so the helper presumably updates
# `graph_questions['questions']` in place — TODO confirm.
centroid_retrieval.update_question_scores(
    raw_questions=graph_questions['questions'],
    question_scores=question_scores
)

Updating dictionary with centroid scores: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [00:00<00:00, 19111.35it/s]


## Saving into disk

In [24]:
# Write the original-format question set to SAVING_ORIGINAL_PATH.
save_json(questions, SAVING_ORIGINAL_PATH)

In [25]:
# Write the tokenized question set to SAVING_TOKENS_PATH.
save_json(tokenized_questions, SAVING_TOKENS_PATH)

In [26]:
# Write the entity/graph question set to SAVING_ENTITY_PATH.
save_json(graph_questions, SAVING_ENTITY_PATH)

In [27]:
# Sanity check: fields present on the first document of the first question.
questions['questions'][0]['documents'][0].keys()

dict_keys(['id', 'score', 'title', 'abstract', 'mesh_terms', 'origin'])