# Test retrieval from BM25
This notebook uses BM25 to retrieve a set of candidate documents that can be fed to downstream models or even used directly for snippet retrieval.

## Imports

In [1]:
%load_ext autoreload
%autoreload 2

import copy
import logging
import sys
from pprint import pprint

# Notebook-level logger with its own threshold set to INFO.
# NOTE(review): no handler is configured in this cell, so INFO records may not
# be displayed unless logging is configured elsewhere — confirm.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

from src.cemb_bm25 import centroid_retrieval
from src.elastic_search_utils.elastic_utils import load_json, save_json

In [2]:
# Show the interpreter that is actually running this kernel. A shell
# `which python` only reports the first `python` on PATH, which can differ
# from the kernel's environment and is not available on every platform.
sys.executable

/datasets/anaconda3/envs/tf2.8/bin/python


## Params

In [9]:
# Input JSON read with load_json further down; its 'questions' entry is what
# the rest of the notebook processes.
BM25_QUESTIONS = '/datasets/johan_tests_original_format/test_docs_10b-testset4.json'

In [10]:
# Folder that holds the Word2Vec model files listed below.
LOADING_FOLDER = '/datasets/johan_tests_original_format/merged_training_docs'

# One model file per text field (question, title, abstract).
# NOTE(review): none of these three paths is referenced by the later cells
# visible in this notebook — confirm whether they are still needed.
LOADING_QUESTION_W2V_PATH = f'{LOADING_FOLDER}/Bio_Word2Vec_doc_question_model_10b_train.bin'
LOADING_TITLE_W2V_PATH = f'{LOADING_FOLDER}/Bio_Word2Vec_doc_title_model_10b_train.bin'
LOADING_ABSTRACT_W2V_PATH = f'{LOADING_FOLDER}/Bio_Word2Vec_doc_abstract_model_10b_train.bin'

## Saving paths

In [13]:
# Folder that receives the three JSON files written at the end of the notebook.
SAVING_FOLDER = '/datasets/johan_tests_original_format/merged_training_docs'

# Test-set identifier shared by every output file name. Keeping it in one
# place means switching test sets cannot leave the three names out of sync.
TESTSET_NAME = '10b-testset4'

SAVING_ORIGINAL_PATH = f'{SAVING_FOLDER}/test_original_{TESTSET_NAME}.json'
SAVING_TOKENS_PATH = f'{SAVING_FOLDER}/test_tokens_{TESTSET_NAME}.json'
SAVING_ENTITY_PATH = f'{SAVING_FOLDER}/test_entity_{TESTSET_NAME}.json'

## Loading questions

In [14]:
# Load the questions file; later cells read the entries under questions['questions'].
questions = load_json(BM25_QUESTIONS)

## Extracting unique document info
Only documents with an abstract are kept.

Two dictionaries are produced: one cleaned and tokenized, the other only cleaned.

In [15]:
# Collect the document info attached to the loaded questions.
# (The exact filtering rules live in extract_unique_doc_info.)
unique_docs = centroid_retrieval.extract_unique_doc_info(questions=questions['questions'])

Extracting unique doc info: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [00:00<00:00, 44673.06it/s]


## Extracting tokens for valid documents

In [16]:
# Run docs_to_tokens over the unique documents, asking for 16 parallel jobs.
tokenized_unique_docs = centroid_retrieval.docs_to_tokens(unique_docs=unique_docs, n_jobs=16)

[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:    4.7s
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    4.8s
[Parallel(n_jobs=16)]: Done  29 tasks      | elapsed:    4.8s
[Parallel(n_jobs=16)]: Batch computation too fast (0.1680s.) Setting batch_size=2.
[Parallel(n_jobs=16)]: Done  40 tasks      | elapsed:    4.8s
[Parallel(n_jobs=16)]: Done  53 tasks      | elapsed:    4.8s
[Parallel(n_jobs=16)]: Done  66 tasks      | elapsed:    4.8s
[Parallel(n_jobs=16)]: Batch computation too fast (0.0116s.) Setting batch_size=4.
[Parallel(n_jobs=16)]: Done  90 tasks      | elapsed:    4.8s
[Parallel(n_jobs=16)]: Done 119 tasks      | elapsed:    4.8s
[Parallel(n_jobs=16)]: Batch computation too fast (0.0408s.) Setting batch_size=8.
[Parallel(n_jobs=16)]: Done 163 tasks      | elapsed:    4.9s
[Parallel(n_jobs=16)]: Done 219 tasks      | elapsed:    4.9s
[Parallel(n_jobs=16)]: Batch computation too fast (0.0

## Extracting entities for valid documents

In [18]:
# Run docs_to_entities over the same unique documents, again with 16 parallel jobs.
entitized_unique_docs = centroid_retrieval.docs_to_entities(unique_docs=unique_docs, n_jobs=16)

[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:    5.0s
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    5.2s
[Parallel(n_jobs=16)]: Done  29 tasks      | elapsed:    5.4s
[Parallel(n_jobs=16)]: Done  40 tasks      | elapsed:    5.6s
[Parallel(n_jobs=16)]: Done  53 tasks      | elapsed:    5.7s
[Parallel(n_jobs=16)]: Done  66 tasks      | elapsed:    5.9s
[Parallel(n_jobs=16)]: Done  81 tasks      | elapsed:    6.1s
[Parallel(n_jobs=16)]: Done  96 tasks      | elapsed:    6.2s
[Parallel(n_jobs=16)]: Done 113 tasks      | elapsed:    6.4s
[Parallel(n_jobs=16)]: Done 130 tasks      | elapsed:    6.6s
[Parallel(n_jobs=16)]: Done 149 tasks      | elapsed:    6.8s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    7.0s
[Parallel(n_jobs=16)]: Done 189 tasks      | elapsed:    7.2s
[Parallel(n_jobs=16)]: Done 210 tasks      | elapsed:    7.5s
[Parallel(n_jobs=16)]: Done 233 tasks      | elapsed:  

## Tokenize question body and replace documents with tokenized documents

In [19]:
# Combine the raw questions with the tokenized documents. The helper returns a
# pair: the processed questions and (presumably) the ids of the documents it
# selected — confirm against select_questions_useful_documents.
(
    tokenized_questions,
    question_solving_doc_ids,
) = centroid_retrieval.select_questions_useful_documents(
    questions=questions['questions'],
    unique_docs=tokenized_unique_docs,
)

Selecting useful documents: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [00:00<00:00, 3460.77it/s]


In [20]:
# Wrap the processed questions under a 'questions' key, the same top-level
# layout as the `questions` dict loaded from disk. The isinstance guard keeps
# this cell idempotent: re-running it must not nest the wrapper a second time,
# which would break every later `tokenized_questions['questions']` lookup.
if not isinstance(tokenized_questions, dict):
    tokenized_questions = {'questions': tokenized_questions}

## (Graph entity dict) Tokenize question body and replace documents with entity-extracted documents

In [21]:
# Same selection as for the tokenized documents, but fed with the
# entity-extracted documents. The second return value gets its own name so this
# cell no longer overwrites `question_solving_doc_ids` from the tokenized run,
# which a later cell pairs with `tokenized_unique_docs`.
graph_questions, graph_question_solving_doc_ids = centroid_retrieval.select_questions_useful_documents(
    questions=questions['questions'],
    unique_docs=entitized_unique_docs
)

Selecting useful documents: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [00:00<00:00, 4402.24it/s]


In [22]:
# Wrap the processed questions under a 'questions' key, the same top-level
# layout as the `questions` dict loaded from disk. The isinstance guard keeps
# this cell idempotent: re-running it must not nest the wrapper a second time,
# which would break every later `graph_questions['questions']` lookup.
if not isinstance(graph_questions, dict):
    graph_questions = {'questions': graph_questions}

## Extracting unique tokenized abstracts and titles for word2vec

In [23]:
# Split the tokenized documents into two collections, abstracts and titles,
# restricted by the document ids passed as `question_solving_doc_ids`
# (exact restriction rule is defined in the helper).
(
    unique_abstract_tokens,
    unique_title_tokens,
) = centroid_retrieval.extract_unique_titles_and_abstracts(
    tokenized_unique_docs=tokenized_unique_docs,
    question_solving_doc_ids=question_solving_doc_ids,
)

In [24]:
# Sanity check: number of entries in the abstract and title collections.
len(unique_abstract_tokens), len(unique_title_tokens)

(7098, 7098)

In [25]:
# Sanity check: length of the first abstract entry and of the first title entry.
# next(iter(...)) reads the first value directly instead of copying every
# value into a temporary list just to index element 0.
len(next(iter(unique_abstract_tokens.values()))), len(next(iter(unique_title_tokens.values())))

(165, 11)

## Extracting unique tokenized questions for word2vec

In [26]:
# Build the question-level collection from the wrapped tokenized questions.
unique_question_tokens = centroid_retrieval.extract_unique_questions(tokenized_questions['questions'])

In [27]:
# Sanity check: number of entries in the question collection.
len(unique_question_tokens)

90

In [28]:
# Sanity check: length of the first question entry.
# next(iter(...)) reads the first value directly instead of copying every
# value into a temporary list just to index element 0.
len(next(iter(unique_question_tokens.values())))

10

## Sorting scores in descending order
### Building an artificial `question_scores` dict

In [32]:
def _scores_by_doc_id(question):
    """Map each document id of one question to that document's 'score' value."""
    return {doc['id']: doc['score'] for doc in question['documents']}


# Nested lookup: question id -> {document id -> score}, built from the scores
# already stored on the documents of the tokenized questions.
question_scores = {
    question['id']: _scores_by_doc_id(question)
    for question in tokenized_questions['questions']
}

### Original dict (for Andres model)

In [33]:
# Apply `question_scores` to the original-format questions.
# The return value is discarded, so the helper is presumably relied on to
# modify `questions['questions']` in place — confirm against its source.
centroid_retrieval.update_question_scores_from_raw_data(
    raw_questions=questions['questions'], question_scores=question_scores,
)

Updating dictionary with centroid scores: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [00:00<00:00, 17288.18it/s]


### Tokenized dict

In [34]:
# Apply `question_scores` to the tokenized questions.
# The return value is discarded, so the helper is presumably relied on to
# modify `tokenized_questions['questions']` in place — confirm against its source.
centroid_retrieval.update_question_scores(
    raw_questions=tokenized_questions['questions'], question_scores=question_scores,
)

Updating dictionary with centroid scores: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [00:00<00:00, 26967.24it/s]


### Graph dict (only tokens and entities)

In [35]:
# Apply `question_scores` to the entity (graph) questions.
# The return value is discarded, so the helper is presumably relied on to
# modify `graph_questions['questions']` in place — confirm against its source.
centroid_retrieval.update_question_scores(
    raw_questions=graph_questions['questions'], question_scores=question_scores,
)

Updating dictionary with centroid scores: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [00:00<00:00, 22633.85it/s]


## Saving to disk

In [37]:
# Write the original-format questions dict to SAVING_ORIGINAL_PATH.
save_json(questions, SAVING_ORIGINAL_PATH)

In [38]:
# Write the tokenized questions dict to SAVING_TOKENS_PATH.
save_json(tokenized_questions, SAVING_TOKENS_PATH)

In [39]:
# Write the entity (graph) questions dict to SAVING_ENTITY_PATH.
save_json(graph_questions, SAVING_ENTITY_PATH)