# Test retrieval from centroid + BM25
This notebook uses the centroid retrieval script, starting from BM25 results, to retrieve a set of candidate documents that can be passed to downstream models or even used directly for snippet retrieval.

## Imports

In [1]:
%load_ext autoreload
%autoreload 2

import logging
from pprint import pprint

# Notebook-level logger with its threshold set to INFO.
# NOTE(review): no handler is configured in this cell and `logger` is not used
# in the visible cells — confirm whether it is still needed.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

from src.cemb_bm25 import centroid_retrieval
from src.elastic_search_utils.elastic_utils import load_json, save_json

## Params

In [2]:
# JSON file with the question set to process; it is read with load_json and
# accessed through its 'questions' key in the cells below.
BM25_QUESTIONS = '/datasets/johan_tests_original_format/test_docs_10b.json'

# Weights passed to calculate_centroid_score as abstract_weight / title_weight.
ABSTRACT_WEIGHT = 0.7
TITLE_WEIGHT = 0.3

In [3]:
# Folder containing the saved word2vec model files loaded later in this notebook.
LOADING_FOLDER = '/datasets/johan_tests_original_format_centroid/queried_training_docs'

# One model file per text field: abstracts, titles and questions.
# presumably trained on the training split (file names end in '_train') — TODO confirm
LOADING_ABSTRACT_W2V_PATH = f'{LOADING_FOLDER}/Bio_Word2Vec_doc_abstract_model_10b_train.bin'
LOADING_TITLE_W2V_PATH = f'{LOADING_FOLDER}/Bio_Word2Vec_doc_title_model_10b_train.bin'
LOADING_QUESTION_W2V_PATH = f'{LOADING_FOLDER}/Bio_Word2Vec_doc_question_model_10b_train.bin'

## Saving paths

In [4]:
# Output folder for the three JSON files written at the end of the notebook.
# NOTE(review): this is the same directory as LOADING_FOLDER and is named
# 'queried_training_docs' although the files saved here are test_* — confirm
# this is intended.
SAVING_FOLDER = '/datasets/johan_tests_original_format_centroid/queried_training_docs'

# Destinations for the original-format, tokenized and cleaned question sets.
SAVING_ORIGINAL_PATH = f'{SAVING_FOLDER}/test_original_10b.json'
SAVING_TOKENS_PATH = f'{SAVING_FOLDER}/test_tokens_10b.json'
SAVING_CLEANED_PATH = f'{SAVING_FOLDER}/test_cleaned_10b.json'

## Loading questions

In [5]:
# Load the question set from the BM25_QUESTIONS JSON file; later cells read
# its 'questions' entry.
questions = load_json(BM25_QUESTIONS)

## Extracting tokens for valid documents
Only documents with an abstract will be kept.

Two dictionaries are produced: one cleaned and tokenized, the other only cleaned.

In [6]:
# Build two document lookups from the raw questions: the first is used below
# as the tokenized variant (use_tokens=True), the second as the cleaned-only
# variant (use_tokens=False).
# NOTE(review): any filtering of documents without an abstract happens inside
# extract_unique_doc_info and is not visible from this notebook.
tokenized_unique_docs, cleaned_unique_docs = centroid_retrieval.extract_unique_doc_info(
    questions=questions['questions']
)

Extracting unique doc info: 100%|███████████████| 90/90 [00:04<00:00, 22.44it/s]


## Tokenize question body and replace documents with tokenized documents

In [7]:
# Tokenized variant: pair the raw questions with the tokenized document lookup
# (use_tokens=True). The second return value, question_solving_doc_ids, is
# passed later to extract_unique_titles_and_abstracts.
tokenized_questions, question_solving_doc_ids = centroid_retrieval.select_questions_useful_documents(
    questions=questions['questions'],
    unique_docs=tokenized_unique_docs,
    use_tokens=True
)

Selecting useful documents: 100%|█████████████| 90/90 [00:00<00:00, 5965.92it/s]


In [8]:
# Re-wrap the result under a 'questions' key so it is accessed (and later
# saved) the same way as the loaded `questions` object.
tokenized_questions = {'questions': tokenized_questions}

## Clean question body and replace documents with cleaned documents

In [9]:
# Cleaned variant: same call as for the tokenized questions, but with the
# cleaned document lookup and use_tokens=False. The second return value is
# discarded here.
cleaned_questions, _ = centroid_retrieval.select_questions_useful_documents(
    questions=questions['questions'],
    unique_docs=cleaned_unique_docs,
    use_tokens=False
)

Selecting useful documents: 100%|█████████████| 90/90 [00:00<00:00, 7629.86it/s]


In [10]:
# Re-wrap the result under a 'questions' key so it is accessed (and later
# saved) the same way as the loaded `questions` object.
cleaned_questions = {'questions': cleaned_questions}

## Extracting unique tokenized abstracts and titles for word2vec

In [11]:
# Collect the abstract tokens and the title tokens that feed the centroid
# computation further down.
# presumably both mappings are keyed by document id — TODO confirm
unique_abstract_tokens, unique_title_tokens = centroid_retrieval.extract_unique_titles_and_abstracts(
    tokenized_unique_docs=tokenized_unique_docs,
    question_solving_doc_ids=question_solving_doc_ids
)

In [12]:
# Sanity check: number of entries in the abstract and title token mappings.
len(unique_abstract_tokens), len(unique_title_tokens)

(7477, 7477)

In [13]:
# Sanity check: token counts of the first abstract and the first title.
# next(iter(...)) reads the first value directly instead of copying every
# value into a temporary list just to index element 0.
(
    len(next(iter(unique_abstract_tokens.values()))),
    len(next(iter(unique_title_tokens.values()))),
)

(207, 14)

## Extracting unique tokenized questions for word2vec

In [14]:
# Collect the tokens of each unique question from the tokenized question set;
# the result is the input for the question centroids below.
unique_question_tokens = centroid_retrieval.extract_unique_questions(
    tokenized_questions['questions']
)

In [15]:
# Sanity check: number of entries in the question token mapping.
len(unique_question_tokens)

90

In [16]:
# Sanity check: token count of the first question.
# next(iter(...)) reads the first value directly instead of copying every
# value into a temporary list just to index element 0.
len(next(iter(unique_question_tokens.values())))

15

## Loading word2vec models for questions, titles and abstracts

In [17]:
# Load the model stored at LOADING_QUESTION_W2V_PATH; used for the question centroids.
question_w2vec_model = centroid_retrieval.load_bio_w2vec_model(LOADING_QUESTION_W2V_PATH)

In [18]:
# Load the model stored at LOADING_ABSTRACT_W2V_PATH; used for the abstract centroids.
abstract_w2vec_model = centroid_retrieval.load_bio_w2vec_model(LOADING_ABSTRACT_W2V_PATH)

In [19]:
# Load the model stored at LOADING_TITLE_W2V_PATH; used for the title centroids.
title_w2vec_model = centroid_retrieval.load_bio_w2vec_model(LOADING_TITLE_W2V_PATH)

## Calculating centroids

In [20]:
# Compute centroids for the question tokens with the question model.
question_centroids = centroid_retrieval.calculate_centroids_test(
    text_tokens=unique_question_tokens,
    model=question_w2vec_model
)

Extracting centroids: 100%|██████████████████| 90/90 [00:00<00:00, 30795.18it/s]


In [21]:
# Compute centroids for the abstract tokens with the abstract model.
abstract_centroids = centroid_retrieval.calculate_centroids_test(
    text_tokens=unique_abstract_tokens,
    model=abstract_w2vec_model
)

Extracting centroids: 100%|███████████████| 7477/7477 [00:01<00:00, 4921.83it/s]


In [22]:
# Compute centroids for the title tokens with the title model.
title_centroids = centroid_retrieval.calculate_centroids_test(
    text_tokens=unique_title_tokens,
    model=title_w2vec_model
)

Extracting centroids: 100%|██████████████| 7477/7477 [00:00<00:00, 38293.37it/s]


## Calculating question cosine similarities to answers

In [23]:
# Compare each question centroid against the abstract and title centroids.
# The result is read through its 'questions' key in the next step.
# NOTE(review): the recorded run printed a warning pointing at
# `cosine_similarity = projection / normalization` — presumably a zero or
# invalid normalization for some entry; confirm the similarities contain no NaN.
question_similarities = centroid_retrieval.calculate_question_answer_similarity(
    tokenized_questions=tokenized_questions['questions'],
    question_centroids=question_centroids,
    abstract_centroids=abstract_centroids,
    title_centroids=title_centroids
)

  cosine_similarity = projection / normalization
Calculating cosine similarity: 100%|███████████| 90/90 [00:00<00:00, 529.54it/s]


## Calculating document scores for questions

In [24]:
# Turn the similarities into document scores using the abstract and title
# weights defined in the Params cell.
# presumably a weighted combination of the two similarities — TODO confirm
question_scores = centroid_retrieval.calculate_centroid_score(
    questions_similarities=question_similarities['questions'],
    abstract_weight=ABSTRACT_WEIGHT,
    title_weight=TITLE_WEIGHT
)

Calculating centroid distance: 100%|█████████| 90/90 [00:00<00:00, 14448.72it/s]


## Selecting useful documents only from original question dictionaries
### Original dict (for Andres model)

In [25]:
# Add the centroid scores to the original-format questions.
# The return value is not captured, so the update is presumably done in place
# on questions['questions'] (the same object is saved below) — TODO confirm.
centroid_retrieval.update_question_scores_from_raw_data(
    raw_questions=questions['questions'],
    question_scores=question_scores
)

Updating dictionary with centroid scores: 100%|█| 90/90 [00:00<00:00, 24142.19it


In [26]:
# Add the centroid scores to the tokenized questions.
# The return value is not captured, so the update is presumably done in place
# on tokenized_questions['questions'] — TODO confirm.
centroid_retrieval.update_question_scores_from_raw_data(
    raw_questions=tokenized_questions['questions'],
    question_scores=question_scores
)

Updating dictionary with centroid scores: 100%|█| 90/90 [00:00<00:00, 22513.71it


In [27]:
# Add the centroid scores to the cleaned questions.
# The return value is not captured, so the update is presumably done in place
# on cleaned_questions['questions'] — TODO confirm.
centroid_retrieval.update_question_scores_from_raw_data(
    raw_questions=cleaned_questions['questions'],
    question_scores=question_scores
)

Updating dictionary with centroid scores: 100%|█| 90/90 [00:00<00:00, 18506.10it


## Saving into disk

In [28]:
# Write the original-format question set to SAVING_ORIGINAL_PATH.
save_json(questions, SAVING_ORIGINAL_PATH)

In [29]:
# Write the tokenized question set to SAVING_TOKENS_PATH.
save_json(tokenized_questions, SAVING_TOKENS_PATH)

In [30]:
# Write the cleaned question set to SAVING_CLEANED_PATH.
save_json(cleaned_questions, SAVING_CLEANED_PATH)