## Imports

In [1]:
%load_ext autoreload
%autoreload 2

from src.elastic_search_utils.elastic_utils import load_json, save_json
from src.cemb_bm25.centroid_retrieval import load_bio_w2vec_model
from src.graphs import graphs_embedding

## Parameters

In [2]:
# Which data split this run processes; becomes the last component of the output folder
DATA_NATURE = 'test'

# Path of the cleaned entity dictionary (JSON) that is loaded as the questions
ENTITY_DICTIONARY = '/datasets/johan_tests_original_format_centroid/merged_training_docs/test_entity_10b-testset3.json'

In [3]:
# Directory that holds the Bio Word2Vec model files referenced below
LOADING_FOLDER = '/datasets/johan_tests_original_format_centroid/merged_training_docs'

# One model file per text field (question, abstract, title)
LOADING_QUESTION_W2V_PATH = f'{LOADING_FOLDER}/Bio_Word2Vec_doc_question_model_10b_train.bin'
LOADING_ABSTRACT_W2V_PATH = f'{LOADING_FOLDER}/Bio_Word2Vec_doc_abstract_model_10b_train.bin'
LOADING_TITLE_W2V_PATH = f'{LOADING_FOLDER}/Bio_Word2Vec_doc_title_model_10b_train.bin'

In [4]:
# Root directory under which the run-specific output folder (SAVING_FOLDER) is placed
BASE_SAVING_FOLDER = '/datasets/johan_tests_original_format_graphs'

In [5]:
# Upper bounds on the graph size: entities and documents per graph
MAX_ENTITIES_GRAPH = 20
MAX_DOCUMENTS_GRAPH = 100

# Shape of the similarity matrix, ordered as (documents, entities)
SIMILARITY_SHAPE = MAX_DOCUMENTS_GRAPH, MAX_ENTITIES_GRAPH

In [6]:
# Threshold on the document score used to decide document relevance.
# Forwarded as `score_threshold` to graphs_embedding.make_all_question_graphs
# and encoded in the output folder name.
# NOTE(review): how the threshold is compared (>, >=) is decided inside graphs_embedding — confirm there.
SCORE_THRESHOLD = 0.06

In [7]:
# Relevance value forwarded as `similarity_relevance` to
# graphs_embedding.make_all_question_graphs and encoded in the output folder name.
# NOTE(review): presumably the similarity assigned to (or required of) relevant items — confirm in graphs_embedding.
SIMILARITY_RELEVANCE = 0.7

In [8]:
# Name of the run-specific directory: the graph parameters are encoded in it.
# Dots are stripped from this name only (0.06 -> "006", 0.7 -> "07"); running the
# replacement over the complete path would also remove dots that legitimately
# belong to BASE_SAVING_FOLDER or DATA_NATURE.
RUN_FOLDER_NAME = (
    f'similarity_shape_{MAX_DOCUMENTS_GRAPH}_{MAX_ENTITIES_GRAPH}__'
    f'score_threshold_{SCORE_THRESHOLD}__'
    f'similarity_relevance_{SIMILARITY_RELEVANCE}'
).replace('.', '')

# Full output path: <base folder>/<run folder>/<data nature>
SAVING_FOLDER = f'{BASE_SAVING_FOLDER}/{RUN_FOLDER_NAME}/{DATA_NATURE}'
SAVING_FOLDER

'/datasets/johan_tests_original_format_graphs/similarity_shape_100_20__score_threshold_006__similarity_relevance_07/test'

## Load questions and Word2Vec models

In [9]:
# Load the entity dictionary JSON; it is passed as `questions` to the graph builder below
questions = load_json(ENTITY_DICTIONARY)

In [10]:
# Word2Vec model for the question text, loaded from LOADING_QUESTION_W2V_PATH
question_w2vec_model = load_bio_w2vec_model(LOADING_QUESTION_W2V_PATH)

In [11]:
# Word2Vec model for the document abstracts, loaded from LOADING_ABSTRACT_W2V_PATH
abstract_w2vec_model = load_bio_w2vec_model(LOADING_ABSTRACT_W2V_PATH)

In [12]:
# Word2Vec model for the document titles, loaded from LOADING_TITLE_W2V_PATH
title_w2vec_model = load_bio_w2vec_model(LOADING_TITLE_W2V_PATH)

In [13]:
# Bundle the three Word2Vec models under the text field each one belongs to
models = dict(
    question=question_w2vec_model,
    abstract=abstract_w2vec_model,
    title=title_w2vec_model,
)

### Build the question graphs and store them in the saving folder

In [14]:
# Build the graphs for all loaded questions and store them under SAVING_FOLDER,
# using the thresholds, matrix shape and Word2Vec models configured above.
# NOTE(review): whether SAVING_FOLDER must already exist is decided inside graphs_embedding — confirm.
graphs_embedding.make_all_question_graphs(
    questions=questions,
    score_threshold=SCORE_THRESHOLD,
    similarity_relevance=SIMILARITY_RELEVANCE,
    similarity_shape=SIMILARITY_SHAPE,
    models=models,
    saving_path=SAVING_FOLDER
)

Storing question graphs to /datasets/johan_tests_original_format_graphs/similarity_shape_100_20__score_threshold_006__similarity_relevance_07/test: 100%|██████████████████████████| 90/90 [01:08<00:00,  1.31it/s]
