In [1]:
from helpers.Data import load_queries
from config import GlobalConfig
import torch
from transformers import AutoTokenizer


# Single configuration object; every setting below is read from it.
cfg = GlobalConfig()

# Tokenisation settings copied out of the config into notebook-level names.
tokenize_batch_size = cfg.tokenize.tokenize_batch_size
dictionary_max_length = cfg.tokenize.dictionary_max_length
queries_max_length = cfg.tokenize.queries_max_length
dictionary_max_chars_length = cfg.tokenize.dictionary_max_chars_length

# Marker tokens placed around a mention (they show up as "[MS]" / "[ME]" in
# the query and dictionary samples displayed further down).
mention_start_special_token = cfg.tokenize.special_tokens_dict["mention_start"]
mention_end_special_token = cfg.tokenize.special_tokens_dict["mention_end"]

# Pick the GPU when one is visible to torch, otherwise stay on the CPU.
use_cuda = torch.cuda.is_available()
device = "cuda" if use_cuda else "cpu"

# Fast tokenizer for the configured model, extended with the project's
# special tokens.
tokenizer = AutoTokenizer.from_pretrained(cfg.model.model_name, use_fast=True)
tokenizer.add_special_tokens(cfg.tokenize.special_tokens)

# Load the raw training queries; the marker tokens and the tokenizer are
# handed to the project loader.
train_queries = load_queries(
    data_dir=cfg.paths.queries_raw_dir,
    queries_max_length=queries_max_length,
    special_token_start=mention_start_special_token,
    special_token_end=mention_end_special_token,
    tokenizer=tokenizer,
)

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 691/691 [00:16<00:00, 41.78it/s]

annotation_skipped: 13





In [5]:
# Peek at five loaded queries (positions 30-34) to eyeball their structure.
train_queries[30:35]

[('neurodegeneration',
  'MESH:D019636',
  'Our findings suggest that subtle differences in the course of the [MS] neurodegeneration [ME] in HD may allow interacting genes to exert gender specific effects upon AO',
  './data/raw/traindev/10051007.txt',
  1051,
  1068),
 ('HD',
  'MESH:D006816',
  'Our findings suggest that subtle differences in the course of the neurodegeneration in [MS] HD [ME] may allow interacting genes to exert gender specific effects upon AO',
  './data/raw/traindev/10051007.txt',
  1073,
  1075),
 ('Familial deficiency of the seventh component of complement',
  'MESH:C566443',
  '[MS] Familial deficiency of the seventh component of complement [ME] associated with recurrent bacteremic infections due to Neisseria',
  './data/raw/traindev/100562.txt',
  0,
  58),
 ('bacteremic infections due to Neisseria',
  'MESH:D016870',
  'Familial deficiency of the seventh component of complement associated with recurrent [MS] bacteremic infections due to Neisseria [ME]',
  './

In [9]:
# Inspect one query by position; the CUI shown in its second field is the one
# searched for in the dictionary in a later cell.
train_queries[4120]

('ovarian cancers',
 'MESH:D010051',
 'To study the potential role of BRCA1 in sporadic carcinogenesis, we analysed the genomic DNA of tumour and normal fractions of 47 [MS] ovarian cancers [ME] for mutations in BRCA1 using the single-strand conformation polymorphism technique',
 './data/raw/traindev/7795652.txt',
 398,
 413)

In [6]:
from helpers.Data import load_dictionary
# Same expression as in the setup cell; re-bound here before the call.
dictionary_max_chars_length = cfg.tokenize.dictionary_max_chars_length

# The loader returns three values; only the CUIs and the annotated names are
# kept, the first one is discarded.
_, dictionary_cuis, dictionary_names_annotated = load_dictionary(
    cfg.paths.dictionary_raw_path,
    special_token_start=mention_start_special_token,
    special_token_end=mention_end_special_token,
    dictionary_max_chars_length=dictionary_max_chars_length,
    add_synonyms=True,
)


pre process dictionary:   0%|          | 0/90599 [00:00<?, ?it/s]

pre process dictionary: 100%|██████████| 90599/90599 [00:00<00:00, 872575.73it/s]
annotating dictionary: 100%|██████████| 90510/90510 [00:00<00:00, 184978.00it/s]


In [12]:
# Find the first dictionary position whose CUI equals the one of the query
# inspected earlier (train_queries[4120]).
target_cui = 'MESH:D010051'
for idx, d in enumerate(dictionary_cuis):
    if d == target_cui:
        print("found: " , d, " idx: " , idx)
        break
else:
    # for/else: reached only when the loop ended without `break`, i.e. the
    # CUI is absent. Previously this case printed nothing at all.
    print("not found: ", target_cui)
    

found:  MESH:D010051  idx:  65041


In [8]:
# Peek at five annotated dictionary names (positions 40-44).
dictionary_names_annotated[40:45]

['[MS] aldosterone deficiency due to defect in 18-hydroxylase [ME]  ; 18-hydroxylase deficiency ; cmo i deficiency ; aldosterone deficiency i ; steroid 18-oxidase deficiency ; hyperreninemic hypoaldosteronism, familial, type i',
 '[MS] aldosterone deficiency due to defect in steroid 18-hydroxylase [ME]  ; steroid 18-hydroxylase deficiency ; corticosterone methyloxidase type i deficiency ; aldosterone deficiency due to deficiency of steroid 18-oxidase ; corticosterone methyloxidase type ii deficiency ; fhha1b',
 '[MS] aldosterone deficiency due to deficiency of steroid 18-oxidase [ME]  ; cmo ii deficiency ; corticosterone methyloxidase type 1 deficiency ; fhha1b ; 18-hydroxylase deficiency ; aldosterone deficiency i',
 '[MS] aldosterone deficiency i [ME]  ; cmo i deficiency ; steroid 18-oxidase deficiency ; corticosterone methyloxidase type i deficiency ; 18-oxidase deficiency ; aldosterone deficiency ii',
 '[MS] aldosterone deficiency ii [ME]  ; hyperreninemic hypoaldosteronism, famili

In [111]:

# Inject up to `hard_positives_num` dictionary entries that share the query's
# CUI into the candidate row of `query_idx`.
# NOTE(review): the row is READ from `new_cands` but WRITTEN to `new_cands_2`
# (presumably a copy of `new_cands` made elsewhere — confirm).
query_idx = 1
hard_positives_num = 2
hard_negatives_num = 1  # not used in this cell
query_cui = queries_cuis[query_idx]
current_query_candidates_idxs = new_cands[query_idx].tolist()
current_candidates_cuis = dictionary_cuis[current_query_candidates_idxs]

# Positions inside the row that already hold a candidate with the query's CUI,
# and the remaining positions that may be overwritten.
positive_positions = np.where(current_candidates_cuis == query_cui)[0]
candidates_idxs_available = list(set(range(topk)) - set(positive_positions))

# Dictionary indices carrying the query's CUI that are not in the row yet.
positive_candidates_indexes = dictionary_cui_to_idx.get(query_cui, [])
available_positives = list(set(positive_candidates_indexes) - set(current_query_candidates_idxs))

# Cap by the requested count, the positives on offer AND the free slots.
# Without the last bound the value and index lengths can differ and the
# assignment below fails.
positive_n = min(
    hard_positives_num,
    len(available_positives),
    len(candidates_idxs_available),
)

if positive_n > 0:
    # Deterministic choice: the first `positive_n` entries of each list
    # (no random sampling happens in this cell).
    positive_candidates = np.array(available_positives[:positive_n])
    candidates_idxs_to_be_replaced = candidates_idxs_available[:positive_n]
    new_cands_2[query_idx, candidates_idxs_to_be_replaced] = torch.from_numpy(positive_candidates)

new_cands_2[query_idx]

tensor([ 3, 11,  6,  1, 12])

In [112]:
import torch

# Array-based variant of the positive injection: writes into `new_cands_1`
# while reading the current row from `new_cands`. Relies on `query_idx` and
# `hard_positives_num` set by an earlier cell.
query_cui = queries_cuis[query_idx]
current_idxs = np.array(new_cands[query_idx])
current_cuis = dictionary_cuis[current_idxs]

# Row positions whose candidate does NOT carry the query's CUI; only these
# may be overwritten.
negative_mask = (current_cuis != query_cui)
available_positions = np.flatnonzero(negative_mask)

# Dictionary indices with the query's CUI that are not already in the row.
# np.setdiff1d returns the sorted unique values of its first argument that
# are absent from the second.
pos_dict_idxs = dictionary_cui_to_idx.get(query_cui, [])
available_pos_dict = np.setdiff1d(pos_dict_idxs, current_idxs, assume_unique=False)

pos_n = min(hard_positives_num,
            len(available_pos_dict),
            len(available_positions))
chosen_pos_dict = available_pos_dict[:pos_n]
chosen_slots = available_positions[:pos_n]

# Skip the write when there is nothing to inject. With the `[]` default above
# np.setdiff1d yields an empty float64 array, and handing that to an integer
# tensor is at best a no-op (torch may reject the dtype mismatch).
if pos_n > 0:
    new_cands_1[query_idx, chosen_slots] = torch.from_numpy(chosen_pos_dict)

# Positions still free after the injection (not read again in this cell).
available_positions = np.setdiff1d(available_positions, chosen_slots, assume_unique=False)


new_cands_1[query_idx]

tensor([ 3, 11,  6,  1, 12])

In [74]:
import torch

# Randomised injection of hard positives, then hard negatives, into the
# candidate row of `query_idx`.
# NOTE(review): this cell mutates `new_cands` in place and samples without a
# seed, so re-running it gives a different row each time.
query_idx = 1
hard_positives_num = 2
hard_negatives_num = 1
query_cui = queries_cuis[query_idx]
current_query_candidates_idxs = new_cands[query_idx].tolist()
current_candidates_cuis = dictionary_cuis[current_query_candidates_idxs]
positive_positions = np.where(current_candidates_cuis == query_cui)[0]

# Stays empty when no positive is injected, so the slot bookkeeping further
# down still works.
candidates_idxs_to_be_replaced = np.array([])
# Row positions that do not already hold a positive and may be overwritten.
candidates_idxs_available = list(set(range(topk)) - set(positive_positions))

print(f"current candidates are: {new_cands[query_idx]}")

# --- hard positives -------------------------------------------------------
# Dictionary indices carrying the query's CUI that are not in the row yet.
positive_candidates_indexes = dictionary_cui_to_idx.get(query_cui, [])
available_positives = list(set(positive_candidates_indexes) - set(current_query_candidates_idxs))

# Cap by the free slots as well: np.random.choice(..., replace=False) raises
# ValueError when asked for more items than the population holds.
positive_n = min(
    hard_positives_num,
    len(available_positives),
    len(candidates_idxs_available),
)
if positive_n > 0:
    positive_candidates = np.random.choice(available_positives, size=positive_n, replace=False)
    candidates_idxs_to_be_replaced = np.random.choice(candidates_idxs_available, size=positive_n, replace=False)
    new_cands[query_idx, candidates_idxs_to_be_replaced] = torch.from_numpy(positive_candidates)

print(f"after injecting {hard_positives_num} positive are: {new_cands[query_idx]}")

# --- hard negatives -------------------------------------------------------
# Slots just filled with positives are no longer available.
candidates_idxs_available = list(set(candidates_idxs_available) - set(candidates_idxs_to_be_replaced))

# Previous-epoch candidates of this query whose CUI differs from the query's.
prev_cands_idxs = previous_epoch_candidates[query_idx]
prev_dictionary_cuis = dictionary_cuis[prev_cands_idxs]
neg_mask = prev_dictionary_cuis != query_cui
hard_negative_indexes = prev_cands_idxs[neg_mask]
# NOTE(review): a sampled negative may already be present in the row; nothing
# here filters such duplicates — confirm whether that is intended.

# Same three-way cap as for the positives (avoids ValueError when no free
# slot is left).
negatives_n = min(
    hard_negatives_num,
    len(hard_negative_indexes),
    len(candidates_idxs_available),
)
if negatives_n > 0:
    hard_negative_candidates = np.random.choice(hard_negative_indexes, size=negatives_n, replace=False)
    candidates_idxs_to_be_replaced = np.random.choice(candidates_idxs_available, size=negatives_n, replace=False)
    new_cands[query_idx, candidates_idxs_to_be_replaced] = torch.from_numpy(hard_negative_candidates)

print(f"after injecting {hard_negatives_num} negatives are: {new_cands[query_idx]}")

new_cands[query_idx]

current candidates are: [ 0 11  6  1 12]
after injecting 2 positive are: [ 0 11  6  1 12]
after injecting 1 negatives are: [ 3 11  6  1 12]


array([ 3, 11,  6,  1, 12])