created 7 Aug 2023, ⛳ Areej

This notebook performs BioCreative normalisation after mention extraction: I train a model on mentions and concepts from HPO, then evaluate it using mentions extracted by T5_NER.


---


Most of the code is adapted from: https://github.com/UKPLab/sentence-transformers/blob/master/examples/applications/retrieve_rerank

In [None]:
!pip install -U sentence-transformers rank_bm25

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Collecting transformers<5.0.0,>=4.6.0 (from sentence-transformers)
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0 (from sentence-transformers)
  Downloading huggingface_hub-0.17.2-py3-none-any.whl (294 kB)


In [None]:
# Mount Google Drive; the dataset and model paths used later live under /content/drive.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import json
import csv
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import gzip
import os
import torch

# Print a warning when PyTorch cannot see a CUDA device.
gpu_available = torch.cuda.is_available()
if not gpu_available:
    print("Warning: No GPU found. Please add GPU to your notebook")

In [None]:
## Trainer imports ##
from torch.utils.data import DataLoader
import math
from sentence_transformers import LoggingHandler, losses, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime

In [None]:
######### TRAINING BLOCK #############
# Adapted from:
# https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/sts/training_stsbenchmark_continue_training.py

# Send INFO-level log records through sentence-transformers' LoggingHandler.
logging.basicConfig(
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
    handlers=[LoggingHandler()],
)

# Training configuration: dataset location, base model, batch size, epochs, output directory.
HPO_Training_path = '/content/drive/MyDrive/biocreative/Linking/AppendedTraining.tsv'
model_name = 'all-roberta-large-v1'
train_batch_size = 32
num_epochs = 20
run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_save_path = '/content/HPO-Linking-Training-' + model_name + '-' + run_timestamp

# Start from the pre-trained sentence-transformer checkpoint named above.
model = SentenceTransformer(model_name)

logging.info("Read HPO-Linking train dataset")

# Every TSV row becomes one (Spans, Terms) text pair; all pairs carry the label 1.0.
score = 1
with open(HPO_Training_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    train_samples = [
        InputExample(texts=[row['Spans'], row['Terms']], label=float(score))
        for row in reader
    ]

Downloading (…)eaf99/.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

Downloading (…)a0f59eaf99/README.md:   0%|          | 0.00/9.84k [00:00<?, ?B/s]

Downloading (…)f59eaf99/config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)f99/data_config.json:   0%|          | 0.00/15.7k [00:00<?, ?B/s]

Downloading (…)0f59eaf99/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)eaf99/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/328 [00:00<?, ?B/s]

Downloading (…)af99/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)0f59eaf99/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)59eaf99/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
# Sanity check: show the first training example built from the TSV.
print(train_samples[0])

<InputExample> label: 1.0, texts: High arched palate; High palate


In [None]:
# Shuffle the pairs: MegaBatchMarginLoss picks, for each (span, term) pair, the hardest negative
# among the other pairs of the same batch, so batch composition matters. Without shuffling,
# every epoch would replay exactly the same batches in file order.
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)

# Alternative losses kept for reference:
#train_loss = losses.CosineSimilarityLoss(model=model)
#train_loss = losses.MultipleNegativesRankingLoss(model=model)
train_loss = losses.MegaBatchMarginLoss(model=model)

# No development-set evaluator is configured; evaluation is skipped during training.

# Warm up the learning rate over the first 10% of all training steps.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model and write it to model_save_path
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          output_path=model_save_path)

# Reload the saved model from disk so later cells use exactly what was written.
model = SentenceTransformer(model_save_path)
##### TRAINING BLOCK ENDS HERE #####

Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/446 [00:00<?, ?it/s]

Iteration:   0%|          | 0/446 [00:00<?, ?it/s]

Iteration:   0%|          | 0/446 [00:00<?, ?it/s]

Iteration:   0%|          | 0/446 [00:00<?, ?it/s]

Iteration:   0%|          | 0/446 [00:00<?, ?it/s]

Iteration:   0%|          | 0/446 [00:00<?, ?it/s]

Iteration:   0%|          | 0/446 [00:00<?, ?it/s]

Iteration:   0%|          | 0/446 [00:00<?, ?it/s]

Iteration:   0%|          | 0/446 [00:00<?, ?it/s]

Iteration:   0%|          | 0/446 [00:00<?, ?it/s]

Iteration:   0%|          | 0/446 [00:00<?, ?it/s]

Iteration:   0%|          | 0/446 [00:00<?, ?it/s]

Iteration:   0%|          | 0/446 [00:00<?, ?it/s]

Iteration:   0%|          | 0/446 [00:00<?, ?it/s]

Iteration:   0%|          | 0/446 [00:00<?, ?it/s]

Iteration:   0%|          | 0/446 [00:00<?, ?it/s]

Iteration:   0%|          | 0/446 [00:00<?, ?it/s]

Iteration:   0%|          | 0/446 [00:00<?, ?it/s]

Iteration:   0%|          | 0/446 [00:00<?, ?it/s]

Iteration:   0%|          | 0/446 [00:00<?, ?it/s]

In [None]:
# Retrieval stage: the bi-encoder embeds the HPO concept names so they can be queried with
# semantic search. Alternatives are to reuse `model` from the training cell or to load the
# untuned 'all-roberta-large-v1' checkpoint.
BI_ENCODER_PATH = '/content/drive/MyDrive/HPO-Linking-Training-all-roberta-large-v1-2023-08-15_06-14-26'
CROSS_ENCODER_NAME = 'cross-encoder/ms-marco-electra-base'

bi_encoder = SentenceTransformer(BI_ENCODER_PATH)
bi_encoder.max_seq_length = 32  # original note: max length of one HPO term is 32
top_k = 32  # number of candidates retrieved by the bi-encoder per span

# Re-ranking stage: the cross-encoder re-scores the candidates retrieved by the bi-encoder
# to improve the quality of the final ranking.
cross_encoder = CrossEncoder(CROSS_ENCODER_NAME)

Downloading (…)lve/main/config.json:   0%|          | 0.00/730 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Concept list: each row of the TSV holds the HPO ID in column 0 and the term text in column 1.
with open("/content/drive/MyDrive/biocreative/HPO/HPO_SynonymsAll.tsv", 'r', encoding='utf-8') as tsv_file:
    HPOdata = [record for record in csv.reader(tsv_file, delimiter='\t')]
hpoID = [record[0] for record in HPOdata]
terms = [record[1] for record in HPOdata]
print("Total Rows:", len(HPOdata))

# Embed every term with the bi-encoder (original note: this takes about 3-5 minutes).
terms_embeddings = bi_encoder.encode(terms, convert_to_tensor=True, show_progress_bar=True)
print(terms[3])
print(HPOdata[6])

Total Rows: 14934


Batches:   0%|          | 0/467 [00:00<?, ?it/s]

Mode of inheritance
['HP:0000008', 'Abnormal morphology of female internal genitalia']


In [None]:
# We also compare the results to lexical search (keyword search). Here, we use
# the BM25 algorithm which is implemented in the rank_bm25 package.
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import numpy as np

# We lower case our text and remove stop-words from indexing
def bm25_tokenizer(text):
    """Tokenise ``text`` for BM25 indexing and querying.

    The text is lower-cased and split on whitespace; leading and trailing
    punctuation is stripped from every piece. Pieces that end up empty or that
    are English stop words are dropped.

    Returns:
        The remaining tokens, in their original order.
    """
    stripped_pieces = (piece.strip(string.punctuation) for piece in text.lower().split())
    return [
        piece for piece in stripped_pieces
        if piece and piece not in _stop_words.ENGLISH_STOP_WORDS
    ]

# Tokenise every HPO term (with a progress bar) and build the BM25 index over the result.
tokenized_terms = [bm25_tokenizer(entry) for entry in tqdm(terms)]
bm25 = BM25Okapi(tokenized_terms)

  0%|          | 0/14934 [00:00<?, ?it/s]

In [None]:
# This function will search all HPO ontology for terms that matches the entity span
def search(span):
    """Link one extracted mention to its best-matching HPO identifier.

    Retrieve-and-re-rank: the bi-encoder retrieves the ``top_k`` HPO terms most
    similar to ``span`` (cosine similarity via ``util.semantic_search``), the
    cross-encoder then scores every (span, term) pair, and the HPO ID of the
    candidate with the highest cross-encoder score is returned. Ties on the
    cross-encoder score are broken by the higher bi-encoder score.

    Args:
        span: Mention text. ``"NA"``, ``None`` or a blank string mean that no
            mention was extracted.

    Returns:
        ``"NA"`` when there is no mention to link; otherwise the HPO ID of the
        best candidate, or ``None`` if retrieval produced no candidates.
    """
    # Nothing to link: do not run the models on a missing or empty mention.
    if span is None or span == "NA" or not span.strip():
        return "NA"

    # The BM25 lexical scores never influenced the returned ID, so they are not
    # computed here; this avoids scoring the whole corpus on every call.

    ##### Semantic search #####
    span_embedding = bi_encoder.encode(span, convert_to_tensor=True)
    # semantic_search returns one hit list per query; there is a single query here.
    hits = util.semantic_search(span_embedding, terms_embeddings, top_k=top_k)[0]
    if not hits:
        return None

    ##### Re-ranking #####
    cross_inp = [[span, terms[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)
    for hit, cross_score in zip(hits, cross_scores):
        hit['cross-score'] = cross_score

    # Best candidate by cross-encoder score, then by bi-encoder score.
    best_hit = max(hits, key=lambda hit: (hit['cross-score'], hit['score']))
    return hpoID[best_hit['corpus_id']].replace("\n", " ")

In [None]:
tsv_file = "/content/drive/MyDrive/biocreative/FinalResults/FlanT5xl-LoRa-1024/Merged/TBL_MergedTest_FlanT5LoRa1024.tsv"
new_tsv_file = "Linked.tsv"

# Read every extracted mention. The header is taken from the reader itself so that an input
# without data rows can still be written out (rows[0] would raise IndexError in that case).
with open(tsv_file, 'r', newline='', encoding='utf-8') as input_tsv:
    reader = csv.DictReader(input_tsv, delimiter='\t')
    rows = list(reader)
    fieldnames = list(reader.fieldnames or [])

# Output columns: the input columns, with 'HPO Term' appended unless it is already present.
if 'HPO Term' not in fieldnames:
    fieldnames.append('HPO Term')

# Link each distinct span only once; search() runs two neural models per call, and the
# same span text may occur in several rows.
linked_spans = {}
for row in rows:
    span = row['Spans']
    if span not in linked_spans:
        linked_spans[span] = search(span)
    row['HPO Term'] = linked_spans[span]

with open(new_tsv_file, 'w', newline='', encoding='utf-8') as output_tsv:
    writer = csv.DictWriter(output_tsv, delimiter='\t', fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)

In [None]:
####### Writes the preprocessed file with KEY FINDINGS ONLY and drops the polarity column ########
#### because the BioCreative evaluation script evaluates only the key findings #####
# Linked.tsv is produced with csv.DictWriter, so it is parsed with the csv module as well:
# splitting raw lines would keep CSV quoting inside the text and, because of strip(),
# would lose a trailing empty 'HPO Term' field.
with open("Linked.tsv", "r", newline='', encoding='utf-8') as infile, \
        open("EvalScript.tsv", "w", encoding='utf-8') as outfile:
    # Write the header to the output file
    outfile.write("ObservationID\tText\tHPO Term\tSpans\n")

    reader = csv.reader(infile, delimiter='\t')
    # Skip the header line of the input file (tolerating a completely empty file)
    next(reader, None)

    for parts in reader:
        if not parts:
            continue  # blank line
        if len(parts) < 6:
            raise ValueError(
                "Linked.tsv line {}: expected at least 6 columns, got {}".format(
                    reader.line_num, len(parts)))

        # Columns are addressed by position: 0 = observation ID, 1 = text, 2 = polarity,
        # 4 = spans, 5 = HPO term.
        obsID = parts[0]
        Texts = parts[1]
        polarity = parts[2]
        spans = parts[4]
        hpo = parts[5]

        # Only rows whose polarity is "NA" are written; the polarity column itself is dropped.
        if polarity == "NA":
            outfile.write(f"{obsID}\t{Texts}\t{hpo}\t{spans}\n")

In [None]:
import csv

# Gold standard: the 'HPO Term' column of the tab-separated validation set, one entry per row.
gold_standard_file = "/content/drive/MyDrive/biocreative/dataset/BioCreativeVIII3_ValSet.tsv"
with open(gold_standard_file, 'r', newline='', encoding='utf-8') as gold_csv:
    gold_standard_hpo_terms = [
        record['HPO Term'] for record in csv.DictReader(gold_csv, delimiter='\t')
    ]

# Predictions: the 'HPO Term' column of a comma-separated file.
# NOTE(review): "Link_WOT.csv" is not written by any cell visible in this notebook — confirm
# where it comes from before trusting the scores below.
updated_csv_file = "Link_WOT.csv"
with open(updated_csv_file, 'r', newline='', encoding='utf-8') as updated_csv:
    updated_hpo_terms = [record['HPO Term'] for record in csv.DictReader(updated_csv)]

print(updated_hpo_terms[4])

HP:0000286


In [None]:
 # Initialize evaluation metrics variables
true_positive = 0
false_negative = 0
false_positive = 0

# zip() stops at the shorter list, so a length mismatch would silently drop rows from the
# comparison; make that visible instead.
if len(updated_hpo_terms) != len(gold_standard_hpo_terms):
    print("Warning: {} predicted rows vs {} gold rows; only the first {} pairs are compared".format(
        len(updated_hpo_terms), len(gold_standard_hpo_terms),
        min(len(updated_hpo_terms), len(gold_standard_hpo_terms))))

# Compare predicted and gold HPO terms row by row.
for updated_term, gold_term in zip(updated_hpo_terms, gold_standard_hpo_terms):
    if updated_term == gold_term:
        true_positive += 1
        continue

    # The gold term of this row was not reproduced.
    false_negative += 1
    # A wrong concrete term is also a false positive. "NA" is what search() returns when there
    # is no span to link, i.e. no term was asserted — presumably the prediction file uses the
    # same marker; verify.
    if updated_term != "NA":
        false_positive += 1

print("True Positives (TP):", true_positive)
print("False Negatives (FN):", false_negative)
print("False Positives (FP):", false_positive)


True Positives (TP): 603
False Negatives (FN): 131


In [None]:
# Every ratio falls back to 0.0 when its denominator is zero (no gold rows, no predictions,
# or no true positives) instead of raising ZeroDivisionError.
gold_total = len(gold_standard_hpo_terms)
exact_match = true_positive / gold_total if gold_total else 0.0

# Calculate precision, recall, and F1-score
predicted_positive = true_positive + false_positive
precision = true_positive / predicted_positive if predicted_positive else 0.0

actual_positive = true_positive + false_negative
recall = true_positive / actual_positive if actual_positive else 0.0

f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0

print("Exact Match:", exact_match)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

Exact Match: 0.8215258855585831
Precision: 1.0
Recall: 0.8215258855585831
F1-Score: 0.9020194465220642
