Import Libraries

In [1]:
import numpy as np
import json
import matplotlib.pyplot as plt
import pandas as pd
from openai import OpenAI
import pandas as pd
import re

In [2]:
def response_zeroshot(model_id,client, clinical_note):
    prompt = f"""
    You are an expert in extracting phenotype terms and their corresponding HPO IDs from clinical notes. Your task is to identify all relevant phenotype terms mentioned in the given clinical note and map them to their correct HPO IDs.

    Output the results in this format:
    <phenotype term> | <HPO ID>
    
    Input Clinical Note:
    {clinical_note}
    
    Output:
    """
    
    # Send the request to GPT-4
    response = client.chat.completions.create(
    model=model_id,
    messages=[
        {
            "role": "system",
            "content": "You are an AI assistant specialized in extracting medical phenotypes, syndromes, diseases and mapping them to HPO IDs."
        },
        {
            "role": "user",
            "content": prompt
        }
    ]
)
      # Extract and print the output
    output = response.choices[0].message.content
    def post_process(output):
        # Split the output into lines
        lines = output.strip().split("\n")
        
        # Extract the HPO terms and IDs
        hpo_names,ids=[],[]
        for line in lines:
            if "|" in line:
                name, hpo_id = line.split("|")
                hpo_names.append(name.strip())
                ids.append(hpo_id.strip())
        return hpo_names,ids
    A,B=post_process(output)
    return A,B    # return names and ids

def response_finetuning(model_id,client,query):
    response = client.chat.completions.create(
            model=model_id,  # Replace with your fine-tuned model ID
            messages=[
                {
                    "role": "system",
                    "content": "You are an assistant specialized in extracting phenotype terms and their corresponding HPO IDs from clinical notes."
                },
                {
                    "role": "user",
                    "content": f"Extract relevant phenotype terms and their HPO IDs from the following clinical note:\n{query}"
                }
            ]
        )
    x= response.choices[0].message.content
    def parse_gpt_output(text):
        text = text.strip()
        terms = text.split(",")  # Split by commas for predictions
        parsed = []
        ids=[]
        for term in terms:
            term = term.strip()
            if "(" in term and ")" in term:
                # Split only on the first "(" to avoid unpacking issues
                name,_ = term.split("(", 1)
                id_ = term.split("(")[1].split(")")[0]
                parsed.append(name.strip().lower())
                ids.append(id_)
        ids=[id.replace('_', ':') for id in ids]
        return parsed,ids
    A,B=parse_gpt_output(x)
    return A,B



def cosine_similarity(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

def get_top_k_similarities(embeddings, query, k):
    similarities = np.array([cosine_similarity(query, embedding) for embedding in embeddings])
    top_k_indices = similarities.argsort()[-k:][::-1]
    return top_k_indices, similarities[top_k_indices]


In [4]:
h2g=pd.read_csv('2024/phenotype_to_genes.txt',sep='\t')
# Select specific columns from the DataFrame
h2g = h2g[['hpo_id', 'gene_symbol']]
h2g= h2g.drop_duplicates(subset=['hpo_id', 'gene_symbol'])
# create a dictionary with hpo_id as key and it was associated with many gene symbols, so append it as list of strings
hpo_gene_dict = {}
for i in range(len(h2g)):
    if h2g['hpo_id'].iloc[i] in hpo_gene_dict:
        hpo_gene_dict[h2g['hpo_id'].iloc[i]].append(h2g['gene_symbol'].iloc[i])
    else:
        hpo_gene_dict[h2g['hpo_id'].iloc[i]] = [h2g['gene_symbol'].iloc[i]]

print("hp to genes dictionary created")

hpo_dict = {}
with open("2024/hpo_dict.txt") as f:
    for line in f:
        (key, val) = line.split(":H")
        # remove newline character  
        val = val.rstrip()
        hpo_dict[key] = "H"+val

print("hpo dictionary created")

# Load the embeddings

df=pd.read_csv('2024/HPO_embeddings_40k.csv')
db= df.to_numpy()
print("embeddings loaded")

# child parent dict
child_parent_dict = {}
with open("2024/child_parent_dict_merged.json") as f:
    child_parent_dict = json.load(f)

print("child parent dictionary created")


hp to genes dictionary created
hpo dictionary created
embeddings loaded
child parent dictionary created


In [9]:
import numpy as np

def process_embeddings(content, db, hpo_dict, client):
    """
    Process content to compute embeddings, find top 1 similar rows in db, and map to HPO terms and IDs.
    
    Args:
    - content: List of lists of text labels.
    - db: Database of embeddings for similarity comparison.
    - hpo_dict: Dictionary mapping HPO terms to IDs.
    - client: API client for generating embeddings.
    
    Returns:
    - Sep: List of lists of unique HPO terms and IDs, grouped by the number of terms in each sublist of content.
    """
    # Step 1: Calculate total number of terms and initialize embedding array
    total_terms = sum(len(i) for i in content)
    embeddings_20 = np.zeros((total_terms, 1536))

    # Step 2: Generate embeddings for each term
    z = 0
    for i in range(len(content)):
        for term in content[i]:
            response = client.embeddings.create(
                input=term,
                model="text-embedding-3-small"
            )
            embeddings_20[z] = response.data[0].embedding
            z += 1
            if z % 500 == 0:
                print(f"Processed {z} embeddings")

    # Step 3: Find top 1 similar rows in db
    top_1_indices = []
    for i in range(total_terms):
        top_1_indices.append(get_top_k_similarities(db, embeddings_20[i], 1)[0][0])
        if i % 50 == 0:
            print(f"Processed similarity for row {i}")

    # Step 4: Retrieve HPO terms and IDs
    hpo_terms = [
        f"{list(hpo_dict.keys())[index]} {list(hpo_dict.values())[index]}"
        for index in top_1_indices
    ]

    # Step 5: Split HPO terms into sublists based on the number of terms in content
    no_ids = [len(i) for i in content]
    Sep = []
    start = 0
    for num_terms in no_ids:
        Sep.append(list(set(hpo_terms[start:start + num_terms])))
        start += num_terms

    return Sep

def load_val_data(val_data_path):
    # Load validation data
    with open(val_data_path, "r") as f:
        val_data = json.load(f)

    val_eval_data = []  
    for i in val_data:
        val_eval_data.append(i["output"])
    return val_data,val_eval_data




### Choose the model:

#### ---------------------------------------Finetuned models---------------------------------------
1) GPT4o-mini-2024-07-18 : ft:gpt-4o-mini-2024-07-18:iisc-bangalore::AYf5TC9S
2) GPT4o-2024-08-06      : ft:gpt-4o-2024-08-06:iisc-bangalore::AZ03ME6y

#### ------------------------------------Base models for zeroshot----------------------------------

3) GPT4o-mini-2024-07-18 : gpt-4o-mini-2024-07-18
4) GPT4o-2024-08-06      : gpt-4o-2024-08-06


In [6]:
# openai api key
client = OpenAI(api_key="XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")

In [45]:
# Load the data
sample_data="/test/sample.json"
val_data,_ =load_val_data(sample_data)
print("Data loaded")
print("=====================================")
print(val_data)


Data loaded
[{'input': 'Nevoid basal cell carcinoma syndrome (NBCCS) is rare in black persons. We describe an 11-year-old black boy with NBCCS who presented with exotropia and a painful, expanding, cystic mass in the left posterior alveolar ridge. Further examination revealed odontogenic keratocysts with palmar and plantar pitting. Less than 5% of reported patients with NBCCS are black. To our knowledge, this is the first report of a black patient with NBCCS presenting with exotropia and an impacted molar displaced into the orbit by an odontogenic keratocyst.', 'output': ' basal cell carcinoma | HP_0002671\n exotropia | HP_0000577\n cystic mass in the left posterior alveolar ridge | HP_0006477\n odontogenic keratocysts | HP_0010603\n palmar and plantar pitting | HP_0010612\n END'}]


In [46]:
spans,HP_ids=[],[]

for i in val_data:
    span,ids=response_finetuning("ft:gpt-4o-mini-2024-07-18:iisc-bangalore::AYf5TC9S",client,i["input"])
    # span,ids =response_zeroshot("gpt-4o-mini-2024-07-18",client,sample_data)
    spans.append(span)
    HP_ids.append(ids)
print("phenotypes extracted from clinical text")
print(spans)
print("The corresponding HPO IDs are:")
print(HP_ids)

phenotypes extracted from clinical text
[['basal cell carcinoma', 'exotropia', 'cystic mass in the left posterior alveolar ridge', 'odontogenic keratocysts', 'palmar and plantar pitting']]
The corresponding HPO IDs are:
[['HP:0002671', 'HP:0000577', 'HP:0006477', 'HP:0010603', 'HP:0010612']]


Normalization using embeddings

In [19]:
embed_spans=process_embeddings(spans, db, hpo_dict, client)               # pass lists of list of spans for multiple clinical notes
embed_spans

Processed similarity for row 0


[['odontogenic keratocysts of the jaw HP:0010603',
  'exotropia HP:0000577',
  'alveolar ridge overgrowth HP:0009085',
  'palmar pits HP:0010610',
  'basal cell carcinoma HP:0002671']]

In [20]:
# extract HP ids after embeddings normalisation
HP_ids_embed=re.findall(r'HP:\d+',str(embed_spans[0]) )
print(HP_ids_embed)

['HP:0010603', 'HP:0000577', 'HP:0009085', 'HP:0010610', 'HP:0002671']


ACMG 81 GENES

In [21]:
genes = [
    "APC", "RET", "BRCA1", "BRCA2", "PALB2", "SDHD", "SDHAF2", "SDHC", "SDHB", 
    "MAX", "TMEM127", "BMPR1A", "SMAD4", "TP53", "MLH1", "MSH2", "MSH6", "PMS2", 
    "MEN1", "MUTYH", "NF2", "FBN1", "TGFBR1", "TGFBR2", "SMAD3", "ACTA2", "MYH11", 
    "PKP2", "DSP", "DSC2", "TMEM43", "DSG2", "RYR2", "CASQ2", "TRDN", "BAG3", 
    "DES", "RBM20", "TNNC1", "TNNT2", "LMNA", "FLNC", "TTN", "CALM1", "CALM2", 
    "CALM3", "COL3A1", "LDLR", "APOB", "PCSK9", "MYH7", "MYBPC3", "TNNI3", 
    "TPM1", "MYL3", "ACTC1", "PRKAG2", "MYL2", "KCNQ1", "KCNH2", "SCN5A", "BTD", 
    "GLA", "OTC", "GAA", "STK11", "HFE", "ACVRL1", "ENG", "RYR1", "CACNA1S", 
    "HNF1A", "RPE65", "ATP7B", "TTR", "PTEN", "RB1", "TSC1", "TSC2", "VHL", "WT1"
]


### Genes extracted after embedding normalisation

In [22]:
# get the gene symbols associated with the extracted HP ids
genes_associated = []
for hp_id in HP_ids_embed:
    if hp_id in hpo_gene_dict:
        genes_associated.extend(hpo_gene_dict[hp_id])
genes_associated = list(set(genes_associated))
print("Genes extracted after embedding normalisation")
print("-------------------------------------------------------------------------------------------")
print(f"The total number of genes associated with the extracted HPO IDs are : {len(genes_associated)}")
print("-------------------------------------------------------------------------------------------")
print(genes_associated)

Genes extracted after embedding normalisation
-------------------------------------------------------------------------------------------
The total number of genes associated with the extracted HPO IDs are : 181
-------------------------------------------------------------------------------------------
['PCDHGC4', 'CACNA1A', 'WNT10A', 'BCOR', 'RMRP', 'CRB1', 'FBN1', 'P4HTM', 'ERCC3', 'SPATA7', 'ANKH', 'PTCH2', 'UBAP2L', 'XPC', 'TMC6', 'APC', 'POGZ', 'SIAH1', 'LYRM7', 'POLD1', 'KEAP1', 'ALDH3A2', 'RNF2', 'ADGRG1', 'SOX5', 'PHOX2A', 'ATP1A2', 'KRAS', 'PTCH1', 'ERCC5', 'OFD1', 'POLRMT', 'SLC6A8', 'RBM10', 'PTRH2', 'MYF5', 'ADD3', 'FOXE1', 'CIB1', 'RP1L1', 'WDR4', 'SLC35A2', 'HRAS', 'TYMS', 'CD96', 'MLH1', 'FAS', 'THOC2', 'FTO', 'EXOC8', 'NTRK1', 'SC5D', 'SNRPN', 'GALNT2', 'GNB1', 'PGM2L1', 'FGFR2', 'SLC32A1', 'PMS2', 'ATRX', 'BCORL1', 'PRPS1', 'PMS1', 'SMO', 'ATP2A2', 'CBS', 'AAGAB', 'PAX6', 'RERE', 'MSH2', 'ATP1A3', 'DYRK1A', 'ADNP', 'RPL10', 'POGLUT1', 'CHEK2', 'SLC25A46', 'EPCAM', 'FGF

### Genes extacted after embedding normalisation + additional of parent ids

In [23]:
# Adding parent IDs to the extracted HPO IDs
HP_ids_embed_parents = []
for hp_id in HP_ids_embed:
    HP_ids_embed_parents.append(hp_id)
    if hp_id in child_parent_dict:
        HP_ids_embed_parents.append(child_parent_dict[hp_id])
print(f"HP ids after addition of parent ids: {HP_ids_embed_parents}")

HP ids after addition of parent ids: ['HP:0010603', 'HP:0100612', 'HP:0000577', 'HP:0020049', 'HP:0009085', 'HP:0006477', 'HP:0010610', 'HP:0040211', 'HP:0002671', 'HP:0008069']


In [24]:
# get the gene symbols associated with the added parent HP ids
genes_associated_parents = []
for hp_id in HP_ids_embed_parents:
    if hp_id in hpo_gene_dict:
        genes_associated_parents.extend(hpo_gene_dict[hp_id])
genes_associated_parents = list(set(genes_associated_parents))
print("Genes extracted after adding parent IDs")
print("-------------------------------------------------------------------------------------------------------------")
print(f"The total number of genes associated with the extracted HPO IDs and their parent IDs are : {len(genes_associated_parents)}")
print("-------------------------------------------------------------------------------------------------------------")
print(genes_associated_parents)

Genes extracted after adding parent IDs
-------------------------------------------------------------------------------------------------------------
The total number of genes associated with the extracted HPO IDs and their parent IDs are : 690
-------------------------------------------------------------------------------------------------------------
['MYOD1', 'SLC25A12', 'KRT17', 'WNT10A', 'BCOR', 'PEX10', 'ITGB4', 'CSTA', 'SMARCAD1', 'SMC5', 'PEX19', 'XPC', 'PTCH2', 'UBAP2L', 'FLG', 'APC', 'IL6', 'POGZ', 'EZH2', 'LYRM7', 'GJB4', 'RAF1', 'SMARCE1', 'SEC23B', 'PEPD', 'ADGRG1', 'SLC39A13', 'DPYD', 'RAB11B', 'MCTP2', 'TRAF7', 'CAST', 'NUP107', 'NIPBL', 'GJB2', 'OFD1', 'FCN3', 'SLC6A8', 'NF2', 'MSH3', 'RBM10', 'MYF5', 'SMPD4', 'BAP1', 'ADD3', 'FOXE1', 'CIB1', 'AQP5', 'KRT6C', 'MT-TS1', 'CD96', 'FAS', 'NLRP1', 'RIPK4', 'TPM2', 'KRT16', 'PGM2L1', 'WDR37', 'CST6', 'SULT2B1', 'CEP57', 'PRPS1', 'PMS1', 'WAC', 'SLC45A2', 'CBS', 'AAGAB', 'RHOH', 'PARN', 'RERE', 'LSS', 'RTL1', 'MBTPS2', 'MUSK',

### Genes extacted after embedding normalisation + additional of parent ids + ACMG 81 genes addition

In [25]:
# Adding the ACMG 81 genes to the list of genes
final_genes = genes_associated_parents + genes
final_genes = list(set(final_genes))
print("Final list of genes after 3 optimizations: embeddings normalisation+parent ids+ACMG 81 genes")
print("------------------------------------------------------------------------------------------------------------------------")
print(f"The total number of genes associated with the extracted HPO IDs, their parent IDs, and the ACMG 81 genes are : {len(final_genes)}")
print("------------------------------------------------------------------------------------------------------------------------")
print(final_genes)


Final list of genes after 3 optimizations: embeddings normalisation+parent ids+ACMG 81 genes
------------------------------------------------------------------------------------------------------------------------
The total number of genes associated with the extracted HPO IDs, their parent IDs, and the ACMG 81 genes are : 743
------------------------------------------------------------------------------------------------------------------------
['MYOD1', 'SLC25A12', 'KRT17', 'WNT10A', 'BCOR', 'PEX10', 'ITGB4', 'RB1', 'CSTA', 'SMARCAD1', 'SMC5', 'PEX19', 'XPC', 'UBAP2L', 'PTCH2', 'FLG', 'APC', 'IL6', 'POGZ', 'EZH2', 'LYRM7', 'TPM1', 'GJB4', 'RAF1', 'SMARCE1', 'SEC23B', 'PEPD', 'ADGRG1', 'SLC39A13', 'DPYD', 'RAB11B', 'MCTP2', 'TRAF7', 'CAST', 'NUP107', 'NIPBL', 'GJB2', 'OFD1', 'FCN3', 'SLC6A8', 'MSH3', 'RBM10', 'MYF5', 'SMPD4', 'BAP1', 'ADD3', 'FOXE1', 'CIB1', 'AQP5', 'KCNQ1', 'KRT6C', 'MT-TS1', 'CD96', 'FAS', 'NLRP1', 'RIPK4', 'TPM2', 'KRT16', 'PGM2L1', 'WDR37', 'CST6', 'SULT2B1', 'CEP

### BioMED_NER

In [37]:
def merge_consecutive_entities(entities, text):
    entities = sorted(entities, key=lambda x: x['start'])
    merged_entities = []
    current_entity = None

    for entity in entities:
        if current_entity is None:
            current_entity = entity
        elif (
            entity['entity_group'] == current_entity['entity_group'] and
            (entity['start'] <= current_entity['end'])
        ):
            # Merge based on start and end positions in the text
            current_entity['end'] = max(current_entity['end'], entity['end'])
            current_entity['word'] = text[current_entity['start']:current_entity['end']]
            current_entity['score'] = (current_entity['score'] + entity['score']) / 2  
        else:
            merged_entities.append(current_entity)
            current_entity = entity
    if current_entity:
        merged_entities.append(current_entity)

    return merged_entities


we defined some set of rules to extract phenotypes from clinical text

In [27]:
def get_spans(output):
    x = []
    i = 0
    while i < len(output):
        temp = ""
        if i + 1 < len(output) and output[i]['entity_group'] == 'Detailed_description' and output[i + 1]['entity_group'] == 'Disease_disorder':
            temp = output[i]['word'] + " " + output[i + 1]['word']
            x.append(temp)
            i += 2  # Skip the next element as it is already processed
            continue
        if i + 1 < len(output) and output[i]['entity_group'] == 'Detailed_description' and output[i + 1]['entity_group'] == 'Sign_symptom':
            temp = output[i]['word'] + " " + output[i + 1]['word']
            x.append(temp)
            i += 2  # Skip the next element as it is already processed
            continue
        if i+1< len(output) and output[i]['entity_group'] == 'Biological_structure' and output[i+1]['entity_group'] == 'Sign_symptom':
            temp = output[i]['word'] + " " + output[i+1]['word']
            x.append(temp)
            i += 2
            continue
        if output[i]['entity_group'] == 'SIGN_SYMPTOM':
            x.append(output[i]['word'])
        if output[i]['entity_group'] == 'Disease_disorder':
            x.append(output[i]['word'])
        i += 1
    return x

In [28]:
from transformers import pipeline
pipe = pipeline("token-classification", model="Helios9/BIOMed_NER", aggregation_strategy='simple')

2025-01-30 20:52:26.588815: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-30 20:52:26.622930: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-30 20:52:26.622966: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-30 20:52:26.624036: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-30 20:52:26.629671: I tensorflow/core/platform/cpu_feature_guar

In [39]:
# Load the data
sample_data="/test/sample.json"
val_data,_ =load_val_data(sample_data)
print(val_data)
print("data loaded")
spans,HP_ids=[],[]

for i in val_data:
    output = pipe(i["input"])
    output = merge_consecutive_entities(output, i["input"])
    NER_spans = get_spans(output)
    spans.append(list(set(NER_spans)))
print(spans)

[{'input': 'Nevoid basal cell carcinoma syndrome (NBCCS) is rare in black persons. We describe an 11-year-old black boy with NBCCS who presented with exotropia and a painful, expanding, cystic mass in the left posterior alveolar ridge. Further examination revealed odontogenic keratocysts with palmar and plantar pitting. Less than 5% of reported patients with NBCCS are black. To our knowledge, this is the first report of a black patient with NBCCS presenting with exotropia and an impacted molar displaced into the orbit by an odontogenic keratocyst.', 'output': ' basal cell carcinoma | HP_0002671\n exotropia | HP_0000577\n cystic mass in the left posterior alveolar ridge | HP_0006477\n odontogenic keratocysts | HP_0010603\n palmar and plantar pitting | HP_0010612\n END'}]
data loaded
[[' odontogenic keratocysts', ' NBCCS', 'cystic mass', 'Nevoid basal cell carcinoma syndrome', 'CS', 'NBCCS', 'plantar pitting', ' odontogenic keratocyst']]
