In [1]:
import numpy as np
import json
from typing import Optional
import matplotlib.pyplot as plt
import pandas as pd
from openai import OpenAI

# Never commit an API key to the notebook. With no `api_key` argument the
# OpenAI client reads the key from the OPENAI_API_KEY environment variable,
# so export that variable before starting the kernel.
client = OpenAI()

In [11]:

def parse_hpo_output(text):
    """Parse a ground-truth annotation string into a set of (name, id) pairs.

    The text is processed line by line. Lines without a "|" are ignored. For
    every other line, the part before the last "|" is the term name
    (stripped, lower-cased) and the part after it is the identifier
    (stripped, upper-cased).

    Args:
        text: Annotation string with one ``name | id`` entry per line,
            optionally containing an "END" marker.

    Returns:
        set[tuple[str, str]]: The distinct ``(name, id)`` pairs found.
    """
    # NOTE(review): this removes every literal "END", not only a trailing
    # marker line, so an upper-case "END" inside a term would be dropped too.
    text = text.replace("END", "").strip()

    parsed = set()
    for line in text.split("\n"):
        line = line.strip()
        if "|" not in line:
            continue  # not an annotation line
        # Split once, from the right: the identifier is the trailing field, and
        # a name that itself contains "|" no longer raises ValueError on unpack.
        name, hpo_id = line.rsplit("|", 1)
        parsed.add((name.strip().lower(), hpo_id.strip().upper()))
    return parsed
def parse_gpt_output(text):
    """Parse a model reply of the form ``name (id), name (id), ...``.

    The reply is split on commas. Pieces that do not contain both "(" and ")"
    are ignored. For every other piece, the text before the last "(" is the
    term name (stripped, lower-cased) and the text after it, with surrounding
    ")" removed, is the identifier (stripped, upper-cased).

    Args:
        text: Raw model reply.

    Returns:
        set[tuple[str, str]]: The distinct ``(name, id)`` pairs found.
    """
    # NOTE(review): splitting on "," also splits term names that themselves
    # contain a comma; such names are truncated or dropped.
    parsed = set()
    for chunk in text.strip().split(","):
        chunk = chunk.strip()
        if "(" not in chunk or ")" not in chunk:
            continue
        # Split on the LAST "(" so a parenthesised qualifier inside the name,
        # e.g. "hearing loss (bilateral) (HP_0000365)", stays part of the name
        # instead of leaking into the identifier. maxsplit=1 still guarantees
        # exactly two parts for the unpack.
        name, hpo_id = chunk.rsplit("(", 1)
        parsed.add((name.strip().lower(), hpo_id.strip(")").strip().upper()))
    return parsed

def hp_ids(val_eval_data, gpt_val_data, bool=1):
    """Collect reference and predicted identifier lists, sample by sample.

    Args:
        val_eval_data: Ground-truth annotation strings, one per sample, in the
            format understood by ``parse_hpo_output``.
        gpt_val_data: Predictions aligned with ``val_eval_data``. When ``bool``
            is truthy each item is a raw model reply parsed with
            ``parse_gpt_output``; otherwise each item is an iterable of
            strings (as returned by ``post_gpt``) that is scanned for
            whitespace-separated tokens starting with "HP:".
        bool: Selects the prediction format (truthy = raw reply). The name
            shadows the builtin inside this function; it is kept because it is
            part of the call interface.

    Returns:
        tuple[list[list[str]], list[list[str]]]: ``(val_ids, gpt_ids)``, one
        inner list of identifiers per sample. Samples are paired with ``zip``,
        so surplus items in the longer input are ignored.
    """
    reference_ids = []
    predicted_ids = []

    for reference_text, prediction in zip(val_eval_data, gpt_val_data):
        # Keep only the identifier half of every (name, id) pair.
        reference_ids.append([pair[1] for pair in parse_hpo_output(reference_text)])

        if not bool:
            tokens = []
            for retrieved in set(prediction):  # de-duplicate retrieved strings
                for token in retrieved.split():
                    if token.startswith("HP:"):
                        # "HP:0001250" -> "HP_0001250"; presumably the id form
                        # used by the ground truth - confirm against the data.
                        tokens.append(token.replace(':', '_'))
            predicted_ids.append(tokens)
        else:
            predicted_ids.append([pair[1] for pair in parse_gpt_output(prediction)])

    return reference_ids, predicted_ids

def metric(val_ids,gpt_ids):
    """Compute and print sample-averaged precision, recall and F1.

    Each sample is scored on its own, comparing the set of true identifiers
    with the set of predicted ones, and the per-sample scores are then
    averaged with equal weight per sample (macro average).

    Args:
        val_ids: Per-sample iterables of ground-truth identifiers.
        gpt_ids: Per-sample iterables of predicted identifiers, aligned with
            ``val_ids``.

    Returns:
        tuple[float, float, float]: ``(avg_precision, avg_recall, avg_f1)``.
        All three are 0.0 when there are no samples to score.
    """
    precision_scores = []
    recall_scores = []
    f1_scores = []

    for true_hpo, pred_hpo in zip(val_ids, gpt_ids):
        true_set = set(true_hpo)
        pred_set = set(pred_hpo)

        tp = len(true_set & pred_set)
        fp = len(pred_set - true_set)
        fn = len(true_set - pred_set)

        # A score whose denominator is empty is defined as 0 for that sample.
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

    # Guard the averages: with no samples the original divided by zero.
    n_scored = len(precision_scores)
    avg_precision = sum(precision_scores) / n_scored if n_scored else 0.0
    avg_recall = sum(recall_scores) / n_scored if n_scored else 0.0
    avg_f1 = sum(f1_scores) / n_scored if n_scored else 0.0

    # Print results
    print(f"Total Samples: {len(val_ids)}")
    print(f"Average Precision: {avg_precision:.4f}")
    print(f"Average Recall: {avg_recall:.4f}")
    print(f"Average F1 Score: {avg_f1:.4f}")

    return avg_precision, avg_recall, avg_f1

def create_finetune_job(client, training: str, validation: Optional[str] = None, model_name: str = "gpt-4", epoch: int = 4, batch_size: int = 1):
    """Submit a fine-tuning job through ``client.fine_tuning.jobs.create``.

    Args:
        client: Object exposing ``fine_tuning.jobs.create(**kwargs)``.
        training: Passed as ``training_file``.
        validation: Passed as ``validation_file``; omitted from the request
            when falsy.
        model_name: Passed as ``model``.
        epoch: Passed as the ``n_epochs`` hyperparameter.
        batch_size: Passed as the ``batch_size`` hyperparameter. Defaults to
            1, the value that used to be hard-coded.

    Returns:
        Whatever ``client.fine_tuning.jobs.create`` returns.
    """
    job_params = {
        "training_file": training,
        "model": model_name,
        "hyperparameters": {"n_epochs": epoch, "batch_size": batch_size},
    }
    if validation:  # add the validation file only if one was provided
        job_params["validation_file"] = validation

    return client.fine_tuning.jobs.create(**job_params)

def load_val_data(val_data_path):
    """Load a validation set from a JSON file.

    Args:
        val_data_path: Path to a JSON file holding a sequence of records,
            each with an "output" key.

    Returns:
        tuple[list, list]: ``(val_data, val_eval_data)`` - the parsed records
        and the list of their "output" values, in file order.

    Raises:
        KeyError: If a record has no "output" key.
    """
    # Read explicitly as UTF-8 rather than the platform default encoding, so
    # non-ASCII text decodes the same way on every machine.
    with open(val_data_path, "r", encoding="utf-8") as f:
        val_data = json.load(f)

    val_eval_data = [entry["output"] for entry in val_data]
    return val_data, val_eval_data

def load_gpt_val_data(val_data, client,model_id):
    """Query the chat model once per validation record and collect the replies.

    Args:
        val_data: Iterable of records, each with an "input" key holding the
            clinical note text.
        client: Object exposing ``chat.completions.create(model=, messages=)``.
        model_id: Model identifier forwarded to the API.

    Returns:
        list: The ``choices[0].message.content`` of every response, in input
        order.
    """
    system_prompt = "You are an assistant specialized in extracting phenotype terms and their corresponding HPO IDs from clinical notes."

    replies = []
    for sample in val_data:
        user_prompt = f"Extract relevant phenotype terms and their HPO IDs from the following clinical note:\n{sample['input']}"
        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        completion = client.chat.completions.create(model=model_id, messages=conversation)
        # Only the first choice's text is kept.
        replies.append(completion.choices[0].message.content)
    return replies

def cosine_similarity(v1, v2):
    """Return the cosine of the angle between two vectors.

    Args:
        v1: First vector (array-like of numbers).
        v2: Second vector, same length as ``v1``.

    Returns:
        float: ``dot(v1, v2) / (|v1| * |v2|)``, or 0.0 when either vector has
        zero norm.
    """
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denominator == 0:
        # 0/0 would yield NaN, and NaN sorts to the END of an argsort - i.e. it
        # would be ranked as the best match by a top-k search. Treat an
        # all-zero vector as "no similarity" instead.
        return 0.0
    return np.dot(v1, v2) / denominator

def get_top_k_similarities(embeddings, query, k):
    """Find the ``k`` rows of ``embeddings`` most cosine-similar to ``query``.

    Args:
        embeddings: 2-D array-like, one embedding per row.
        query: 1-D array-like with the same length as a row of ``embeddings``.
        k: Number of best matches to return. If ``k`` exceeds the number of
            rows, all rows are returned.

    Returns:
        tuple[np.ndarray, np.ndarray]: ``(indices, similarities)`` ordered from
        most to least similar. Both are empty when ``k <= 0`` or there are no
        embeddings.
    """
    matrix = np.asarray(embeddings, dtype=float)
    query_vec = np.asarray(query, dtype=float)

    # `[-k:]` with k == 0 is `[0:]` and would return EVERY index, so handle the
    # degenerate requests explicitly.
    if k <= 0 or matrix.size == 0:
        return np.array([], dtype=int), np.array([], dtype=float)

    # One matrix-vector product instead of a Python-level loop over all rows.
    dots = matrix @ query_vec
    norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query_vec)

    # Rows with a zero norm get similarity 0.0 rather than NaN: argsort places
    # NaN last, which would make such rows the "top" match.
    similarities = np.zeros(matrix.shape[0], dtype=float)
    nonzero = norms > 0
    similarities[nonzero] = dots[nonzero] / norms[nonzero]

    top_k_indices = similarities.argsort()[-k:][::-1]
    return top_k_indices, similarities[top_k_indices]

def post_gpt(gpt_val_data, hpo_path="/2022/hpo2022.txt", embeddings_path='/2022/HPO_embeddings_38k.csv', embedding_model="text-embedding-3-small"):
    """Re-map predicted phenotype names onto HPO entries by embedding similarity.

    For every model reply, the predicted term names are extracted with
    ``parse_gpt_output``, each name longer than three characters is embedded
    through the module-level ``client``, and the single most similar row of
    the reference embedding table is looked up with
    ``get_top_k_similarities``. The matching HPO entry is rendered as
    ``"<key> <value>"``.

    Args:
        gpt_val_data: Iterable of raw model replies, one per sample.
        hpo_path: UTF-8 text file; every line containing ":H" is split at the
            first ":H" into a key and a value (the "H" is put back on the
            value).
        embeddings_path: CSV file loaded with ``pandas.read_csv`` and used as
            the reference embedding table.
        embedding_model: Model name passed to ``client.embeddings.create``.

    Returns:
        list[list[str]]: For each sample, the distinct retrieved
        ``"<key> <value>"`` strings (order within a sample is arbitrary).
    """
    # Load the HPO dictionary.
    hpo_dict = {}
    with open(hpo_path, encoding='utf-8') as f:
        for line in f:
            if ":H" in line:  # only process lines containing ':H'
                key, val = line.split(":H", 1)  # split only at the first ':H'
                hpo_dict[key] = "H" + val.rstrip()

    # Materialise the lookup tables once; they were previously rebuilt from the
    # dict on every single lookup.
    # NOTE(review): row i of the embeddings CSV is assumed to correspond to the
    # i-th distinct key of the HPO file - duplicate keys in that file or a
    # missing CSV header row would shift the alignment. Confirm.
    hpo_names = list(hpo_dict.keys())
    hpo_values = list(hpo_dict.values())

    # Load the reference embeddings.
    db = pd.read_csv(embeddings_path).to_numpy()

    # Predicted term names per sample; very short names (<= 3 chars) are skipped.
    names_per_sample = [
        [name for name, _ in parse_gpt_output(pred) if len(name) > 3]
        for pred in gpt_val_data
    ]

    # Embed every predicted name (one request per name).
    vectors = []
    for names in names_per_sample:
        for name in names:
            response = client.embeddings.create(
                input=name,
                model=embedding_model
            )
            vectors.append(np.asarray(response.data[0].embedding, dtype=float))
            if len(vectors) % 50 == 0:
                print(len(vectors))

    print("-----------------indices-----------------")

    # Nearest reference entry for every embedded name, in embedding order.
    gpt_retrieved_terms = []
    for i, vector in enumerate(vectors):
        best = get_top_k_similarities(db, vector, 1)[0][0]
        gpt_retrieved_terms.append(hpo_names[best] + " " + hpo_values[best])
        if i % 50 == 0:
            print(i)

    # Regroup the flat list of retrieved terms back into one list per sample.
    gpt_sep = []
    offset = 0
    for names in names_per_sample:
        gpt_sep.append(list(set(gpt_retrieved_terms[offset:offset + len(names)])))
        offset += len(names)

    return gpt_sep


Biolark-GSC

In [12]:
# Evaluate the fine-tuned model on the Biolark validation file.
val_data_path = "/Evaluation/datasets/biolark_val.json"
model="ft:gpt-4o-mini-2024-07-18:iisc-bangalore::AYf5TC9S"
val_data, eval_data = load_val_data(val_data_path)
# One chat-completion request per validation record.
gpt_val_data = load_gpt_val_data(val_data, client, model)
# First report: identifiers parsed directly from the model replies.
val_ids, gpt_ids = hp_ids(eval_data, gpt_val_data)
precision, recall, f1 = metric(val_ids, gpt_ids)
print("Post processing: embeddings")
# Second report: predicted names re-mapped to HPO entries via embeddings.
# The third argument 0 tells hp_ids the predictions are post_gpt output.
# Note: val_ids, gpt_ids, precision, recall and f1 are overwritten here.
gpt_sep = post_gpt(gpt_val_data)
val_ids, gpt_ids = hp_ids(eval_data, gpt_sep, 0)
precision, recall, f1 = metric(val_ids, gpt_ids)


Total Samples: 23
Average Precision: 0.9588
Average Recall: 0.9578
Average F1 Score: 0.9581
Post processing: embeddings
50
100
150
-----------------indices-----------------
0
50
100
150
Total Samples: 23
Average Precision: 0.6725
Average Recall: 0.6541
Average F1 Score: 0.6626


ID68

In [13]:
# Evaluate the same fine-tuned model on the ID_68 file.
val_data_path = "/Evaluation/datasets/ID_68.json"
model="ft:gpt-4o-mini-2024-07-18:iisc-bangalore::AYf5TC9S"
val_data, eval_data = load_val_data(val_data_path)
# One chat-completion request per record.
gpt_val_data = load_gpt_val_data(val_data, client, model)
# First report: identifiers parsed directly from the model replies.
val_ids, gpt_ids = hp_ids(eval_data, gpt_val_data)
precision, recall, f1 = metric(val_ids, gpt_ids)
print("Post processing: embeddings")
# Second report: predicted names re-mapped to HPO entries via embeddings.
# The third argument 0 tells hp_ids the predictions are post_gpt output.
# Note: this cell reuses (and overwrites) the variable names of the previous
# evaluation cell.
gpt_sep = post_gpt(gpt_val_data)
val_ids, gpt_ids = hp_ids(eval_data, gpt_sep, 0)
precision, recall, f1 = metric(val_ids, gpt_ids)


Total Samples: 68
Average Precision: 0.1134
Average Recall: 0.1464
Average F1 Score: 0.1247
Post processing: embeddings
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
-----------------indices-----------------
0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
Total Samples: 68
Average Precision: 0.6173
Average Recall: 0.7275
Average F1 Score: 0.6503
