In [None]:
import ast
import json
import os
from typing import List, Tuple, Optional, Dict

import numpy as np
import openai
import pandas as pd
from openai import OpenAI
from sentence_transformers import SentenceTransformer
# from sentence_transformers.util import cos_sim
from tqdm import tqdm

In [None]:
# Load the two input tables.
#   review_df          -> one row per review; its "structured_info" column is read downstream.
#   merged_df_embedded -> AE names ("AE") together with their emb_0..emb_767 embedding columns.
review_df = pd.read_csv(filepath_or_buffer="data_extracted/extracted_reviews_raw10.csv")
merged_df_embedded = pd.read_csv(filepath_or_buffer="data_embedded/embedded_ae.csv")

In [None]:
# Sentence-embedding model used by find_top_matches to encode each reported side effect.
# NOTE(review): presumably the same model that produced the emb_* columns in
# data_embedded/embedded_ae.csv — confirm, otherwise the similarity scores are meaningless.
model = SentenceTransformer("pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb")

In [None]:
# Load AEs and embeddings
# Rows with a missing embedding value are dropped ONCE, and both the AE texts and the
# embedding matrix are taken from that same filtered frame. Filtering only the matrix
# would shift its row positions relative to the text list and return the wrong AE names.
embedding_columns = [f'emb_{i}' for i in range(768)]
ae_rows_complete = merged_df_embedded.dropna(subset=embedding_columns)
ae_texts = ae_rows_complete['AE'].tolist()
ae_embeddings = ae_rows_complete[embedding_columns].values

# Position i of ae_texts must describe row i of ae_embeddings.
assert len(ae_texts) == len(ae_embeddings), "AE texts and embeddings are misaligned"

In [None]:
# Extract side effect names
def extract_side_effects(structured_info_str: str) -> List[str]:
    """Return the cleaned side-effect names held in a serialized ``structured_info`` record.

    The record is parsed as a Python literal first. If that fails it is parsed as JSON, so
    records containing ``null``/``true``/``false`` (the format ``json.dumps`` emits) are
    accepted as well.

    Args:
        structured_info_str: Serialized mapping expected to contain a ``"side_effects"``
            list of mappings, each with a ``"name"`` entry.

    Returns:
        The whitespace-stripped, non-empty string names in their original order. Entries
        whose name is missing, empty, whitespace-only or not a string are skipped. An empty
        list is returned (after printing the error) when the record cannot be parsed or is
        not a mapping.
    """
    try:
        try:
            info = ast.literal_eval(structured_info_str)
        except (ValueError, SyntaxError):
            # Not a Python literal (e.g. contains JSON ``null``): try JSON instead.
            info = json.loads(structured_info_str)

        names = []
        # ``or []`` covers a "side_effects" key that is present but set to None.
        for se in info.get("side_effects") or []:
            raw_name = se.get("name") if isinstance(se, dict) else None
            if not isinstance(raw_name, str):
                # A non-string name must not abort extraction for the whole record.
                continue
            cleaned = raw_name.strip()
            if cleaned:
                names.append(cleaned)
        return names
    except Exception as e:
        print(f"[extract_side_effects] Error: {e}")
        return []

In [None]:
# Find top matches
def find_top_matches(effect: str, model, ae_texts, ae_embeddings, threshold: float = 0.8, top_k: int = 10) -> List[str]:
    """Return the AE texts whose embeddings score highest against a reported side effect.

    The effect is encoded with ``model.encode(..., normalize_embeddings=True)`` and scored
    against every row of ``ae_embeddings`` with a dot product.
    NOTE(review): the dot product is a cosine similarity only if the rows of
    ``ae_embeddings`` are unit-normalized — confirm how the stored embeddings were produced.

    Args:
        effect: Reported side-effect text. Blank or non-string values yield ``[]``.
        model: Object exposing ``encode(text, convert_to_numpy=..., normalize_embeddings=...)``.
        ae_texts: AE names; ``ae_texts[i]`` must describe row ``i`` of ``ae_embeddings``.
        ae_embeddings: 2-D array-like with one embedding per row.
        threshold: Minimum score a candidate must reach to be returned.
        top_k: Maximum number of candidates returned (default 10).

    Returns:
        Up to ``top_k`` distinct AE texts with score >= ``threshold``, best first. An empty
        list is returned (after printing the error) on any failure, including a length
        mismatch between ``ae_texts`` and ``ae_embeddings``.
    """
    try:
        if not isinstance(effect, str) or not effect.strip():
            return []

        ae_matrix = np.asarray(ae_embeddings)
        if len(ae_texts) != ae_matrix.shape[0]:
            # Misaligned inputs would silently map scores to the wrong AE names.
            raise ValueError(
                f"ae_texts has {len(ae_texts)} entries but ae_embeddings has {ae_matrix.shape[0]} rows"
            )

        query_emb = model.encode(effect, convert_to_numpy=True, normalize_embeddings=True)
        similarities = np.dot(ae_matrix, query_emb)
        # Stable descending order keeps ties in their original row order.
        ranked = np.argsort(-similarities, kind="stable")

        top_matches = []
        seen = set()
        for i in ranked:
            if len(top_matches) >= top_k:
                break
            # Written as "not >=" so NaN scores (sorted last) also stop the scan.
            if not similarities[i] >= threshold:
                break
            text = ae_texts[i]
            if text in seen:
                # The same AE text can appear on several rows; keep only its best hit.
                continue
            seen.add(text)
            top_matches.append(text)
        return top_matches
    except Exception as e:
        print(f"[find_top_matches] Error for '{effect}': {e}")
        return []

In [None]:
# Read the API key from the environment instead of embedding it in the notebook.
# An empty-string key makes every request fail authentication, and gpt_resolve_match
# swallows that error, so no side effect would ever be mapped.
# When OPENAI_API_KEY is unset, openai_key is None and the OpenAI client raises a clear
# error at construction time rather than failing on every request.
openai_key = os.environ.get("OPENAI_API_KEY")
openai_client = OpenAI(api_key=openai_key)

In [None]:
# Prompt template for the GPT disambiguation step. It has two placeholders, filled with
# str.format: {side_effect} (the reported term) and {ae_list} (the candidate AEs).
# Paragraphs are separated by a blank line.
GPT_MATCHING_PROMPT = "\n\n".join(
    [
        "You are a clinical NLP assistant helping map patient-reported side effects to standardized adverse events (AEs).",
        "A user has reported the side effect: '{side_effect}'",
        "Below are several standardized AEs. Your task is to choose the one that most accurately matches the reported term.",
        # The two matching rules form one paragraph, separated by a single newline.
        "Only choose an AE if it clearly matches the meaning of the reported term.\n"
        "Do NOT choose an AE based on partial overlap or general similarity. For example, 'sickness' and 'morning sickness' are NOT the same.",
        "{ae_list}",
        "If none of the options is an unambiguous match, reply with an empty string.",
        "Best matching AE:",
    ]
)

In [None]:
# GPT-based resolution
def gpt_resolve_match(side_effect: str, ae_candidates: List[str], model: str = "gpt-4o-mini") -> Optional[str]:
    """Ask the chat model which candidate AE, if any, matches a reported side effect.

    The candidates are rendered as a ``- item`` bullet list inside ``GPT_MATCHING_PROMPT``
    and sent through the module-level ``openai_client``.

    Args:
        side_effect: The patient-reported term.
        ae_candidates: Candidate AE names. An empty list short-circuits to ``None``
            without calling the API.
        model: Chat model name passed to the API (default ``"gpt-4o-mini"``).

    Returns:
        The element of ``ae_candidates`` chosen by the model, or ``None`` when the reply is
        empty, names no candidate, or the request fails (the error is printed).
    """
    if not ae_candidates:
        return None

    formatted_ae_list = "\n".join(f"- {ae}" for ae in ae_candidates)
    formatted_prompt = GPT_MATCHING_PROMPT.format(
        side_effect=side_effect,
        ae_list=formatted_ae_list
    )

    try:
        response = openai_client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": formatted_prompt}],
            temperature=0.0,
            max_tokens=100
        )
        # ``content`` can be None; treat it like an empty reply.
        result = (response.choices[0].message.content or "").strip()
        if not result:
            return None
        if result in ae_candidates:
            return result

        # Tolerant fallback: the model often echoes the "- " bullet, wraps the answer in
        # quotes, appends a period, or changes the casing. Compare a normalized form of
        # the first reply line and return the candidate exactly as it was supplied.
        first_line = result.splitlines()[0]
        cleaned = first_line.lstrip("-*\u2022 \t").strip(" \t\r\n\"'`.")
        if not cleaned:
            return None
        wanted = cleaned.casefold()
        for ae in ae_candidates:
            if isinstance(ae, str) and ae.strip().casefold() == wanted:
                return ae
        return None
    except Exception as e:
        print(f"[gpt_resolve_match] Error for '{side_effect}': {e}")
        return None

In [None]:
# Replace in original structured_info
def update_structured_info(structured_info_str: str, replacements: Dict[str, str]) -> str:
    """Rewrite side-effect names in a serialized record and return it as JSON.

    The record is parsed as a Python literal first and as JSON if that fails, so records
    already containing ``null``/``true``/``false`` are accepted as well.

    Args:
        structured_info_str: Serialized mapping with an optional ``"side_effects"`` list of
            mappings carrying a ``"name"`` entry.
        replacements: Maps a whitespace-stripped reported name to its replacement.

    Returns:
        ``json.dumps`` of the updated record. Entries whose name is missing or not a string
        are left untouched. If the record cannot be parsed or is not a mapping, the error is
        printed and ``structured_info_str`` is returned unchanged.
    """
    try:
        try:
            info = ast.literal_eval(structured_info_str)
        except (ValueError, SyntaxError):
            # Not a Python literal (e.g. contains JSON ``null``): try JSON instead.
            info = json.loads(structured_info_str)

        # ``or []`` covers a "side_effects" key that is present but set to None.
        for se in info.get("side_effects") or []:
            if not isinstance(se, dict):
                continue
            raw_name = se.get("name")
            if not isinstance(raw_name, str):
                # A None/non-string name must not discard the replacements of the whole record.
                continue
            name = raw_name.strip()
            if name in replacements:
                se["name"] = replacements[name]
        return json.dumps(info)
    except Exception as e:
        print(f"[update_structured_info] Error: {e}")
        return structured_info_str

In [None]:
# Main processor
def process_row(row, model, ae_texts, ae_embeddings, threshold: float = 0.4) -> str:
    """Map the side effects reported in one review row to standardized AE names.

    For each distinct side-effect name in ``row["structured_info"]``, embedding candidates
    are collected with ``find_top_matches`` and one of them is picked by
    ``gpt_resolve_match``. Names that get a match are replaced by the lower-cased AE.

    Args:
        row: Mapping-like row with a ``"structured_info"`` entry.
        model: Embedding model forwarded to ``find_top_matches``.
        ae_texts: AE names forwarded to ``find_top_matches``.
        ae_embeddings: AE embeddings forwarded to ``find_top_matches``.
        threshold: Similarity cut-off for candidate retrieval. It is passed explicitly, so
            this function's default of 0.4 applies rather than ``find_top_matches``'s own.

    Returns:
        The value returned by ``update_structured_info`` for the row.
    """
    original_info = row["structured_info"]
    effects = extract_side_effects(original_info)
    replacements = {}

    # De-duplicate while keeping order: a name repeated within a review would otherwise
    # cost a second embedding search and a second GPT request for the same dictionary key.
    for effect in dict.fromkeys(effects):
        candidates = find_top_matches(effect, model, ae_texts, ae_embeddings, threshold)
        best_match = gpt_resolve_match(effect, candidates)
        if best_match:
            replacements[effect] = best_match.lower()

    return update_structured_info(original_info, replacements)

In [None]:
# Register tqdm's pandas integration so progress_apply shows a progress bar.
tqdm.pandas(desc="Processing rows")

# Run the matcher on every review row (axis=1 passes one row at a time), then store
# the rewritten records in a new column.
matched_info = review_df.progress_apply(
    lambda review_row: process_row(review_row, model, ae_texts, ae_embeddings),
    axis=1,
)
review_df["new_structured_info"] = matched_info

# Persist the augmented table without the index column.
review_df.to_csv("WebMD/combined_extracted_reviews_with_matched_AEs.csv", index=False)

Processing rows: 100%|████████████████████████| 617/617 [12:15<00:00,  1.19s/it]


In [None]:
# Display the rewritten records to spot-check the matching output.
review_df["new_structured_info"]

0      {"drug": {"name": "Mounjaro", "dosage": null, ...
1      {"drug": {"name": "Mounjaro", "dosage": "10", ...
2      {"drug": {"name": "Mounjaro", "dosage": null, ...
3      {"drug": {"name": "Mounjaro", "dosage": "10mg"...
4      {"drug": {"name": "Mounjaro", "dosage": "2.5mg...
                             ...                        
612    {"drug": {"name": "Mounjaro", "dosage": "2.5 m...
613    {"drug": {"name": ["Victoza", "Mounjaro"], "do...
614    {"drug": {"name": "Mounjaro", "dosage": null, ...
615    {"drug": {"name": "Mounjaro", "dosage": "5mg",...
616    {"drug": {"name": ["Mounjaro", "Ozempic"], "do...
Name: new_structured_info, Length: 617, dtype: object

In [None]:
# Write the table, including the new_structured_info column, to CSV without the index.
# NOTE(review): this repeats the save at the end of the processing cell (same path, same
# arguments) — likely redundant; confirm before removing.
review_df.to_csv("WebMD/combined_extracted_reviews_with_matched_AEs.csv", index=False)