In [None]:
import re
import ast
import json
import torch
from tqdm import tqdm
import pandas as pd
import numpy as np
from openai import OpenAI
from scipy.spatial.distance import cosine
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer, util
from sentence_transformers.util import cos_sim

## Create an adverse event (AE) mapping list with embeddings

In [None]:
# Cleaned FAERS adverse-event workbooks, one per file, in the order they are concatenated.
faers_paths = [
    'FAERS/Cleaned_Tirzepatide_AEs.xlsx',
    'FAERS/Cleaned_Semaglutide_AEs.xlsx',
    'FAERS/Cleaned_Ozempic_AEs.xlsx',
    'FAERS/Cleaned_Rybelsus_AEs.xlsx',
    'FAERS/Cleaned_Wegovy_AEs.xlsx',
    'FAERS/Cleaned_Zepbound_AEs.xlsx',
    'FAERS/Cleaned_Mounjaro_AEs.xlsx',
    'FAERS/Cleaned_Liraglutide_AEs.xlsx',
    'FAERS/Cleaned_Saxenda_AEs.xlsx',
    'FAERS/Cleaned_Victoza_AEs.xlsx',
]

# Read every workbook; the individual frames stay available under their own names.
(
    tirz_df, sema_df, ozem_df, rybe_df, wego_df,
    zepb_df, moun_df, lira_df, saxe_df, vict_df,
) = map(pd.read_excel, faers_paths)

# Stack all frames, then keep one row per (AE, Reaction Group) pair.
merged_df = pd.concat(
    [tirz_df, sema_df, ozem_df, rybe_df, wego_df, zepb_df, moun_df, lira_df, saxe_df, vict_df],
    ignore_index=True,
).drop_duplicates(subset=['AE', 'Reaction Group'])

# Save the combined table next to its inputs.
merged_df.to_excel('FAERS/Merged_AEs.xlsx', index=False)

In [None]:
# Show the (rows, columns) size of the de-duplicated AE table.
merged_df.shape

(5744, 2)

In [None]:
# Sentence-embedding model; the same instance is reused later to encode the review side effects.
model = SentenceTransformer("pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb")

# Non-null AE terms as whitespace-trimmed strings, in table order.
ae_texts = [str(term).strip() for term in merged_df['AE'].dropna()]

In [None]:
# Encode every AE term in one call so the model runs batched forward passes
# instead of one pass per term. `normalize_embeddings=True` is kept so the
# vectors are normalized for cosine similarity, as before.
# The result is converted back to a list of 1-D arrays, matching what the
# per-term loop used to build (values may differ at floating-point rounding level).
embeddings = list(
    model.encode(
        ae_texts,
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=True,  # replaces the manual tqdm wrapper
    )
)

Embedding AE terms: 100%|███████████████████| 5744/5744 [02:56<00:00, 32.45it/s]


In [None]:
# Rows with usable AE text: not null and not blank once trimmed.
ae_column = merged_df['AE']
has_ae_text = ae_column.notna() & ae_column.str.strip().astype(bool)
merged_df_clean = merged_df[has_ae_text].reset_index(drop=True)

if len(embeddings) == 0:
    raise ValueError("No AE embeddings were computed; nothing to attach to the AE table.")

# The concat below pairs table rows and vectors purely by position. If the two
# were filtered differently (e.g. a blank AE was embedded but dropped here),
# pandas would pad with NaN and shift every later vector onto the wrong term,
# so stop instead of writing a silently misaligned table.
if len(embeddings) != len(merged_df_clean):
    raise ValueError(
        f"{len(embeddings)} embeddings but {len(merged_df_clean)} cleaned AE rows; "
        "the embedded texts and the cleaned table were filtered differently, "
        "so rows and vectors would be misaligned."
    )

# One column per embedding dimension: emb_0 ... emb_{dim-1}.
embedding_df = pd.DataFrame(embeddings, columns=[f'emb_{i}' for i in range(embeddings[0].shape[0])])
merged_df_embedded = pd.concat([merged_df_clean, embedding_df], axis=1)

In [None]:
# Display the AE table with its emb_* embedding columns appended.
merged_df_embedded

Unnamed: 0,AE,Reaction Group,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,...,emb_758,emb_759,emb_760,emb_761,emb_762,emb_763,emb_764,emb_765,emb_766,emb_767
0,Incorrect Dose Administered,"Injury, Poisoning And Procedural Complications",0.046186,0.024490,-0.034293,-0.059262,0.045288,-0.036910,-0.005737,0.018988,...,-0.031021,0.000022,0.012232,0.033662,-0.009992,0.062149,0.033643,-0.056486,0.074496,0.032124
1,Off Label Use,"Injury, Poisoning And Procedural Complications",-0.005615,0.008708,0.001345,-0.042422,0.023932,-0.014669,-0.031821,0.028706,...,-0.068366,0.029713,-0.004785,-0.034710,-0.001396,-0.013581,-0.018514,-0.034237,0.029932,0.015402
2,Extra Dose Administered,"Injury, Poisoning And Procedural Complications",0.030635,0.098102,-0.016347,-0.042228,0.025645,-0.025870,-0.003545,-0.040589,...,-0.033688,-0.019320,0.017733,0.003615,-0.010279,0.039942,0.016930,-0.029881,0.027895,0.034040
3,Accidental Underdose,"Injury, Poisoning And Procedural Complications",-0.029426,0.039857,-0.067074,0.005482,0.033095,-0.044396,0.020642,0.041145,...,-0.004650,-0.064106,-0.034380,-0.004744,-0.047589,0.047966,0.014657,-0.012151,0.041776,0.029654
4,Product Dose Omission Issue,"Injury, Poisoning And Procedural Complications",0.018886,0.040606,-0.000082,0.003330,0.014169,-0.004107,-0.048625,0.010183,...,-0.071543,-0.005572,-0.021472,0.001818,0.019457,0.022574,-0.001305,-0.070922,0.030041,-0.006214
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5739,Sirenomelia,"Congenital, Familial And Genetic Disorders",-0.031486,0.002704,-0.042443,0.003822,-0.015004,0.006105,-0.026098,-0.026919,...,0.013012,-0.008273,0.034310,-0.051014,0.014211,-0.037771,0.030668,0.040231,0.012727,0.021520
5740,Cystinuria,"Congenital, Familial And Genetic Disorders",0.008056,0.042917,-0.005139,-0.006972,0.019351,-0.039403,0.043138,0.013969,...,-0.019727,-0.008444,-0.009018,0.049898,-0.093023,0.008214,-0.002954,0.058993,0.026646,-0.022825
5741,Hemihypertrophy,"Congenital, Familial And Genetic Disorders",-0.075614,0.091777,-0.037417,0.038240,-0.033098,-0.025338,0.018138,-0.046948,...,-0.032225,0.009252,0.027699,0.051222,0.043186,0.002460,-0.059668,0.019453,-0.004990,0.031824
5742,Primary Immunodeficiency Syndrome,"Congenital, Familial And Genetic Disorders",-0.011655,0.019530,-0.052403,-0.030324,-0.004203,0.037987,-0.040596,-0.016858,...,-0.028685,-0.011615,0.045531,0.058467,-0.002097,0.042315,0.020588,0.069731,0.020659,0.007519


In [None]:
# Write the AE terms together with their embedding columns to CSV (row index omitted).
merged_df_embedded.to_csv(
    path_or_buf="FAERS/Merged_AEs_with_SentenceBioBERT.csv",
    index=False,
)

## Match side effects with AEs

In [None]:
# AE terms with their saved embedding columns, reloaded from the CSV written earlier.
merged_df_embedded = pd.read_csv(filepath_or_buffer="FAERS/Merged_AEs_with_SentenceBioBERT.csv")

# Review table; its 'structured_info' column is parsed in the following cell.
review_df = pd.read_csv(filepath_or_buffer="WebMD/combined_extracted_reviews.csv")

In [None]:
# Parse the stringified 'structured_info' literals. Values that are not strings
# (already-parsed dicts, missing cells) pass through unchanged, so this cell can
# be re-run without literal_eval failing on its own earlier output.
review_df['structured_info'] = review_df['structured_info'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

# Collect the 'name' of every entry under 'side_effects' across all reviews.
side_effects = []
for row in review_df['structured_info']:
    if not isinstance(row, dict):
        continue  # missing or non-dict cell: nothing to extract
    # `or []` also covers an explicit None stored under 'side_effects'.
    for item in row.get('side_effects') or []:
        name = item.get('name') if isinstance(item, dict) else None
        if isinstance(name, str):
            name = name.strip()
            if name:  # skip names that are empty once trimmed
                side_effects.append(name)

# Sorted so the order (and any downstream `[:N]` preview) is the same on every run;
# plain set iteration order for strings varies between interpreter sessions.
unique_side_effects = sorted(set(side_effects))

In [None]:
# Inspect the de-duplicated side-effect names collected from the reviews.
# NOTE(review): this renders the entire list; a slice (e.g. [:20]) would keep the output short.
unique_side_effects

['mild stomach upset',
 'nerve pain',
 'Sudden drop in glucose',
 'vomitting',
 'specific food cravings',
 'feeling full',
 'general uneasiness',
 'muscle and joint pain',
 'trouble controlling tantrums',
 'injection site soreness',
 'appetite loss',
 'itching',
 'itchy welt at injection site',
 'severe constipation',
 'skin sensitivity',
 'tummy pains',
 'extreme constipation',
 'difficulty swallowing',
 'heart burn',
 'allergic reaction',
 'diarrhea or constipation',
 'tired',
 'itchy swelling red at injection site',
 'no appetite',
 'heat burn',
 'violent vomiting',
 'Stomach upset',
 'extreme vaginal bleeding',
 'less energy',
 'Feeling bad',
 'cold sweats',
 'joint and muscle pains',
 'dry heaving',
 'nausia',
 'nausea',
 'Reduced appetite',
 'nauseous feeling',
 'little hunger',
 'weakness',
 'projectile vomiting',
 'slight hunger',
 'burping (sulfer smell)',
 'stomach craps',
 'small bumps on the body',
 'muscle aches',
 'Muscle aches/pain',
 'Nerve Pain',
 'Vomiting',
 'lower b

In [None]:
# Find the embedding columns by name instead of assuming a fixed width, and
# order them by their numeric suffix so column i holds dimension i.
emb_cols = sorted(
    (col for col in merged_df_embedded.columns if re.fullmatch(r'emb_\d+', str(col))),
    key=lambda col: int(str(col)[len('emb_'):]),
)
if not emb_cols:
    raise ValueError("merged_df_embedded has no 'emb_<i>' columns to use as embeddings.")

# Drop incomplete rows from ONE frame and derive both outputs from it, so that
# ae_texts[i] always names the term whose vector is ae_embeddings[i].
# (Dropping NaNs from the matrix alone would shift it relative to the text list.)
ae_complete = merged_df_embedded.dropna(subset=['AE', *emb_cols])
ae_texts = ae_complete['AE'].astype(str).tolist()
ae_embeddings = ae_complete[emb_cols].to_numpy()

In [None]:
# Minimum similarity score an AE term needs in order to be kept as a candidate.
threshold = 0
results = []

for effect in unique_side_effects:
    entry = {'Extracted Side Effect': effect}
    try:
        effect_vec = model.encode(effect, convert_to_numpy=True, normalize_embeddings=True)

        # One score per AE row: dot product of each stored vector with the query vector.
        similarities = ae_embeddings.dot(effect_vec)

        # Walk the AE rows from highest to lowest score, keeping those at or above the threshold.
        candidates = []
        for idx in np.argsort(similarities)[::-1]:
            score = similarities[idx]
            if score >= threshold:
                candidates.append((ae_texts[idx], score))

        # Keep at most the ten best candidates.
        entry['Top Matches'] = candidates[:10]
    except Exception as e:
        # Record the failure for this side effect and carry on with the rest.
        entry['Top Matches'] = f"Error: {e}"
    results.append(entry)

In [None]:
# Preview the matches for the first 20 extracted side effects.
for res in results[:20]:
    print(f"\nOriginal: {res['Extracted Side Effect']}")
    matches = res['Top Matches']
    if isinstance(matches, str):
        # The matching cell stores an "Error: ..." string here when it failed for this entry.
        print(f"  {matches}")
    elif not matches:
        # Report the threshold that was actually applied (the filter is `>= threshold`),
        # rather than a hard-coded value that can drift from it.
        print(f"  No match with similarity >= {threshold}")
    else:
        for match, score in matches:
            print(f"  Match: {match} (Similarity: {score:.4f})")


Original: mild stomach upset
  Match: Epigastric Discomfort (Similarity: 0.6286)
  Match: Dyspepsia (Similarity: 0.5667)
  Match: Gastrointestinal Tract Irritation (Similarity: 0.5583)
  Match: Stomach Mass (Similarity: 0.5531)
  Match: Gastric Hypomotility (Similarity: 0.5405)
  Match: Gastric Mucosa Erythema (Similarity: 0.5290)
  Match: Obstruction Gastric (Similarity: 0.5248)
  Match: Gastrointestinal Pain (Similarity: 0.5201)
  Match: Gastric Hypermotility (Similarity: 0.5199)
  Match: Gastrointestinal Hypomotility (Similarity: 0.5168)

Original: nerve pain
  Match: Nerve Compression (Similarity: 0.8377)
  Match: Nerve Injury (Similarity: 0.7590)
  Match: Peripheral Sensory Neuropathy (Similarity: 0.7265)
  Match: Neuropathy Peripheral (Similarity: 0.7166)
  Match: Spinal Pain (Similarity: 0.7144)
  Match: Neuralgia (Similarity: 0.7005)
  Match: Neck Pain (Similarity: 0.6906)
  Match: Pain (Similarity: 0.6873)
  Match: Sciatic Nerve Neuropathy (Similarity: 0.6770)
  Match: Radicu