In [1]:
import getpass
import os
import pandas as pd
from langchain_openai import OpenAIEmbeddings

In [2]:
# OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

# embeddings_model = OpenAIEmbeddings(model="text-embedding-3-large")

In [3]:
df = pd.read_csv("dataset/diseases_symptoms.tsv", sep="\t")

In [4]:
df.head()

Unnamed: 0,disease,symptoms
0,Postoperative Complications,"Abdomen, Abdominal Pain, Abnormal, Acquired, A..."
1,Brain Diseases,"Abdomen, Abdominal Pain, Abnormal, Acquired, A..."
2,Cerebral Infarction,"Abdominal Pain, Abnormal, Acquired, Acute Coro..."
3,Pain,"Abdomen, Abdominal Pain, Abnormal, Acute, Acut..."
4,Epilepsy,"Abdomen, Abdominal Pain, Abnormal, Acute, Agno..."


In [5]:
symptoms_each_disease = df["symptoms"].tolist()

In [6]:
import re

def clean_symptoms(symptoms_list):
  cleaned_symptoms = []
  for symptoms in symptoms_list:
      if isinstance(symptoms, list):
          symptoms = ','.join(symptoms)
      cleaned = re.sub(r',+', ',', symptoms).strip(', ').strip()
      if (len(cleaned) == 0):
        print("true")
      cleaned_symptoms.append(cleaned)

  return cleaned_symptoms

def seperate_symptoms(symptoms_each_disease):
  return [symptoms.split(', ') for symptoms in symptoms_each_disease]

cleaned_symptoms_each_disease = clean_symptoms(symptoms_each_disease)

seperated_symptoms_each_disease = seperate_symptoms(cleaned_symptoms_each_disease)


seperated_symptoms_each_disease[0]

['Abdomen',
 'Abdominal Pain',
 'Abnormal',
 'Acquired',
 'Acute',
 'Acute Coronary Syndrome',
 'Aerophagy',
 'Ageusia',
 'Agnosia',
 'Agraphia',
 'Akathisia',
 'Albuminuria',
 'Amblyopia',
 'Amnesia',
 'Angina',
 'Angina Pectoris',
 'Anisocoria',
 'Anomia',
 'Anorexia',
 'Anoxia',
 'Anterograde',
 'Anticipatory',
 'Aphasia',
 'Aphonia',
 'Apnea',
 'Apraxia',
 'Apraxias',
 'Arthralgia',
 'Articulation Disorders',
 'Asthenia',
 'Ataxia',
 'Athetosis',
 'Auditory Perceptual Disorders',
 'Babinski',
 'Back Pain',
 'Bilateral',
 'Birth Weight',
 'Blindness',
 'Body Weight',
 'Broca',
 'Brown-Sequard Syndrome',
 'Bulimia',
 'Cachexia',
 'Cardiac',
 'Cardiac Output',
 'Catatonia',
 'Central',
 'Cerebellar Ataxia',
 'Cerebrospinal Fluid Otorrhea',
 'Cerebrospinal Fluid Rhinorrhea',
 'Chest Pain',
 'Cheyne-Stokes Respiration',
 'Chills',
 'Chorea',
 'Colic',
 'Color Vision Defects',
 'Coma',
 'Communication Disorders',
 'Conduction',
 'Conductive',
 'Confusion',
 'Consciousness Disorders',
 'C

## Export Disease Symptoms to TSV


In [7]:
# Extract disease names from the dataframe
disease_names = df['disease'].tolist() if 'disease' in df.columns else ['Disease_' + str(i) for i in range(len(seperated_symptoms_each_disease))]

# Verify we have the same number of diseases and symptom lists
print(f"Number of diseases: {len(disease_names)}")
print(f"Number of symptom lists: {len(seperated_symptoms_each_disease)}")

Number of diseases: 4219
Number of symptom lists: 4219


In [8]:
# Create a new dataframe with disease names and their symptoms
export_data = []

for i, (disease, symptoms) in enumerate(zip(disease_names, seperated_symptoms_each_disease)):
    # Clean up the symptoms (remove any leading/trailing spaces)
    cleaned_symptoms = [symptom.strip() for symptom in symptoms]
    export_data.append({
        'disease': disease,
        'symptoms': cleaned_symptoms
    })

# Convert to dataframe
export_df = pd.DataFrame(export_data)
export_df.head()

Unnamed: 0,disease,symptoms
0,Postoperative Complications,"[Abdomen, Abdominal Pain, Abnormal, Acquired, ..."
1,Brain Diseases,"[Abdomen, Abdominal Pain, Abnormal, Acquired, ..."
2,Cerebral Infarction,"[Abdominal Pain, Abnormal, Acquired, Acute Cor..."
3,Pain,"[Abdomen, Abdominal Pain, Abnormal, Acute, Acu..."
4,Epilepsy,"[Abdomen, Abdominal Pain, Abnormal, Acute, Agn..."


In [9]:
# Function to convert symptom list to TSV-friendly string
def format_symptoms(symptoms_list):
    return '|'.join(symptoms_list)

# Apply formatting to symptoms column
export_df['symptoms_formatted'] = export_df['symptoms'].apply(format_symptoms)

# Create final export dataframe
final_export_df = pd.DataFrame({
    'disease': export_df['disease'],
    'symptoms': export_df['symptoms_formatted']
})

# Export to TSV file
output_path = 'dataset/diseases_symptoms_processed.tsv'
final_export_df.to_csv(output_path, sep='\t', index=False)

print(f"Data successfully exported to {output_path}")

Data successfully exported to dataset/diseases_symptoms_processed.tsv


In [10]:
# Verify the exported data
pd.read_csv(output_path, sep='\t').head()

Unnamed: 0,disease,symptoms
0,Postoperative Complications,Abdomen|Abdominal Pain|Abnormal|Acquired|Acute...
1,Brain Diseases,Abdomen|Abdominal Pain|Abnormal|Acquired|Acute...
2,Cerebral Infarction,Abdominal Pain|Abnormal|Acquired|Acute Coronar...
3,Pain,Abdomen|Abdominal Pain|Abnormal|Acute|Acute Co...
4,Epilepsy,Abdomen|Abdominal Pain|Abnormal|Acute|Agnosia|...
