In [85]:
# Import Dependencies
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
import csv
import pandas as pd
import numpy as np
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

Link to dataset: https://impact.dbmi.columbia.edu/~friedma/Projects/DiseaseSymptomKB/index.html

In [125]:
# reading the raw dataset obtained from Disease-Symptom Knowledge database from Columbia
raw_df = pd.read_csv("columbia_disease_symptom.csv", encoding='latin-1')

In [126]:
# checking the column names
print(raw_df.columns)
print(raw_df["Symptom"].unique())
len(raw_df["Symptom"].unique())

Index(['Disease', 'Count of Disease Occurrence', 'Symptom'], dtype='object')
['UMLS:C0008031_pain chest' 'UMLS:C0392680_shortness of breath'
 'UMLS:C0012833_dizziness' 'UMLS:C0004093_asthenia' 'UMLS:C0085639_fall'
 'UMLS:C0039070_syncope' 'UMLS:C0042571_vertigo'
 'UMLS:C0038990_sweat^UMLS:C0700590_sweating increased'
 'UMLS:C0030252_palpitation' 'UMLS:C0027497_nausea'
 'UMLS:C0002962_angina pectoris' 'UMLS:C0438716_pressure chest'
 'UMLS:C0032617_polyuria' 'UMLS:C0085602_polydypsia'
 'UMLS:C0085619_orthopnea' 'UMLS:C0034642_rale'
 'UMLS:C0241526_unresponsiveness' 'UMLS:C0856054_mental status changes'
 'UMLS:C0042963_vomiting' 'UMLS:C0553668_labored breathing'
 'UMLS:C0424000_feeling suicidal' 'UMLS:C0438696_suicidal'
 'UMLS:C0233762_hallucinations auditory' 'UMLS:C0150041_feeling hopeless'
 'UMLS:C0424109_weepiness' 'UMLS:C0917801_sleeplessness'
 'UMLS:C0424230_motor retardation' 'UMLS:C0022107_irritable mood'
 'UMLS:C0312422_blackout' 'UMLS:C0344315_mood depressed'
 'UMLS:C0233763_hal

402

Pre-Processing

In [127]:
# filling all the NaN s with the values above
raw_df = raw_df.fillna(method='ffill')

# renaming the columns
raw_df.rename(columns={"Count of Disease Occurrence": "disease_occurrence_count", "Disease":"disease", "Symptom":"symptom"}, inplace=True)

In [128]:
# defining a function "split_column_values" to split 'Symptom' column and creating 'symptom_code' and 'Symptom' columns
def split_column_values(x_column):
    codes_and_values = x_column.split('^')
    values = [x.split('_')[-1] for x in codes_and_values]
    codes = [x.split('_')[0] for x in codes_and_values]
    value = '^'.join(values)
    code = '^'.join(codes)
    return value, code

In [129]:
# applying the function to the column
raw_df[['disease', 'disease_code']] = raw_df['disease'].apply(split_column_values).apply(pd.Series)

# replacing the spaces with underscores in all values in the disease column
raw_df['disease'] = raw_df['disease'].str.replace(' ', '_')

In [130]:
# defining a function "split_symptom" to split 'Symptom' column and creating 'symptom_code' and 'Symptom' columns
def split_symptom(symptom):
    codes_and_symptoms = symptom.split('^')
    symptom_parts = [x.split('_')[-1] for x in codes_and_symptoms]
    codes = [x.split('_')[0] for x in codes_and_symptoms]
    symptom = '^'.join(symptom_parts)
    symptom_code = '^'.join(codes)
    return symptom, symptom_code

# applying the function to the column
raw_df[['symptom', 'symptom_code']] = raw_df['symptom'].apply(split_symptom).apply(pd.Series)

# Replace spaces with underscores in all values in the 'Symptom' column
raw_df['symptom'] = raw_df['symptom'].str.replace(' ', '_')


In [131]:
raw_df.head()

Unnamed: 0,disease,disease_occurrence_count,symptom,disease_code,symptom_code
0,hypertensive_disease,3363.0,pain_chest,UMLS:C0020538,UMLS:C0008031
1,hypertensive_disease,3363.0,shortness_of_breath,UMLS:C0020538,UMLS:C0392680
2,hypertensive_disease,3363.0,dizziness,UMLS:C0020538,UMLS:C0012833
3,hypertensive_disease,3363.0,asthenia,UMLS:C0020538,UMLS:C0004093
4,hypertensive_disease,3363.0,fall,UMLS:C0020538,UMLS:C0085639


In [132]:
print(raw_df["symptom"].unique())
# some of the symptoms have string "\xa0" and "\xc2" in between them, we need to replace them with "_"

raw_df["symptom"] = raw_df["symptom"].str.replace("\xa0|\xc2", "_", regex=True)

['pain_chest' 'shortness_of_breath' 'dizziness' 'asthenia' 'fall'
 'syncope' 'vertigo' 'sweat^sweating_increased' 'palpitation' 'nausea'
 'angina_pectoris' 'pressure_chest' 'polyuria' 'polydypsia' 'orthopnea'
 'rale' 'unresponsiveness' 'mental_status_changes' 'vomiting'
 'labored_breathing' 'feeling_suicidal' 'suicidal'
 'hallucinations_auditory' 'feeling_hopeless' 'weepiness' 'sleeplessness'
 'motor_retardation' 'irritable_mood' 'blackout' 'mood_depressed'
 'hallucinations_visual' 'worry' 'agitation' 'tremor' 'intoxication'
 'verbal_auditory_hallucinations' 'energy_increased' 'difficulty'
 'nightmare' 'unable_to_concentrate' 'homelessness' 'hypokinesia'
 'dyspnea_on_exertion' 'chest_tightness' 'cough' 'fever'
 'decreased_translucency' 'productive_cough' 'pleuritic_pain'
 'yellow_sputum' 'breath_sounds_decreased' 'chill' 'rhonchus'
 'green_sputum' 'non-productive_cough' 'wheezing' 'haemoptysis'
 'distress_respiratory' 'tachypnea' 'malaise' 'night_sweat'
 'jugular_venous_distention' 'dy

In [133]:
# dataframe information
raw_df.info()
print("The dataframe consists a total of",raw_df["disease"].nunique(), "diseases")
print( "The dataframe has a total of", raw_df["symptom"].nunique(), "symptoms")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1867 entries, 0 to 1866
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   disease                   1867 non-null   object 
 1   disease_occurrence_count  1867 non-null   float64
 2   symptom                   1867 non-null   object 
 3   disease_code              1867 non-null   object 
 4   symptom_code              1867 non-null   object 
dtypes: float64(1), object(4)
memory usage: 73.1+ KB
The dataframe consists a total of 134 diseases
The dataframe has a total of 401 symptoms


In [134]:
# creating a new dataframe with once disease and symptoms 
processed_df = raw_df[["disease","symptom"]]
processed_df.head(20)

Unnamed: 0,disease,symptom
0,hypertensive_disease,pain_chest
1,hypertensive_disease,shortness_of_breath
2,hypertensive_disease,dizziness
3,hypertensive_disease,asthenia
4,hypertensive_disease,fall
5,hypertensive_disease,syncope
6,hypertensive_disease,vertigo
7,hypertensive_disease,sweat^sweating_increased
8,hypertensive_disease,palpitation
9,hypertensive_disease,nausea


In [143]:
# one hot encoding the symptoms column
onehot_encoded = pd.get_dummies(processed_df['symptom'], prefix='', prefix_sep='').drop('', axis=1)

print(onehot_encoded)

      Heberden's_node  Murphy's_sign  Stahli's_line  abdomen_acute  \
0                   0              0              0              0   
1                   0              0              0              0   
2                   0              0              0              0   
3                   0              0              0              0   
4                   0              0              0              0   
...               ...            ...            ...            ...   
1862                0              0              0              0   
1863                0              0              0              0   
1864                0              0              0              0   
1865                0              0              0              0   
1866                0              0              0              0   

      abdominal_bloating  abdominal_tenderness  abnormal_sensation  \
0                      0                     0                   0   
1                  

In [144]:
# Concatenate the one-hot encoded columns with the original DataFrame
processed_df_encoded = pd.concat([processed_df['disease'], onehot_encoded], axis=1)

In [145]:
# Group by 'disease' and sum the one-hot encoded values
final_df = processed_df_encoded.groupby('disease').sum()

In [147]:
# Step 4: Reset the index to make 'disease' a regular column
final_df.reset_index(inplace=True)

# Step 5: Display the final DataFrame
print("Final Encoded DataFrame:")
final_df

Final Encoded DataFrame:


Unnamed: 0,disease,Heberden's_node,Murphy's_sign,Stahli's_line,abdomen_acute,abdominal_bloating,abdominal_tenderness,abnormal_sensation,abnormally_hard_consistency,abortion,...,vision_blurred,vomiting,weepiness,weight_gain,welt,wheelchair_bound,wheezing,withdraw,worry,yellow_sputum
0,Alzheimer's_disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,Pneumocystis carinii pneumonia,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,accident cerebrovascular,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,acquired immuno-deficiency_syndrome^HIV^hiv_in...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,adenocarcinoma,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129,tonic-clonic_epilepsy^tonic-clonic_seizures,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
130,transient_ischemic_attack,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
131,tricuspid_valve_insufficiency,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
132,ulcer_peptic,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [153]:
alzheimer_row = final_df[final_df["disease"] == "Alzheimer's_disease"]
alzheimer_symptoms = alzheimer_row.loc[:, (alzheimer_row == 1).any()]

print(alzheimer_symptoms)


   agitation  bedridden^bedridden  consciousness_clear  cough  drool  \
0          1                    1                    1      1      1   

   facial_paresis  fever  frail  groggy  hyperkalemia  muscle_twitch  \
0               1      1      1       1             1              1   

   nightmare  pin-point_pupils  rhonchus  tremor  tremor_resting  \
0          1                 1         1       1               1   

   wheelchair_bound  
0                 1  


In [155]:
processed_df[processed_df["disease"] == "Alzheimer's_disease"]

Unnamed: 0,disease,symptom
1277,Alzheimer's_disease,drool
1278,Alzheimer's_disease,agitation
1279,Alzheimer's_disease,nightmare
1280,Alzheimer's_disease,rhonchus
1281,Alzheimer's_disease,consciousness_clear
1282,Alzheimer's_disease,pin-point_pupils
1283,Alzheimer's_disease,bedridden^bedridden
1284,Alzheimer's_disease,frail
1285,Alzheimer's_disease,tremor_resting
1286,Alzheimer's_disease,hyperkalemia
