## Disease Prediction from Symptoms

Dataset Source: Raw data from [here](http://people.dbmi.columbia.edu/~friedma/Projects/DiseaseSymptomKB/index.html) 

In [1]:
# Import Dependencies
import csv
import pandas as pd
import numpy as np
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_excel('./dataset/raw_data.xlsx')

In [3]:
df.head()

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,,,UMLS:C0392680_shortness of breath
2,,,UMLS:C0012833_dizziness
3,,,UMLS:C0004093_asthenia
4,,,UMLS:C0085639_fall


In [4]:
data = df.fillna(method='ffill')

In [5]:
data.head()

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0392680_shortness of breath
2,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0012833_dizziness
3,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0004093_asthenia
4,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0085639_fall


In [6]:
list(data)

['Disease', 'Count of Disease Occurrence', 'Symptom']

In [7]:
# Process Disease and Symptom Names
def process_data(data):
    data_list = []
    data_name = data.replace('^','_').split('_')
    n = 1
    for names in data_name:
        if (n % 2 == 0):
            data_list.append(names)
        n += 1
    return data_list

In [8]:
disease_list = []
disease_symptom_dict = defaultdict(list)
disease_symptom_count = {}
count = 0

for idx, row in data.iterrows():
    
    # Get the Disease Names
    if (row['Disease'] !="\xc2\xa0") and (row['Disease'] != ""):
        disease = row['Disease']
        disease_list = process_data(data=disease)
        count = row['Count of Disease Occurrence']

    # Get the Symptoms Corresponding to Diseases
    if (row['Symptom'] !="\xc2\xa0") and (row['Symptom'] != ""):
        symptom = row['Symptom']
        symptom_list = process_data(data=symptom)
        for d in disease_list:
            for s in symptom_list:
                disease_symptom_dict[d].append(s)
            disease_symptom_count[d] = count

In [9]:
# See that the data is Processed Correctly
disease_symptom_dict

defaultdict(list,
            {'hypertensive disease': ['pain chest',
              'shortness of breath',
              'dizziness',
              'asthenia',
              'fall',
              'syncope',
              'vertigo',
              'sweat',
              'sweating increased',
              'palpitation',
              'nausea',
              'angina pectoris',
              'pressure chest'],
             'diabetes': ['polyuria',
              'polydypsia',
              'shortness of breath',
              'pain chest',
              'asthenia',
              'nausea',
              'orthopnea',
              'rale',
              'sweat',
              'sweating increased',
              'unresponsiveness',
              'mental status changes',
              'vertigo',
              'vomiting',
              'labored breathing'],
             'depression mental': ['feeling suicidal',
              'suicidal',
              'hallucinations auditory',
              'feel

In [10]:
# Count of Disease Occurence w.r.t each Disease
disease_symptom_count

{'hypertensive disease': 3363.0,
 'diabetes': 1421.0,
 'depression mental': 1337.0,
 'depressive disorder': 1337.0,
 'coronary arteriosclerosis': 1284.0,
 'coronary heart disease': 1284.0,
 'pneumonia': 1029.0,
 'failure heart congestive': 963.0,
 'accident\xa0cerebrovascular': 885.0,
 'asthma': 835.0,
 'myocardial infarction': 759.0,
 'hypercholesterolemia': 685.0,
 'infection': 630.0,
 'infection urinary tract': 597.0,
 'anemia': 544.0,
 'chronic obstructive airway disease': 524.0,
 'dementia': 504.0,
 'insufficiency renal': 445.0,
 'confusion': 408.0,
 'degenerative\xa0polyarthritis': 405.0,
 'hypothyroidism': 398.0,
 'anxiety state': 390.0,
 'malignant neoplasms': 354.0,
 'primary malignant neoplasm': 354.0,
 'acquired\xa0immuno-deficiency syndrome': 350.0,
 'HIV': 350.0,
 'hiv infections': 350.0,
 'cellulitis': 341.0,
 'gastroesophageal reflux disease': 325.0,
 'septicemia': 311.0,
 'systemic infection': 311.0,
 'sepsis (invertebrate)': 311.0,
 'deep vein thrombosis': 310.0,
 'deh

In [13]:
df1 = pd.DataFrame(list(disease_symptom_dict.items()), columns=['Disease','Symptom'])

In [14]:
df1.head()

Unnamed: 0,Disease,Symptom
0,hypertensive disease,"[pain chest, shortness of breath, dizziness, a..."
1,diabetes,"[polyuria, polydypsia, shortness of breath, pa..."
2,depression mental,"[feeling suicidal, suicidal, hallucinations au..."
3,depressive disorder,"[feeling suicidal, suicidal, hallucinations au..."
4,coronary arteriosclerosis,"[pain chest, angina pectoris, shortness of bre..."


In [18]:

for vals in disease_symptom_count.items():
    print(vals[1])

3363.0
1421.0
1337.0
1337.0
1284.0
1284.0
1029.0
963.0
885.0
835.0
759.0
685.0
630.0
597.0
544.0
524.0
504.0
445.0
408.0
405.0
398.0
390.0
354.0
354.0
350.0
350.0
350.0
341.0
325.0
311.0
311.0
311.0
310.0
297.0
297.0
294.0
290.0
283.0
280.0
269.0
269.0
268.0
267.0
247.0
241.0
228.0
226.0
218.0
208.0
192.0
186.0
186.0
179.0
172.0
171.0
169.0
168.0
166.0
165.0
165.0
165.0
165.0
164.0
163.0
163.0
161.0
160.0
158.0
152.0
152.0
147.0
145.0
144.0
143.0
142.0
140.0
142.0
140.0
140.0
138.0
135.0
133.0
128.0
126.0
124.0
123.0
122.0
119.0
114.0
114.0
114.0
113.0
111.0
108.0
105.0
104.0
103.0
101.0
101.0
99.0
99.0
99.0
96.0
96.0
95.0
94.0
94.0
94.0
93.0
92.0
92.0
90.0
90.0
87.0
87.0
86.0
86.0
85.0
84.0
82.0
80.0
80.0
76.0
76.0
76.0
74.0
71.0
71.0
71.0
70.0
68.0
68.0
68.0
68.0
68.0
67.0
67.0
66.0
61.0
61.0
61.0
61.0
61.0
56.0
56.0
57.0
56.0
45.0
42.0


In [16]:
df1.head()

Unnamed: 0,Disease,Symptom,Count of Disease Occurrence
0,hypertensive disease,"[pain chest, shortness of breath, dizziness, a...","(hypertensive disease, 3363.0)"
1,diabetes,"[polyuria, polydypsia, shortness of breath, pa...","(diabetes, 1421.0)"
2,depression mental,"[feeling suicidal, suicidal, hallucinations au...","(depression mental, 1337.0)"
3,depressive disorder,"[feeling suicidal, suicidal, hallucinations au...","(depressive disorder, 1337.0)"
4,coronary arteriosclerosis,"[pain chest, angina pectoris, shortness of bre...","(coronary arteriosclerosis, 1284.0)"
