In [1]:
import os
import re
import json 
import rdflib 
import numpy as np 
import pandas as pd 
from tqdm import tqdm
from collections import defaultdict
from rdflib import RDF, RDFS, OWL, XMLNS, XSD

In [2]:
# Namespaces used in this notebook: the project ontology, OWL, and RDFS.
ns = rdflib.Namespace("http://www.semanticweb.org/admin/ontologies/2021/10/untitled-ontology-8#")
ns2 = rdflib.Namespace("http://www.w3.org/2002/07/owl#")
ns3 = rdflib.Namespace("http://www.w3.org/2000/01/rdf-schema#")

# Load the knowledge graph; the .owl file is read with the Turtle parser.
g = rdflib.Graph()
g.parse("symptom_kg.owl", format="turtle")

# Every subject whose rdf:type is ns["Mental_Disease"].
diseases = [subject for subject in g.subjects(RDF.type, ns["Mental_Disease"])]

In [3]:
def get_uri_name(x):
    """Return the local name of a URI, i.e. the part after its namespace.

    Args:
        x: A URI given as a string (or a ``str`` subclass such as an rdflib
            ``URIRef``).

    Returns:
        str: For URIs inside the project ontology namespace, everything after
        the namespace prefix (an empty string if ``x`` is the bare namespace).
        For any other URI, the text after the last ``#``, or failing that
        after the last ``/``; if neither yields a non-empty name, the URI
        itself is returned unchanged.
    """
    prefix = "http://www.semanticweb.org/admin/ontologies/2021/10/untitled-ontology-8#"
    uri = str(x)
    if uri.startswith(prefix):
        return uri[len(prefix):]
    # Blindly slicing by len(prefix) would silently mangle a URI from another
    # namespace, so fall back to its fragment / last path segment instead.
    for separator in ("#", "/"):
        _, found, tail = uri.rpartition(separator)
        if found and tail:
            return tail
    return uri

In [4]:
# Disease labels; a label's position in this list is its integer id.
id2disease = [
    'adhd', 'anxiety', 'bipolar_disorder', 'depression',
    'eating_disorder', 'ocd', 'ptsd',
]
# Reverse lookup: label -> id.
disease2id = dict(zip(id2disease, range(len(id2disease))))

In [5]:
# Collect every subject typed as ns["Symptom"]; ids follow the sorted URI order.
symptoms = list(g.subjects(RDF.type, ns["Symptom"]))
symptoms.sort()
# Local names (namespace stripped) in the same order, plus the reverse lookup.
id2symptoms = list(map(get_uri_name, symptoms))
symptom2id = dict(zip(id2symptoms, range(len(id2symptoms))))
print(id2symptoms)

['Anger_Irritability', 'Anxious_Mood', 'Autonomic_symptoms', 'Cardiovascular_symptoms', 'Catatonic_behavior', 'Decreased_energy_tiredness_fatigue', 'Depressed_Mood', 'Gastrointestinal_symptoms', 'Genitourinary_symptoms', 'Hyperactivity_agitation', 'Impulsivity', 'Inattention', 'Indecisiveness', 'Respiratory_symptoms', 'Suicidal_ideas', 'Worthlessness_and_guilty', 'avoidance_of_stimuli', 'compensatory_behaviors_to_prevent_weight_gain', 'compulsions', 'diminished_emotional_expression', 'do_things_easily_get_painful_consequences', 'drastical_shift_in_mood_and_energy', 'fear_about_social_situations', 'fear_of_gaining_weight', 'fears_of_being_negatively_evaluated', 'flight_of_ideas', 'intrusion_symptoms', 'loss_of_interest_or_motivation', 'more_talktive', 'obsession', 'panic_fear', 'pessimism', 'poor_memory', 'sleep_disturbance', 'somatic_muscle', 'somatic_symptoms_others', 'somatic_symptoms_sensory', 'weight_and_appetite_change']


In [6]:
# Per-symptom description overrides: for certain symptoms the descriptions
# come from positive post samples rather than from the knowledge graph.
# Left empty here because the loader below is disabled.
desc_from_post = dict()
# for fname in os.listdir("./desc_from_post/"):
#     symp = fname[:-4]
#     desc_from_post[symp] = open("./desc_from_post/"+fname).read().split('\n')
# print(desc_from_post)

In [7]:
# Flatten all symptom descriptions into one list (id2desc) and remember, for
# each symptom id, the half-open [start, end) slice of id2desc that holds them.
id2desc = []
symp_id2desc_range = []
left = 0
for symp_id, symp in enumerate(id2symptoms):
    if symp not in desc_from_post:
        # Default source: the symptom's ns["Subsymptoms"] values in the graph,
        # sorted and then converted to plain strings.
        descs = [str(x) for x in sorted(g.objects(ns[symp], ns["Subsymptoms"]))]
    else:
        # Override: descriptions supplied through desc_from_post.
        print(symp)
        descs = desc_from_post[symp]
    id2desc += descs
    symp_id2desc_range.append([left, len(id2desc)])
    left = len(id2desc)
print(symp_id2desc_range)

[[0, 6], [6, 12], [12, 22], [22, 23], [23, 26], [26, 30], [30, 34], [34, 36], [36, 43], [43, 52], [52, 55], [55, 70], [70, 71], [71, 73], [73, 77], [77, 89], [89, 91], [91, 96], [96, 101], [101, 107], [107, 114], [114, 116], [116, 120], [120, 127], [127, 133], [133, 136], [136, 140], [140, 148], [148, 153], [153, 167], [167, 168], [168, 170], [170, 176], [176, 190], [190, 196], [196, 199], [199, 206], [206, 218]]


In [8]:
# Reverse lookup: description text -> index in id2desc.
# If the same text occurs more than once, the last index wins.
desc2id = dict(zip(id2desc, range(len(id2desc))))

In [9]:
# Spot check: show the name and the descriptions of one symptom.
symp_id = 2
print(id2symptoms[symp_id])
# symp_id2desc_range[symp_id] is a [start, end) pair, so it unpacks into a slice.
id2desc[slice(*symp_id2desc_range[symp_id])]

Autonomic_symptoms


['Feeling dizzy, unsteady, light-headed, or faint.',
 'accelerated heart rate',
 'dry mouth',
 'flushing',
 'giddiness',
 'headache',
 'pallor',
 'raising of hair',
 'shortness of breath',
 'tendency to sweat']

In [10]:
# For each symptom id, the ids of the diseases it is linked to through
# ns['IsSymptomOf'] in the knowledge graph.
symp_id2disease_ids = [[] for _ in range(len(id2symptoms))]
for symptom in symptoms:
    symp = get_uri_name(symptom)
    # Sort the names, as the symptom and description queries do, so the
    # exported lists do not depend on the order the graph yields triples in.
    # A dedicated name is used so the earlier `diseases` list is not clobbered.
    disease_names = sorted(
        get_uri_name(x).lower() for x in g.objects(symptom, ns['IsSymptomOf'])
    )
    print(symp, disease_names)
    # Raises KeyError if the graph links a disease that is absent from disease2id.
    symp_id2disease_ids[symptom2id[symp]] = [disease2id[d] for d in disease_names]
print(symp_id2disease_ids)

Anger_Irritability ['anxiety', 'bipolar_disorder', 'depression', 'eating_disorder', 'ptsd']
Anxious_Mood ['anxiety', 'ocd', 'ptsd']
Autonomic_symptoms ['anxiety', 'ptsd']
Cardiovascular_symptoms ['anxiety']
Catatonic_behavior ['ptsd']
Decreased_energy_tiredness_fatigue ['anxiety', 'bipolar_disorder', 'depression']
Depressed_Mood ['anxiety', 'bipolar_disorder', 'depression', 'eating_disorder', 'ocd']
Gastrointestinal_symptoms ['anxiety']
Genitourinary_symptoms ['anxiety', 'depression', 'eating_disorder']
Hyperactivity_agitation ['adhd', 'anxiety', 'bipolar_disorder', 'depression']
Impulsivity ['adhd']
Inattention ['adhd', 'anxiety', 'bipolar_disorder', 'depression', 'ptsd']
Indecisiveness ['depression']
Respiratory_symptoms ['anxiety']
Suicidal_ideas ['bipolar_disorder', 'depression']
Worthlessness_and_guilty ['bipolar_disorder', 'depression', 'eating_disorder', 'ptsd']
avoidance_of_stimuli ['ptsd']
compensatory_behaviors_to_prevent_weight_gain ['eating_disorder']
compulsions ['ocd']
di

In [11]:
# Load the subreddit -> disease mapping. The context manager closes the file
# handle, and the explicit encoding avoids depending on the platform locale.
with open("subreddit2disease.json", encoding="utf-8") as f:
    subreddit2disease = json.load(f)
len(subreddit2disease)

53

In [12]:
# Skip subreddits whose disease is filtered out ('autism', case-insensitive);
# the remaining subreddit names are sorted and their position is their id.
id2subreddit = sorted(
    subreddit
    for subreddit, disease in subreddit2disease.items()
    if disease.lower() != 'autism'
)
subreddit2id = dict(zip(id2subreddit, range(len(id2subreddit))))

In [13]:
# Bundle every lookup table built above into one JSON-serialisable payload.
# It is assembled before the file is opened, so a failure here cannot leave a
# truncated output file behind.
ret = {
    'id2disease': id2disease,
    'disease2id': disease2id,
    'symp2id': symptom2id,
    'id2symp': id2symptoms,
    'desc2id': desc2id,
    'id2desc': id2desc,
    'symp_id2desc_range': symp_id2desc_range,
    'symp_id2disease_ids': symp_id2disease_ids,
    'id2subreddit': id2subreddit,
    'subreddit2id': subreddit2id,
}
# ensure_ascii=False writes non-ASCII characters verbatim, so the file must be
# opened with an explicit UTF-8 encoding rather than the platform default.
with open("parsed_kg_info.json", "w", encoding="utf-8") as f:
    json.dump(ret, f, indent=4, ensure_ascii=False)