In [7]:
import csv

import pandas as pd
from snomedGraphTool.graph import SNOMEDGraphTool
from snomedGraphTool.scorers import agg_difference

In [8]:
# Read graph files (SNOMED CT RF2 release tables, tab-separated text)
relationships_file = 'data/sct2_Relationship_Full_INT_20240201.txt'
descriptions_file = 'data/sct2_Description_Full-en_INT_20240201.txt'

# RF2 files use no quoting or escaping, so a double quote inside a term is
# literal text. pandas' default quote handling would treat a field that starts
# with '"' as a quoted field and swallow the tabs/newlines that follow it, so
# quote processing is switched off explicitly.
relationships = pd.read_csv(relationships_file, delimiter='\t', quoting=csv.QUOTE_NONE)
descriptions = pd.read_csv(descriptions_file, delimiter='\t', quoting=csv.QUOTE_NONE)

# conceptId -> term lookup. dict() keeps the last value seen for a repeated
# key, so a concept with several description rows maps to the term of its
# last row in file order.
# NOTE(review): the file name says "Full", which presumably includes inactive
# and synonym rows - confirm whether filtering is wanted before relying on this.
concept_dict = dict(zip(descriptions['conceptId'], descriptions['term']))

# Read patient data
all_events = pd.read_csv('data/events.csv')

In [9]:
# Cohort: every patient with at least one "covid detected" event
covid_detected = 1240581000000104

# Class label: whether the patient has a death event
death = 419099009

# Patients who had covid, and all of their events
had_covid = all_events['snomedCode'].eq(covid_detected)
covid_patients = all_events.loc[had_covid, 'patient_id'].unique()
in_cohort = all_events['patient_id'].isin(covid_patients)
X = all_events.loc[in_cohort]

# Patients in the cohort who died
is_death_event = X['snomedCode'].eq(death)
patients_died = X.loc[is_death_event, 'patient_id'].unique()

# One row per cohort patient; label is 1 if the patient died, 0 otherwise
y = pd.DataFrame(covid_patients, columns=['patient_id'])
y['label'] = y['patient_id'].isin(patients_died).astype(int)
y_dict = dict(zip(y['patient_id'], y['label']))

# Drop the death events from X so the label itself is not among the events
X = X.loc[~is_death_event].reset_index(drop=True)

In [10]:
# Construct model from the SNOMED tables, the cohort events and the labels
g = SNOMEDGraphTool(relationships, descriptions, X, y_dict)

# Define scorer: agg_difference with the cohort's per-label patient counts bound in
label_totals = dict(y['label'].value_counts())


def scorer(g, node):
    """Score ``node`` of graph ``g`` with ``agg_difference`` and the cohort label totals."""
    return agg_difference(g, node, label_totals)


# Identify eligible nodes to be used to form a predictive model
# NOTE(review): the meaning of rarity_threshold / min_depth / weight is defined
# by SNOMEDGraphTool.get_eligible_nodes - confirm against its documentation.
n_patients = X['patient_id'].nunique()
eligble_nodes = g.get_eligible_nodes(
    scorer,
    n_patients,
    rarity_threshold=0.05,
    min_depth=0.5,
    weight=0.2,
)

print('\nTop 10 features:')
for entry in eligble_nodes[:10]:
    # Each entry is indexed as (node id, attribute mapping)
    node_id, attrs = entry[0], entry[1]
    print(f"{node_id} - {attrs['label']} - {attrs['weighted_score']}")

Building Graph: 100%|██████████| 1179020/1179020 [01:22<00:00, 14368.31it/s]
Assigning Attributes: 100%|██████████| 8198/8198 [02:38<00:00, 51.62it/s]
Updating Nodes: 100%|██████████| 8198/8198 [00:50<00:00, 162.95it/s]
Mapping Nodes: 100%|██████████| 8198/8198 [00:21<00:00, 388.79it/s]
Scoring Nodes: 100%|██████████| 8198/8198 [00:00<00:00, 278476.65it/s]
Weighting Node Scores: 100%|██████████| 8198/8198 [00:00<00:00, 417196.33it/s]



Top 10 features:
882784691000119100 - Pneumonia caused by severe acute respiratory syndrome coronavirus 2 (disorder) - 0.936779616745423
238131007 - Overweight (finding) - 0.430314988559284
73211009 - Diabetes mellitus (disorder) - 0.3588325397803898
38341003 - Hypertensive disorder, systemic arterial - 0.24094321115214526
5476005 - Adiposity (disorder) - 0.19735184616523635
248842004 - Female genitalia observations - 0.16320999618217288
363104002 - Hereditary disorder of endocrine system - 0.1207727486838512
398302004 - Facial dysmorphism - 0.11106984392901625
400038003 - Multiple congenital anomalies - 0.1061853379727229
116022009 - Multiple congenital malformations (disorder) - 0.1061853379727229
