In [None]:
import csv
import os
from collections import defaultdict, deque

import numpy as np
import pandas as pd

In [None]:
# Show the current working directory; the SNOMED release folder is looked up relative to it when the files are loaded.
os.getcwd()

# SNOMED Files
https://www.nlm.nih.gov/healthit/snomedct/international.html <br>
This script uses the 2024-03-01 version.



In [None]:
### sct2_Description_Snapshot-en_INT_20240301.txt: it contains the SNOMED ID and the text description.
# Example: 85828009 Autoimmune disease (disorder)
delimiter = '\t'

file_path = os.getcwd() + '/SnomedCT_InternationalRF2_PRODUCTION_20240301T120000Z/Snapshot/Terminology/'

# RF2 release files are plain tab-separated text without quoting or escaping, so a
# double quote inside a term is an ordinary character. With pandas' default quote
# handling such a term can swallow the following delimiters/newlines and silently
# merge or drop rows; csv.QUOTE_NONE turns quote processing off.
file = os.path.join(file_path, 'sct2_Description_Snapshot-en_INT_20240301.txt')
df_description = pd.read_csv(file, delimiter=delimiter, quoting=csv.QUOTE_NONE, on_bad_lines="warn")
print(df_description.shape)

### sct2_Relationship_Snapshot_INT_20240301.txt: it contains the relationships between SNOMED IDs.
# Read with the same parser settings as the description file for consistency.
file = os.path.join(file_path, 'sct2_Relationship_Snapshot_INT_20240301.txt')
df_relationship = pd.read_csv(file, delimiter=delimiter, quoting=csv.QUOTE_NONE, on_bad_lines="warn")
print(df_relationship.shape)


# Define IDs to be excluded/marked as excluded

In [None]:
### if your project requires you to exclude certain IDs, we will first find all excluded IDs and their descendants

def find_all_excluded_ids(start_ids, df):
    """Return every descendant of the given SNOMED concept IDs.

    Descendants are found by following active "Is a" relationships from the
    parent (``destinationId``) to the child (``sourceId``).

    Parameters
    ----------
    start_ids : iterable of int
        Concept IDs whose subtrees should be collected.
    df : pandas.DataFrame
        Relationship table with at least the columns ``sourceId``,
        ``destinationId``, ``active`` and ``typeId``. Other columns are ignored.

    Returns
    -------
    list of int
        Sorted, de-duplicated descendant IDs. A start ID is not part of the
        result unless it is itself reached as a descendant during the traversal.
    """
    # active: means it is valid in the current version we are using
    # typeId as 116680003: "Is a (attribute)", this indicates parent-child relationship
    # https://browser.ihtsdotools.org/?perspective=full&conceptId1=116680003&edition=MAIN/2024-05-01&release=&languages=en&latestRedirect=false
    is_a_mask = (df["active"] == 1) & (df["typeId"] == 116680003)
    # Only the two ID columns are cast, so unrelated non-numeric columns cannot break the cast.
    is_a_edges = df.loc[is_a_mask, ["sourceId", "destinationId"]].astype("int64")

    # Index the edges once (parent -> children) instead of rescanning the whole
    # relationship table for every visited concept.
    children_of = defaultdict(list)
    for parent_id, child_id in zip(is_a_edges["destinationId"].tolist(),
                                   is_a_edges["sourceId"].tolist()):
        children_of[parent_id].append(child_id)

    all_descendants = set()
    # Shared across start IDs: a subtree that was already explored contributes
    # nothing new, so it does not need to be walked again.
    visited = set()

    for start_id in start_ids:
        queue = deque([start_id])
        while queue:
            current_id = queue.popleft()
            if current_id in visited:
                continue
            visited.add(current_id)

            current_descendants = children_of.get(current_id, ())
            all_descendants.update(current_descendants)
            # Queue up the new descendants for further exploration
            queue.extend(current_descendants)

    return sorted(all_descendants)




# Find all descendants of the parent IDs we want

In [None]:
### find all descendants and store them in a dataframe; mark the IDs in excluded_ids
# this output can be useful for healthcare providers to review if certain codes are missing or need to be validated. 
# Instead of directly removing the codes, we use the column "Include Y/N" to denote their suitability

def find_all_descendants_df(start_id, df, excluded_ids=None):
    """Collect all descendants of ``start_id`` and flag the excluded ones.

    The hierarchy is walked breadth-first along active "Is a" relationships
    (typeId 116680003), from parent (``destinationId``) to child (``sourceId``).

    Parameters
    ----------
    start_id : int
        Concept ID whose descendants are collected. It is written to the
        ``Ancestor SCTID`` column of every row.
    df : pandas.DataFrame
        Relationship table with at least the columns ``sourceId``,
        ``destinationId``, ``active`` and ``typeId``. Other columns are ignored.
    excluded_ids : iterable of int, optional
        Concept IDs to mark with ``'N'``. A descendant is also marked ``'N'``
        when the branch it was discovered on started at an excluded concept.
        Defaults to no exclusions.

    Returns
    -------
    pandas.DataFrame
        Columns ``Ancestor SCTID``, ``Candidate SCTID`` and ``Include Y/N``,
        one row per distinct candidate, in order of first discovery.

    Notes
    -----
    A concept that is reachable through several parents keeps the flag of the
    first edge on which it was discovered (later duplicates are dropped). Pass
    an ``excluded_ids`` collection that already contains all descendants of the
    excluded concepts (see ``find_all_excluded_ids``) to make the flag
    independent of the traversal order.
    """
    # A set gives constant-time membership tests; the check runs for every edge.
    excluded = frozenset(excluded_ids) if excluded_ids is not None else frozenset()

    is_a_mask = (df["active"] == 1) & (df["typeId"] == 116680003)
    # Only the two ID columns are cast, so unrelated non-numeric columns cannot break the cast.
    is_a_edges = df.loc[is_a_mask, ["sourceId", "destinationId"]].astype("int64")

    # Index the edges once (parent -> children, in table order) instead of
    # rescanning the whole relationship table for every visited concept.
    children_of = defaultdict(list)
    for parent_id, child_id in zip(is_a_edges["destinationId"].tolist(),
                                   is_a_edges["sourceId"].tolist()):
        children_of[parent_id].append(child_id)

    # Initialize lists to hold results
    ancestor_ids = []
    candidate_ids = []
    include_flags = []

    # Start with the initial ancestor
    queue = deque([(start_id, start_id in excluded)])
    visited = set()

    while queue:
        current_id, is_excluded_origin = queue.popleft()
        if current_id in visited:
            continue
        visited.add(current_id)

        for descendant in children_of.get(current_id, ()):
            # If the current branch is from an excluded origin, mark descendant as 'N'
            descendant_excluded = is_excluded_origin or descendant in excluded
            ancestor_ids.append(start_id)
            candidate_ids.append(descendant)
            include_flags.append('N' if descendant_excluded else 'Y')
            # Queue up the new descendants for further exploration
            queue.append((descendant, descendant_excluded))

    # Create a DataFrame for the results
    results_df = pd.DataFrame({
        "Ancestor SCTID": ancestor_ids,
        "Candidate SCTID": candidate_ids,
        "Include Y/N": include_flags
    })

    # A concept with several parents is recorded once per edge; keep its first occurrence.
    results_df = results_df.drop_duplicates(subset=['Candidate SCTID'])
    print("check the number of rows with 'Details' on SNOMED CT browser for SNOMED ID: %d"%start_id)
    print(results_df.shape)
    # you can check the number of rows with "Details" on SNOMED CT browser
    # Example: It shows "Defined, Active. Descendants Count: 688 concepts." for Autoimmune disease (disorder)

    return results_df

# Executing the functions

In [None]:
# Root concepts to exclude because they do not qualify as autoimmune diseases
excluded_id_0 = [
    778004006, 829973009, 1148765006, 78069008, 1197477000,
    86081009, 1186652002, 723384004, 20005002, 426760008,
]

# Every descendant of the excluded roots
excluded_ids = find_all_excluded_ids(start_ids=excluded_id_0, df=df_relationship)
print(len(excluded_ids))

# Append the roots themselves so the list holds everything that will be excluded/marked as excluded later
excluded_ids += excluded_id_0
print(len(excluded_ids))


In [None]:
### in this project, we consider the following conditions
### 0. Autoimmune disease (disorder) 85828009
### 1. Immune hypersensitivity disorder by mechanism (disorder) 427439005
# including psoriasis, etc.
# 426760008 (Delayed hypersensitivity disorder (disorder)) is excluded
### 2. Autoinflammatory disease (disorder) 42111000175103
### 3. Multiple sclerosis (disorder) 24700007
### 4. Spondyloarthritis (disorder) 784332006
### 5. Diabetes mellitus type 1 (disorder) 46635009
### 6. Pyoderma gangrenosum (disorder) 74578003

included_ids = [85828009, 427439005, 42111000175103, 24700007, 784332006, 46635009, 74578003]

# Build one result frame per ancestor and concatenate them in a single call.
# Growing a frame inside the loop re-copies all earlier rows on every iteration,
# and starting from an empty placeholder frame relies on pandas' deprecated
# handling of empty entries when it determines the result dtypes.
descendant_frames = [
    find_all_descendants_df(ancestor_id, df_relationship, excluded_ids)
    for ancestor_id in included_ids
]

if descendant_frames:
    df_IDs = pd.concat(descendant_frames)
else:
    # pd.concat raises on an empty list; fall back to an empty frame with the expected columns
    df_IDs = pd.DataFrame(columns=['Ancestor SCTID', 'Candidate SCTID', 'Include Y/N'])

In [None]:
# Shape (rows, columns) of the combined candidate table built from all ancestor IDs
print(df_IDs.shape)

# Adding text descriptions to IDs

In [None]:
### add text description of SNOMED IDs
# typeId 900000000000003001 selects the "Fully specified name" descriptions.
# Only active descriptions are kept, consistent with the relationship traversal
# (which only follows active rows); otherwise a retired name could be the one
# that survives the later "keep first" de-duplication.
condition = (df_description["typeId"] == 900000000000003001) & (df_description["active"] == 1)
df_fsn = df_description.loc[condition, ["conceptId", "term"]].reset_index(drop=True)

# The same lookup table is used twice, once per ID column of df_IDs.
df_ancestor = df_fsn.rename(columns={"conceptId": "Ancestor SCTID", "term": "Ancestor Text"})
df_candidate = df_fsn.rename(columns={"conceptId": "Candidate SCTID", "term": "Candidate Text"})
df_candidate.head()

In [None]:
# Attach the ancestor text; a left join keeps every row of df_IDs
df_all_1 = pd.merge(df_IDs, df_ancestor, how="left", on="Ancestor SCTID")
display(df_all_1.head())
print(df_all_1.shape)

# Attach the candidate text the same way
df_all_2 = pd.merge(df_all_1, df_candidate, how="left", on="Candidate SCTID")
display(df_all_2.head())
print(df_all_2.shape)

# A SNOMED ID can carry more than one description, so df_all_2 may have gained rows.
# Here only the first description per (ancestor, candidate) pair is kept; skip this step to keep them all.
df_all_3 = df_all_2.drop_duplicates(subset=['Ancestor SCTID', 'Candidate SCTID'])
print(df_all_3.shape)

# Put the columns in their final order
cols = ['Ancestor SCTID', 'Ancestor Text', 'Candidate SCTID', 'Candidate Text', 'Include Y/N']
df_all_4 = df_all_3.loc[:, cols]
display(df_all_4.head())
print(df_all_4.shape)

# Saving files

In [None]:
### Finally, write the final dataframe to an Excel workbook.

# Every value is converted to a string first: integers written to Excel are sometimes
# shown in scientific notation, which makes the stored values wrong.
output_file = 'AD_SNOMED_version_2024-03-01.xlsx'
df_all_4 = df_all_4.astype(str)
df_all_4.to_excel(output_file, engine='openpyxl', index=False)