In [2]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

In [None]:
# Source: https://loinc.org/download/loinc-complete/

In [3]:
# Forward slashes are accepted on Windows as well as POSIX systems, whereas the
# backslash-separated form of this relative path only resolves on Windows.
data_path = "data/Loinc_2.77/AccessoryFiles/ComponentHierarchyBySystem/ComponentHierarchyBySystem.csv"
data = pd.read_csv(data_path, header=0, index_col=None)
data.head(10)

Unnamed: 0,PATH_TO_ROOT,SEQUENCE,IMMEDIATE_PARENT,CODE,CODE_TEXT
0,,1,,LP432695-7,{component}
1,LP432695-7,1,LP432695-7,LP29693-6,Laboratory
2,LP432695-7.LP29693-6,1,LP29693-6,LP343406-7,Microbiology and Antimicrobial susceptibility
3,LP432695-7.LP29693-6.LP343406-7,1,LP343406-7,LP7819-8,Microbiology
4,LP432695-7.LP29693-6.LP343406-7.LP7819-8,1,LP7819-8,LP14559-6,Microorganism
5,LP432695-7.LP29693-6.LP343406-7.LP7819-8.LP145...,1,LP14559-6,LP98185-9,Bacteria
6,LP432695-7.LP29693-6.LP343406-7.LP7819-8.LP145...,1,LP98185-9,LP14082-9,Bacteria
7,LP432695-7.LP29693-6.LP343406-7.LP7819-8.LP145...,1,LP14082-9,LP418667-4,Bacteria | Bronchoalveolar lavage | Microbiology
8,LP432695-7.LP29693-6.LP343406-7.LP7819-8.LP145...,1,LP418667-4,95074-1,Bacteria BAL Ql Micro
9,LP432695-7.LP29693-6.LP343406-7.LP7819-8.LP145...,2,LP418667-4,43441-5,Bacteria BAL Aerobe Cult


In [4]:
# Show the complete, untruncated PATH_TO_ROOT of the row at position 8.
data.iloc[8]["PATH_TO_ROOT"]

'LP432695-7.LP29693-6.LP343406-7.LP7819-8.LP14559-6.LP98185-9.LP14082-9.LP418667-4'

In [5]:
# Keep the rows whose CODE starts with a digit (e.g. 95074-1); rows with an
# "LP"-prefixed CODE are dropped.
starts_with_digit = data['CODE'].str.match(r'^\d')
filtered_loinc_data = data[starts_with_digit]
filtered_loinc_data.head()

Unnamed: 0,PATH_TO_ROOT,SEQUENCE,IMMEDIATE_PARENT,CODE,CODE_TEXT
8,LP432695-7.LP29693-6.LP343406-7.LP7819-8.LP145...,1,LP418667-4,95074-1,Bacteria BAL Ql Micro
9,LP432695-7.LP29693-6.LP343406-7.LP7819-8.LP145...,2,LP418667-4,43441-5,Bacteria BAL Aerobe Cult
10,LP432695-7.LP29693-6.LP343406-7.LP7819-8.LP145...,3,LP418667-4,88683-8,Bacteria BAL Anaerobe Cult
12,LP432695-7.LP29693-6.LP343406-7.LP7819-8.LP145...,1,LP373670-1,99932-6,Bacteria Fld Ql Auto
13,LP432695-7.LP29693-6.LP343406-7.LP7819-8.LP145...,2,LP373670-1,41599-2,Bacteria Fld Ql Micro


In [6]:
# Two-column view pairing every CODE with its CODE_TEXT.
code_meaning = data.loc[:, ['CODE', 'CODE_TEXT']]
code_meaning.head()

Unnamed: 0,CODE,CODE_TEXT
0,LP432695-7,{component}
1,LP29693-6,Laboratory
2,LP343406-7,Microbiology and Antimicrobial susceptibility
3,LP7819-8,Microbiology
4,LP14559-6,Microorganism


In [9]:
# Function definition
def find_code_meanings(data, code):
    """Return the ancestor codes of ``code`` and the CODE_TEXT of each ancestor.

    Only rows whose CODE starts with a digit are searched, so any other code
    yields two empty lists.

    Args:
        data: DataFrame with 'CODE', 'CODE_TEXT' and 'PATH_TO_ROOT' columns.
        code: The CODE value to look up.

    Returns:
        A ``(codes_list, meanings_list)`` tuple. ``codes_list`` is the
        PATH_TO_ROOT of the first matching row split on '.'.
        ``meanings_list`` holds the CODE_TEXT of each of those codes, in the
        same order; codes that have no row in ``data`` are skipped. Both lists
        are empty when the code is not found or its PATH_TO_ROOT is missing.
    """
    # na=False keeps the mask strictly boolean even if some CODE cells are missing.
    starts_with_digit = data['CODE'].str.match(r'^\d', na=False)
    paths = data.loc[starts_with_digit & (data['CODE'] == code), 'PATH_TO_ROOT']

    # If the code is not found, return empty lists
    if paths.empty:
        return [], []

    path_to_root = paths.iloc[0]
    # A missing PATH_TO_ROOT is read as NaN (a float), which cannot be split.
    if not isinstance(path_to_root, str):
        return [], []

    # Split the PATH_TO_ROOT at '.' to get the list of codes
    codes_list = path_to_root.split('.')

    # Build one lookup table for the ancestors instead of re-scanning the whole
    # frame twice per ancestor; the first row wins when a CODE occurs repeatedly.
    ancestors = data.loc[data['CODE'].isin(codes_list), ['CODE', 'CODE_TEXT']]
    ancestors = ancestors.drop_duplicates(subset='CODE', keep='first')
    text_by_code = dict(zip(ancestors['CODE'], ancestors['CODE_TEXT']))

    meanings_list = [text_by_code[c] for c in codes_list if c in text_by_code]

    return codes_list, meanings_list


# Look up one example code and display its ancestor codes with their texts.
codes_list, meanings_list = find_code_meanings(data, '95074-1')

(codes_list, meanings_list)

(['LP432695-7',
  'LP29693-6',
  'LP343406-7',
  'LP7819-8',
  'LP14559-6',
  'LP98185-9',
  'LP14082-9',
  'LP418667-4'],
 ['{component}',
  'Laboratory',
  'Microbiology and Antimicrobial susceptibility',
  'Microbiology',
  'Microorganism',
  'Bacteria',
  'Bacteria',
  'Bacteria | Bronchoalveolar lavage | Microbiology'])

# Analysis
1. In this section I look at several sequential LOINC codes and show that some of them are related to each other.
2. Each LOINC code has a hierarchy that is not directly evident from the code itself.

In [42]:
# Example codes whose hierarchies are compared below.
code_example_1 = [
    "100010-8",
    "100011-6",
    "100012-4",
    "100013-2",
    "100016-5",
    "100017-3",
    "100018-1",
    "100019-9",
    "100020-7",
    "100021-5",
    "24171-1",
    "12515-3",
]

In [43]:
# Print each example code followed by its ancestor codes and their texts.
for code in code_example_1:
    hierarchy = find_code_meanings(data, code)
    codes_list, meanings_list = hierarchy
    print(code, codes_list, meanings_list)


100010-8 ['LP432695-7', 'LP29696-9', 'LP248772-8'] ['{component}', 'Survey instruments', 'Survey terms not yet categorized']
100011-6 ['LP432695-7', 'LP29696-9', 'LP248772-8'] ['{component}', 'Survey instruments', 'Survey terms not yet categorized']
100012-4 ['LP432695-7', 'LP29696-9', 'LP248772-8'] ['{component}', 'Survey instruments', 'Survey terms not yet categorized']
100013-2 ['LP432695-7', 'LP29696-9', 'LP248772-8'] ['{component}', 'Survey instruments', 'Survey terms not yet categorized']
100016-5 ['LP432695-7', 'LP29696-9', 'LP248772-8'] ['{component}', 'Survey instruments', 'Survey terms not yet categorized']
100017-3 ['LP432695-7', 'LP29696-9', 'LP248772-8'] ['{component}', 'Survey instruments', 'Survey terms not yet categorized']
100018-1 ['LP432695-7', 'LP7787-7', 'LP32519-8', 'LP261247-3', 'LP72988-6', 'LP416850-8'] ['{component}', 'Clinical', 'Document ontology', 'LOINC Document Ontology - Type of Service and Kind of Document', 'Note', 'Note | {Setting} | Document ontology

It looks like some codes are not yet categorized. Let's find them.

In [54]:
# Rows whose CODE_TEXT mentions "not yet categorized", ignoring letter case.
uncategorized_mask = data['CODE_TEXT'].str.contains('not yet categorized', case=False)
filtered_df = data[uncategorized_mask]
filtered_df.head()

Unnamed: 0,PATH_TO_ROOT,SEQUENCE,IMMEDIATE_PARENT,CODE,CODE_TEXT
102917,LP432695-7.LP29693-6,24,LP29693-6,LP248770-2,Lab terms not yet categorized
155736,LP432695-7.LP7787-7,28,LP7787-7,LP248771-0,Clinical terms not yet categorized
168155,LP432695-7.LP29696-9,1,LP29696-9,LP248772-8,Survey terms not yet categorized


In [55]:
# Codes of the three "... terms not yet categorized" rows found above.
no_lab_code, no_clinical_code, no_instrmnt_code = 'LP248770-2', 'LP248771-0', 'LP248772-8'
print("total number ")
# Number of direct children per node, displayed as (lab, survey, clinical).
tuple(
    int((data['IMMEDIATE_PARENT'] == parent).sum())
    for parent in (no_lab_code, no_instrmnt_code, no_clinical_code)
)

(26297, 12253, 11256)

In [57]:
# Find all the children of LP432695-7
is_root_child = data['IMMEDIATE_PARENT'] == "LP432695-7"
data.loc[is_root_child]

Unnamed: 0,PATH_TO_ROOT,SEQUENCE,IMMEDIATE_PARENT,CODE,CODE_TEXT
1,LP432695-7,1,LP432695-7,LP29693-6,Laboratory
129215,LP432695-7,2,LP432695-7,LP7787-7,Clinical
166993,LP432695-7,3,LP432695-7,LP29695-1,Attachments
168154,LP432695-7,4,LP432695-7,LP29696-9,Survey instruments


In [59]:
# Codes of the four direct children of LP432695-7 listed above.
lab_code = 'LP29693-6'          # Laboratory
clinical_code = 'LP7787-7'      # Clinical
attachments_code = 'LP29695-1'  # Attachments
instrmnt_code = 'LP29696-9'     # Survey instruments

In [74]:
# Function to filter data by code and print details
def loinc_details(data, filtered_data, code, description):
    """Print how many rows of ``filtered_data`` have ``code`` in their PATH_TO_ROOT.

    Args:
        data: Not read by this function; kept so existing calls keep working.
        filtered_data: DataFrame with a 'PATH_TO_ROOT' column. Its row count is
            the denominator of the printed percentage.
        code: Pattern searched for in 'PATH_TO_ROOT' (case-insensitive).
        description: Label inserted into the printed line.

    Returns:
        None. A single summary line is printed.
    """
    # na=False counts rows with a missing path as non-matching instead of
    # producing a NaN that cannot be used as a boolean mask.
    has_code = filtered_data['PATH_TO_ROOT'].str.contains(code, case=False, na=False)

    total_filtered_rows = int(has_code.sum())
    total_in_filtered_data = filtered_data.shape[0]

    # Calculate percentages (an empty frame reports 0% rather than dividing by zero)
    if total_in_filtered_data:
        percentage_of_filtered = (total_filtered_rows / total_in_filtered_data) * 100
    else:
        percentage_of_filtered = 0.0

    print(f"Total {description} codes in LOINC codes: {total_filtered_rows} out of {total_in_filtered_data} ({percentage_of_filtered:.2f}%)")

# Report the share of digit-coded rows under each top-level category.
for category_code, label in (
    (lab_code, "Laboratory"),
    (clinical_code, "Clinical"),
    (attachments_code, "Attachments"),
    (instrmnt_code, "Survey Instruments"),
):
    loinc_details(data, filtered_loinc_data, category_code, label)

Total Laboratory codes in LOINC codes: 63121 out of 103832 (60.79%)
Total Clinical codes in LOINC codes: 27298 out of 103832 (26.29%)
Total Attachments codes in LOINC codes: 1160 out of 103832 (1.12%)
Total Survey Instruments codes in LOINC codes: 12253 out of 103832 (11.80%)


In [83]:
# Function to filter data by code and print details
def component_hierarchy_details(data, filtered_data, code, description):
    """Print how many rows of ``data`` have ``code`` in their PATH_TO_ROOT.

    Args:
        data: DataFrame with a 'PATH_TO_ROOT' column. Its row count is the
            denominator of the printed percentage.
        filtered_data: Not read by this function; kept so existing calls keep
            working.
        code: Pattern searched for in 'PATH_TO_ROOT' (case-insensitive).
        description: Label printed in front of the count.

    Returns:
        None. A single summary line is printed.
    """
    # A row with no PATH_TO_ROOT (the hierarchy root is loaded that way) would
    # give NaN here and make the mask unusable; na=False counts it as a
    # non-match, so the function no longer needs the column to be converted
    # to str beforehand.
    has_code = data['PATH_TO_ROOT'].str.contains(code, case=False, na=False)

    total_rows = int(has_code.sum())
    total_in_data = data.shape[0]

    # Calculate percentages (an empty frame reports 0% rather than dividing by zero)
    percentage_of_total = (total_rows / total_in_data) * 100 if total_in_data else 0.0

    print(f"{description}: {total_rows} ({percentage_of_total:.2f}%)")

# Report the share of all hierarchy rows under each top-level category.
for category_code, label in (
    (lab_code, "Laboratory"),
    (clinical_code, "Clinical"),
    (attachments_code, "Attachments"),
    (instrmnt_code, "Survey Instruments"),
):
    component_hierarchy_details(data, filtered_loinc_data, category_code, label)

Laboratory: 129213 (71.62%)
Clinical: 37777 (20.94%)
Attachments: 1160 (0.64%)
Survey Instruments: 12254 (6.79%)


In [88]:
def level_2(data, code, description, print_l3=False):
    """Select the rows whose IMMEDIATE_PARENT equals ``code``.

    Prints "<description> codes: <row count>" and, when ``print_l3`` is true,
    the selected rows as well.

    Returns:
        The selected rows as a DataFrame.
    """
    children = data.loc[data['IMMEDIATE_PARENT'] == code]
    print(f"{description} codes: {len(children)}")
    if print_l3:
        print(children)
    return children
# Count the direct children of each top-level category; the returned frames
# are not needed here.
for category_code, label in (
    (lab_code, "Laboratory"),
    (clinical_code, "Clinical"),
    (attachments_code, "Attachments"),
    (instrmnt_code, "Survey Instruments"),
):
    _ = level_2(data, category_code, label)

Laboratory codes: 24
Clinical codes: 28
Attachments codes: 1160
Survey Instruments codes: 1


In [94]:
# Keep the direct children of the Attachments node and show their texts.
attachments_data = level_2(data, attachments_code, "Attachments")
attachments_data['CODE_TEXT']

Attachments codes: 1160


166994                                           Deprecated
166995                                           Deprecated
166996                                           Deprecated
166997                                           Deprecated
166998                                           Deprecated
                                ...                        
168149    Selection item attachment request modifier cod...
168150    Send all abnormals within the time window:Scop...
168151    Send all items of the specified type within th...
168152    Send all items of the specified type within th...
168153    Send document that conforms to C-CDA R1.1 temp...
Name: CODE_TEXT, Length: 1160, dtype: object

In [97]:
# Function to filter data by code and print details
def no_loinc_details(data, filtered_data, code, no_code_sub, description):
    """Print three row counts for ``filtered_data`` on one line.

    The counts are, in order: all rows, rows whose PATH_TO_ROOT contains
    ``code``, and rows whose PATH_TO_ROOT contains ``no_code_sub`` (both
    matched case-insensitively). ``data`` and ``description`` are not read.
    """
    paths = filtered_data['PATH_TO_ROOT']

    def count_rows_containing(pattern):
        # Number of rows whose path contains the given pattern.
        return filtered_data[paths.str.contains(pattern, case=False)].shape[0]

    total_filtered_rows = count_rows_containing(code)
    total_no_code = count_rows_containing(no_code_sub)

    print(filtered_data.shape[0], total_filtered_rows, total_no_code)

# Attachments is left out: the "not yet categorized" search earlier in the
# notebook returned rows for Lab, Clinical and Survey only.
for category_code, uncategorized_code, label in (
    (lab_code, no_lab_code, "Laboratory"),
    (clinical_code, no_clinical_code, "Clinical"),
    (instrmnt_code, no_instrmnt_code, "Survey Instruments"),
):
    no_loinc_details(data, filtered_loinc_data, category_code, uncategorized_code, label)

103832 63121 26297
103832 27298 11256
103832 12253 12253


In [107]:
import pandas as pd

# Accumulator for the summary table; starts empty each time this cell runs.
results = list()

def no_loinc_details(data, filtered_data, code, no_code_sub, description, results):
    """Append one summary row about ``code`` and ``no_code_sub`` to ``results``.

    Args:
        data: Not read by this function; kept so existing calls keep working.
        filtered_data: DataFrame with a 'PATH_TO_ROOT' column.
        code: Pattern searched for in 'PATH_TO_ROOT' (case-insensitive).
        no_code_sub: Second pattern searched for in 'PATH_TO_ROOT'
            (case-insensitive).
        description: Value stored under 'Category'.
        results: List that receives the new dict; it is modified in place.

    Returns:
        None. The appended dict holds the number of rows matching ``code``,
        that number as a percentage of all rows, the number of rows matching
        ``no_code_sub``, and that number as a percentage of the ``code``
        matches. A percentage whose denominator is zero is reported as 0.
    """
    paths = filtered_data['PATH_TO_ROOT']

    # Filtering based on specified codes; na=False counts rows without a path
    # as non-matching instead of yielding a NaN mask entry.
    total_filtered_rows = int(paths.str.contains(code, case=False, na=False).sum())
    total_no_code = int(paths.str.contains(no_code_sub, case=False, na=False).sum())

    # Calculating totals and percentages
    total_in_filtered_data = filtered_data.shape[0]
    percentage_matching = (total_filtered_rows / total_in_filtered_data) * 100 if total_in_filtered_data > 0 else 0
    # Guard on the actual denominator: a category with no matching rows used to
    # raise ZeroDivisionError because the check looked at the overall row count.
    percentage_no_code = (total_no_code / total_filtered_rows) * 100 if total_filtered_rows > 0 else 0

    # Appending the calculated values to the results list
    results.append({
        'Category': description,
        '#-Codes': total_filtered_rows,
        '%-Codes': f"{percentage_matching:.2f}%",
        '#-No Hierarchy': total_no_code,
        '%-Not Hierarchy': f"{percentage_no_code:.2f}%"
    })

# One summary row per category. Attachments passes its own code as the second
# pattern, so both of its counts are the same.
for category_code, uncategorized_code, label in (
    (lab_code, no_lab_code, "Laboratory"),
    (clinical_code, no_clinical_code, "Clinical"),
    (instrmnt_code, no_instrmnt_code, "Survey Instruments"),
    (attachments_code, attachments_code, "Attachments code"),
):
    no_loinc_details(data, filtered_loinc_data, category_code, uncategorized_code, label, results)

# Turn the accumulated rows into a table and print it.
results_df = pd.DataFrame(results)
print(results_df)


             Category  #-Codes %-Codes  #-No Hierarchy %-Not Hierarchy
0          Laboratory    63121  60.79%           26297          41.66%
1            Clinical    27298  26.29%           11256          41.23%
2  Survey Instruments    12253  11.80%           12253         100.00%
3    Attachments code     1160   1.12%            1160         100.00%


In [111]:
# Share of the "#-No Hierarchy" total within the "#-Codes" total, in percent.
no_hierarchy_total = results_df["#-No Hierarchy"].sum()
codes_total = results_df["#-Codes"].sum()
percent_no_hierarchy = 100 * no_hierarchy_total / codes_total
print(percent_no_hierarchy)

49.08506048231759


# Laboratory


In [89]:
# Print and display the direct children of the Laboratory node.
level_2(data, code=lab_code, description="Laboratory", print_l3=True)

Laboratory codes: 24
                PATH_TO_ROOT  SEQUENCE IMMEDIATE_PARENT        CODE  \
2       LP432695-7.LP29693-6         1        LP29693-6  LP343406-7   
21521   LP432695-7.LP29693-6         2        LP29693-6    LP7785-1   
21603   LP432695-7.LP29693-6         3        LP29693-6  LP343631-0   
46783   LP432695-7.LP29693-6         4        LP29693-6    LP7790-1   
63395   LP432695-7.LP29693-6         5        LP29693-6    LP7791-9   
64408   LP432695-7.LP29693-6         6        LP29693-6    LP7803-2   
68871   LP432695-7.LP29693-6         7        LP29693-6    LP7788-5   
70804   LP432695-7.LP29693-6         8        LP29693-6    LP7756-2   
80474   LP432695-7.LP29693-6         9        LP29693-6    LP7776-0   
82858   LP432695-7.LP29693-6        10        LP29693-6    LP7783-6   
86690   LP432695-7.LP29693-6        11        LP29693-6    LP7798-4   
87258   LP432695-7.LP29693-6        12        LP29693-6    LP7806-5   
88673   LP432695-7.LP29693-6        13        LP29693-6 

Unnamed: 0,PATH_TO_ROOT,SEQUENCE,IMMEDIATE_PARENT,CODE,CODE_TEXT
2,LP432695-7.LP29693-6,1,LP29693-6,LP343406-7,Microbiology and Antimicrobial susceptibility
21521,LP432695-7.LP29693-6,2,LP29693-6,LP7785-1,Skin challenge
21603,LP432695-7.LP29693-6,3,LP29693-6,LP343631-0,Chemistry and Chemistry - challenge
46783,LP432695-7.LP29693-6,4,LP29693-6,LP7790-1,Drug toxicology
63395,LP432695-7.LP29693-6,5,LP29693-6,LP7791-9,Drug doses
64408,LP432695-7.LP29693-6,6,LP29693-6,LP7803-2,Hematology and Cell counts
68871,LP432695-7.LP29693-6,7,LP29693-6,LP7788-5,Coagulation
70804,LP432695-7.LP29693-6,8,LP29693-6,LP7756-2,Allergy
80474,LP432695-7.LP29693-6,9,LP29693-6,LP7776-0,Blood bank
82858,LP432695-7.LP29693-6,10,LP29693-6,LP7783-6,Cell markers


# Survey Instruments

In [86]:
# Print and display the direct children of the Survey instruments node.
level_2(data, code=instrmnt_code, description="Survey Instruments", print_l3=True)

Survey Instruments codes: 1
                PATH_TO_ROOT  SEQUENCE IMMEDIATE_PARENT        CODE  \
168155  LP432695-7.LP29696-9         1        LP29696-9  LP248772-8   

                               CODE_TEXT  
168155  Survey terms not yet categorized  


In [78]:
# Rows whose immediate parent is the Laboratory node.
is_lab_child = data['IMMEDIATE_PARENT'] == lab_code
data.loc[is_lab_child]

Unnamed: 0,PATH_TO_ROOT,SEQUENCE,IMMEDIATE_PARENT,CODE,CODE_TEXT
2,LP432695-7.LP29693-6,1,LP29693-6,LP343406-7,Microbiology and Antimicrobial susceptibility
21521,LP432695-7.LP29693-6,2,LP29693-6,LP7785-1,Skin challenge
21603,LP432695-7.LP29693-6,3,LP29693-6,LP343631-0,Chemistry and Chemistry - challenge
46783,LP432695-7.LP29693-6,4,LP29693-6,LP7790-1,Drug toxicology
63395,LP432695-7.LP29693-6,5,LP29693-6,LP7791-9,Drug doses
64408,LP432695-7.LP29693-6,6,LP29693-6,LP7803-2,Hematology and Cell counts
68871,LP432695-7.LP29693-6,7,LP29693-6,LP7788-5,Coagulation
70804,LP432695-7.LP29693-6,8,LP29693-6,LP7756-2,Allergy
80474,LP432695-7.LP29693-6,9,LP29693-6,LP7776-0,Blood bank
82858,LP432695-7.LP29693-6,10,LP29693-6,LP7783-6,Cell markers


In [68]:
# Define codes for different categories
lab_code, clinical_code, attachments_code, instrmnt_code = 'LP29693-6', 'LP7787-7', 'LP29695-1', 'LP29696-9'

# Rows without a PATH_TO_ROOT are treated as non-matching via na=False, so the
# shared `data` frame is left untouched. Overwriting the column with
# astype(str) would turn a missing path into the text 'nan' and change what
# other cells see when they are re-run.
lab_data_filtered = data[data['PATH_TO_ROOT'].str.contains(lab_code, case=False, na=False)]
# Similarly filter 'filtered_loinc_data' for laboratory data
loinc_lab_filtered = filtered_loinc_data[filtered_loinc_data['PATH_TO_ROOT'].str.contains(lab_code, case=False, na=False)]

# Output the count of filtered rows for laboratory data
print(f"Total rows for Laboratory data: {lab_data_filtered.shape[0]}")
print(f"Total LOINC codes for Laboratory out of filtered LOINC data: {loinc_lab_filtered.shape[0]} out of {filtered_loinc_data.shape[0]}")


Total rows for Laboratory data: 129213
Total LOINC codes for Laboratory out of filtered LOINC data: 63121 out of 103832


In [67]:
# Codes of the four top-level categories.
lab_code = 'LP29693-6'
clinical_code = 'LP7787-7'
attachments_code = 'LP29695-1'
instrmnt_code = 'LP29696-9'

# Compare paths as text without changing the frames themselves.
all_paths = data['PATH_TO_ROOT'].astype(str)
loinc_paths = filtered_loinc_data['PATH_TO_ROOT'].astype(str)

lab_df = data[all_paths.str.contains(lab_code, case=False)]
loinc_lab_data = filtered_loinc_data[loinc_paths.str.contains(lab_code, case=False)]

print(f"There are totally {lab_df.shape[0]} rows")
print(f"There are totally {loinc_lab_data.shape[0]} loinc codes for Laboratory, out of {filtered_loinc_data.shape[0]}")


There are totally 129213 rows
There are totally 63121 loinc codes for Laboratory, out of 103832


In [37]:
# def find_code_meanings_row(row, all_data):
#     # Extract 'CODE' from the current row
#     code = row['CODE']
    
#     # Check for necessary columns
#     if not {'CODE', 'CODE_TEXT', 'PATH_TO_ROOT'}.issubset(all_data.columns):
#         raise ValueError("Data must contain 'CODE', 'CODE_TEXT', and 'PATH_TO_ROOT' columns")
    
#     # Create a DataFrame indexed by 'CODE' for quick lookups
#     code_meanings = all_data[['CODE', 'CODE_TEXT']].drop_duplicates().set_index('CODE')
    
#     # Attempt to find the path to root for the given code; handle missing or non-string values
#     try:
#         path_to_root = all_data.set_index('CODE').loc[code, 'PATH_TO_ROOT']
#         # Ensure path_to_root is a string
#         path_to_root = str(path_to_root)
#     except KeyError:
#         return pd.Series([[], []])  # Return empty lists if code is not found
    
#     # Split path_to_root into codes, ensuring it's a non-empty string before splitting
#     codes_list = path_to_root.split('.') if path_to_root else []

#     # Find meanings for each code, ensuring the code exists in code_meanings before attempting to access
#     meanings_list = [code_meanings.loc[c, 'CODE_TEXT'] for c in codes_list if c in code_meanings.index]

#     return pd.Series([codes_list, meanings_list])

# data[['codes_list', 'meanings_list']] = data.apply(find_code_meanings_row, all_data=data.copy(), axis=1)