<a href="https://colab.research.google.com/github/ayishamanzoor2024/ayishamanzoor2024/blob/main/RareDiseaseProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
# Change directory to the folder named "dataset" inside "My Drive"
os.chdir('/content/drive/My Drive/dataset')

# List the files in the current directory to verify the path
!ls

Clinical.csv	      Diagnosis.csv	     Labs.csv	       results_asmd_with_rules.csv
Clinical_gaucher.csv  Diagnosis_gaucher.csv  Labs_gaucher.csv  results_gaucher_with_rules.csv


In [3]:
import pandas as pd

# Loading the datasets into DataFrames
clinical_df_asmd = pd.read_csv('Clinical.csv')
labs_df_asmd = pd.read_csv('Labs.csv')
diagnosis_df_asmd = pd.read_csv('Diagnosis.csv')

clinical_df_gaucher = pd.read_csv('Clinical_gaucher.csv')
labs_df_gaucher = pd.read_csv('Labs_gaucher.csv')
diagnosis_df_gaucher = pd.read_csv('Diagnosis_gaucher.csv')

# Displaying the first few rows of each DataFrame to understand their structure
print("Clinical ASMD DataFrame:")
print(clinical_df_asmd.head())
print("\nLabs ASMD DataFrame:")
print(labs_df_asmd.head())
print("\nDiagnosis ASMD DataFrame:")
print(diagnosis_df_asmd.head())

print("\nClinical Gaucher DataFrame:")
print(clinical_df_gaucher.head())
print("\nLabs Gaucher DataFrame:")
print(labs_df_gaucher.head())
print("\nDiagnosis Gaucher DataFrame:")
print(diagnosis_df_gaucher.head())

Clinical ASMD DataFrame:
   PERSONID   ENCNTRID      ORDERID  CLINICALEVENTID          EVENTDATETIME  \
0  13542357  458239816            0      30958268048   13/MAR/2023 12:02:00   
1  13542357  458239816  11750667631      30959845663   13/MAR/2023 10:45:00   
2  13542357  458239816            0      30957028397   13/MAR/2023 10:29:00   
3  13542357  458239816            0      30957029814   13/MAR/2023 10:29:00   
4  13542357  458239816            0      30957030613   13/MAR/2023 10:29:00   

                              EVENTNAME   EVENTRESULT RESULTUNIT  \
0  Barriers to Learning Edu Pharmacists  None evident        NaN   
1                         acetaminophen   1000.000000         mg   
2                      DCP GENERIC CODE           NaN        NaN   
3                    Sensory Impairment            No        NaN   
4                        Food Allergies            No        NaN   

                                  TASKASSAY  
0  Barriers to Learning Education Pharmacis  

In [7]:
# Defining reference ranges for CBC parameters
cbc_reference = {
    'WBC': (4.5, 11),
    'Hgb': (132, 173),
    'Platelet': (140, 400),
    'Neutro Auto #': (1.8, 7.7)
}

# Function to check if CBC parameters are beyond the reference levels
def check_cbc_parameters(person_id, labs_df):
    patient_labs = labs_df[labs_df['PERSONID'] == person_id]
    for param, (low, high) in cbc_reference.items():
        param_labs = patient_labs[patient_labs['TASKASSAY'] == param]
        if not param_labs.empty and (any(param_labs['RESULTVALUE'].astype(float) < low) or any(param_labs['RESULTVALUE'].astype(float) > high)):
            return True
    return False

# Function to check Rule 1
def check_rule_1(person_id, diagnosis_df, labs_df):
    patient_diagnosis = diagnosis_df[diagnosis_df['PERSONID'] == person_id]
    conditions = ['R19.0', 'R16.2', 'R16.0', 'R16.1']
    if any(patient_diagnosis['ICDCODE'].isin(conditions)) and check_cbc_parameters(person_id, labs_df):
        return True
    return False

# Function to check Rule 2
def check_rule_2(person_id, diagnosis_df, labs_df):
    patient_diagnosis = diagnosis_df[diagnosis_df['PERSONID'] == person_id]
    if any(patient_diagnosis['ICDCODE'] == 'R62.50') and check_rule_1(person_id, diagnosis_df, labs_df):
        return True
    return False

# Function to check Rule 3
def check_rule_3(person_id, diagnosis_df, labs_df, clinical_df):
    patient_diagnosis = diagnosis_df[diagnosis_df['PERSONID'] == person_id]
    patient_clinical = clinical_df[clinical_df['PERSONID'] == person_id].copy()  # Make a copy to avoid SettingWithCopyWarning
    conditions = ['R62.50', 'R62.7']

# Converting 'EVENTRESULT' to numeric, coercing errors to NaN
    patient_clinical['EVENTRESULT'] = pd.to_numeric(patient_clinical['EVENTRESULT'], errors='coerce')

# Incorporating height and weight
    height_low_percentile = patient_clinical[(patient_clinical['EVENTNAME'].str.contains('Height')) & (patient_clinical['EVENTRESULT'] < 3)]
    weight_low_percentile = patient_clinical[(patient_clinical['EVENTNAME'].str.contains('Weight')) & (patient_clinical['EVENTRESULT'] < 3)]

    if (any(patient_diagnosis['ICDCODE'].isin(conditions)) or
        not height_low_percentile.empty or
        not weight_low_percentile.empty) and check_rule_1(person_id, diagnosis_df, labs_df) and check_rule_2(person_id, diagnosis_df, labs_df):
        return True
    return False

# Function to check Rule 4
def check_rule_4(person_id, diagnosis_df, labs_df):
    patient_diagnosis = diagnosis_df[diagnosis_df['PERSONID'] == person_id]
    if any(patient_diagnosis['ICDCODE'] == 'J84.9') and check_rule_1(person_id, diagnosis_df, labs_df):
        return True
    return False

# Function to check Rule 5
def check_rule_5(person_id, diagnosis_df, labs_df):
    patient_diagnosis = diagnosis_df[diagnosis_df['PERSONID'] == person_id]
    conditions = ['P94.2', 'R62.50', 'R62.59']
    if check_rule_1(person_id, diagnosis_df, labs_df) and any(patient_diagnosis['ICDCODE'].isin(conditions)):
        return True
    return False

In [8]:
# Getting unique patient IDs from ASMD and Gaucher datasets
patient_ids_asmd = diagnosis_df_asmd['PERSONID'].unique()
patient_ids_gaucher = diagnosis_df_gaucher['PERSONID'].unique()

# Creating dataframes to store the results for ASMD and Gaucher
results_asmd_list = []
results_gaucher_list = []

# Checking each patient against the rules for ASMD dataset
for person_id in patient_ids_asmd:
    rule1 = check_rule_1(person_id, diagnosis_df_asmd, labs_df_asmd)
    rule2 = check_rule_2(person_id, diagnosis_df_asmd, labs_df_asmd)
    rule3 = check_rule_3(person_id, diagnosis_df_asmd, labs_df_asmd, clinical_df_asmd)
    rule4 = check_rule_4(person_id, diagnosis_df_asmd, labs_df_asmd)
    rule5 = check_rule_5(person_id, diagnosis_df_asmd, labs_df_asmd)

    results_asmd_list.append({
        'PERSONID': person_id,
        'Rule1': rule1,
        'Rule2': rule2,
        'Rule3': rule3,
        'Rule4': rule4,
        'Rule5': rule5
    })

# Creating dataframe from results list
results_asmd = pd.DataFrame(results_asmd_list)

# Checking each patient against the rules for Gaucher dataset
for person_id in patient_ids_gaucher:
    rule1 = check_rule_1(person_id, diagnosis_df_gaucher, labs_df_gaucher)
    rule2 = check_rule_2(person_id, diagnosis_df_gaucher, labs_df_gaucher)
    rule3 = check_rule_3(person_id, diagnosis_df_gaucher, labs_df_gaucher, clinical_df_gaucher)
    rule4 = check_rule_4(person_id, diagnosis_df_gaucher, labs_df_gaucher)
    rule5 = check_rule_5(person_id, diagnosis_df_gaucher, labs_df_gaucher)

    results_gaucher_list.append({
        'PERSONID': person_id,
        'Rule1': rule1,
        'Rule2': rule2,
        'Rule3': rule3,
        'Rule4': rule4,
        'Rule5': rule5
    })

# Creating dataframe from results list
results_gaucher = pd.DataFrame(results_gaucher_list)

# Saving the results to a CSV file
results_asmd.to_csv('results_asmd_with_rules.csv', index=False)
results_gaucher.to_csv('results_gaucher_with_rules.csv', index=False)

# Downloading the files from Colab
from google.colab import files
files.download('results_asmd_with_rules.csv')
files.download('results_gaucher_with_rules.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
import pandas as pd

# Load the results CSV files
results_asmd = pd.read_csv('/content/drive/MyDrive/dataset/results_asmd_with_rules.csv')
results_gaucher = pd.read_csv('/content/drive/MyDrive/dataset/results_gaucher_with_rules.csv')

# Load the clinical and labs data for ASMD and Gaucher
clinical_asmd = pd.read_csv('/content/drive/MyDrive/dataset/Clinical.csv')
clinical_gaucher = pd.read_csv('/content/drive/MyDrive/dataset/Clinical_gaucher.csv')
labs_asmd = pd.read_csv('/content/drive/MyDrive/dataset/Labs.csv')
labs_gaucher = pd.read_csv('/content/drive/MyDrive/dataset/Labs_gaucher.csv')

results_asmd.head(), results_gaucher.head(), clinical_asmd.head(), clinical_gaucher.head(), labs_asmd.head(), labs_gaucher.head()


(   PERSONID  Rule1  Rule2  Rule3  Rule4  Rule5
 0  13542357   True  False  False  False  False
 1  15714196   True   True   True  False   True
 2   1670996   True  False  False   True  False
 3  18057960  False  False  False  False  False
 4   1888298  False  False  False  False  False,
    PERSONID  Rule1  Rule2  Rule3  Rule4  Rule5
 0  10302581  False  False  False  False  False
 1  14119898  False  False  False  False  False
 2  15003810  False  False  False  False  False
 3  15621075  False  False  False  False  False
 4   1662332  False  False  False  False  False,
    PERSONID   ENCNTRID      ORDERID  CLINICALEVENTID          EVENTDATETIME  \
 0  13542357  458239816            0      30958268048   13/MAR/2023 12:02:00   
 1  13542357  458239816  11750667631      30959845663   13/MAR/2023 10:45:00   
 2  13542357  458239816            0      30957028397   13/MAR/2023 10:29:00   
 3  13542357  458239816            0      30957029814   13/MAR/2023 10:29:00   
 4  13542357  45823981

In [11]:
# Select sample patients who triggered Rule 3
sample_patient_id_asmd = results_asmd[results_asmd['Rule3'] == True].iloc[0]['PERSONID']
sample_patient_id_gaucher = results_gaucher[results_gaucher['Rule3'] == True].iloc[0]['PERSONID']

# Check clinical data for these patients
clinical_data_asmd = clinical_asmd[clinical_asmd['PERSONID'] == sample_patient_id_asmd]
clinical_data_gaucher = clinical_gaucher[clinical_gaucher['PERSONID'] == sample_patient_id_gaucher]

clinical_data_asmd, clinical_data_gaucher

(         PERSONID   ENCNTRID     ORDERID  CLINICALEVENTID  \
 5217     15714196   64358866           0       5709650576   
 5218     15714196   64358866           0       5709650603   
 5219     15714196   64358866  1164654742       5842397052   
 5220     15714196   64358866           0       5596727085   
 5221     15714196   64358866           0       5596727091   
 ...           ...        ...         ...              ...   
 1055747  15714196  451009712           0      30465388109   
 1055748  15714196  451009712           0      30465388097   
 1055749  15714196  451009712           0      30465388117   
 1055750  15714196  451009712           0      30465388045   
 1055751  15714196  451009712           0      30564725033   
 
                  EVENTDATETIME                             EVENTNAME  \
 5217      24/OCT/2013 09:17:00  Endotracheal Tube Insertion Distance   
 5218      24/OCT/2013 09:17:00              Endotracheal Tube Status   
 5219      17/NOV/2013 04:20:00    

In [12]:
# Convert EVENTRESULT to numeric for the selected patients
clinical_data_asmd['EVENTRESULT'] = pd.to_numeric(clinical_data_asmd['EVENTRESULT'], errors='coerce')
clinical_data_gaucher['EVENTRESULT'] = pd.to_numeric(clinical_data_gaucher['EVENTRESULT'], errors='coerce')

# Check height and weight percentiles
height_low_percentile_asmd = clinical_data_asmd[(clinical_data_asmd['EVENTNAME'].str.contains('Height')) & (clinical_data_asmd['EVENTRESULT'] < 3)]
weight_low_percentile_asmd = clinical_data_asmd[(clinical_data_asmd['EVENTNAME'].str.contains('Weight')) & (clinical_data_asmd['EVENTRESULT'] < 3)]

height_low_percentile_gaucher = clinical_data_gaucher[(clinical_data_gaucher['EVENTNAME'].str.contains('Height')) & (clinical_data_gaucher['EVENTRESULT'] < 3)]
weight_low_percentile_gaucher = clinical_data_gaucher[(clinical_data_gaucher['EVENTNAME'].str.contains('Weight')) & (clinical_data_gaucher['EVENTRESULT'] < 3)]

height_low_percentile_asmd, weight_low_percentile_asmd, height_low_percentile_gaucher, weight_low_percentile_gaucher

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clinical_data_asmd['EVENTRESULT'] = pd.to_numeric(clinical_data_asmd['EVENTRESULT'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clinical_data_gaucher['EVENTRESULT'] = pd.to_numeric(clinical_data_gaucher['EVENTRESULT'], errors='coerce')


(         PERSONID   ENCNTRID  ORDERID  CLINICALEVENTID          EVENTDATETIME  \
 32508    15714196   90033255        0       8484211251   17/FEB/2015 17:03:00   
 87557    15714196  263809070        0      23214229795   01/NOV/2020 10:05:00   
 90854    15714196  231345670        0      22187166581   02/JUN/2020 14:00:00   
 91057    15714196  470369488        0      31800963847   15/JUN/2023 10:39:00   
 92831    15714196  486462086        0      32941351955   20/OCT/2023 15:12:00   
 ...           ...        ...      ...              ...                    ...   
 1031424  15714196  330660189        0      25434986353   13/JUL/2021 11:25:00   
 1035775  15714196  323650173        0      25192783183   20/JUN/2021 11:33:00   
 1049024  15714196  204421791        0      20546359912   30/OCT/2019 18:52:00   
 1053686  15714196  214230298        0      21197124973   16/JAN/2020 12:17:00   
 1054618  15714196  422120507        0      28770968580   16/JUL/2022 05:00:00   
 
              

In [13]:
# Select sample patients who triggered Rule 1
sample_patient_id_asmd = results_asmd[results_asmd['Rule1'] == True].iloc[0]['PERSONID']
sample_patient_id_gaucher = results_gaucher[results_gaucher['Rule1'] == True].iloc[0]['PERSONID']

# Check labs data for these patients
labs_data_asmd = labs_asmd[labs_asmd['PERSONID'] == sample_patient_id_asmd]
labs_data_gaucher = labs_gaucher[labs_gaucher['PERSONID'] == sample_patient_id_gaucher]

labs_data_asmd, labs_data_gaucher

(     PERSONID   ENCNTRID      ORDERID                 ORDERCATALOG  \
 0    13542357   28613479    402175483        Coagulation Screen TW   
 1    13542357   28613479    402175483        Coagulation Screen TW   
 2    13542357   28613479    402175483        Coagulation Screen TW   
 3    13542357   28613479    402175483        Coagulation Screen TW   
 4    13542357   28613479    402175585                CBC w/Diff TW   
 ..        ...        ...          ...                          ...   
 410  13542357  481571259  12841139889             Retic Count Auto   
 411  13542357  481571259  12841139889             Retic Count Auto   
 412  13542357  481571259  12841139969  Thyroid Stimulating Hormone   
 413  13542357  481571259  12841140165   Vitamin D 25 Hydroxy Level   
 414  13542357  481571259  12841151633            Vitamin B12 Level   
 
                  ORDERDATE    RESULTID RESULTVALUEALPHA  RESULTVALUE  \
 0     29/MAR/2011 10:31:17   122543692              NaN         9.20   


In [14]:
# Define CBC parameters
cbc_parameters = ['WBC', 'Hgb', 'Platelet', 'Neutro Auto #']

# Verify exact matches for CBC parameters for ASMD
exact_matches_asmd = {param: not labs_data_asmd[labs_data_asmd['TASKASSAY'] == param].empty for param in cbc_parameters}

# Verify exact matches for CBC parameters for Gaucher
exact_matches_gaucher = {param: not labs_data_gaucher[labs_data_gaucher['TASKASSAY'] == param].empty for param in cbc_parameters}

exact_matches_asmd, exact_matches_gaucher

({'WBC': True, 'Hgb': True, 'Platelet': True, 'Neutro Auto #': True},
 {'WBC': True, 'Hgb': True, 'Platelet': True, 'Neutro Auto #': True})

In [18]:
import pandas as pd

# Loading the results CSV files for verification purpose
results_asmd = pd.read_csv('/content/drive/MyDrive/dataset/results_asmd_with_rules.csv')
results_gaucher = pd.read_csv('/content/drive/MyDrive/dataset/results_gaucher_with_rules.csv')

# Loading the clinical and labs data for ASMD and Gaucher
clinical_asmd = pd.read_csv('/content/drive/MyDrive/dataset/Clinical.csv')
clinical_gaucher = pd.read_csv('/content/drive/MyDrive/dataset/Clinical_gaucher.csv')
labs_asmd = pd.read_csv('/content/drive/MyDrive/dataset/Labs.csv')
labs_gaucher = pd.read_csv('/content/drive/MyDrive/dataset/Labs_gaucher.csv')

# Function to check Rule 3 implementation
def verify_rule_3(data, clinical_data):
    verified_patients = []
    for index, row in data.iterrows():
        if row['Rule3']:
            person_id = row['PERSONID']
            patient_clinical = clinical_data[clinical_data['PERSONID'] == person_id].copy()
            patient_clinical['EVENTRESULT'] = pd.to_numeric(patient_clinical['EVENTRESULT'], errors='coerce')

            height_low_percentile = patient_clinical[(patient_clinical['EVENTNAME'].str.contains('Height')) & (patient_clinical['EVENTRESULT'] < 3)]
            weight_low_percentile = patient_clinical[(patient_clinical['EVENTNAME'].str.contains('Weight')) & (patient_clinical['EVENTRESULT'] < 3)]

            if not height_low_percentile.empty or not weight_low_percentile.empty:
                verified_patients.append(person_id)

    return verified_patients

# Verifying Rule 3 for ASMD
verified_asmd_rule3 = verify_rule_3(results_asmd, clinical_asmd)
# Verifying Rule 3 for Gaucher
verified_gaucher_rule3 = verify_rule_3(results_gaucher, clinical_gaucher)

# Function to check if CBC parameters are matched exactly
def verify_cbc_parameters(data, labs_data):
    verified_patients = []
    for index, row in data.iterrows():
        if row['Rule1']:
            person_id = row['PERSONID']
            patient_labs = labs_data[labs_data['PERSONID'] == person_id]
            exact_matches = True

            for param in ['WBC', 'Hgb', 'Platelet', 'Neutro Auto #']:
                param_labs = patient_labs[patient_labs['TASKASSAY'] == param]
                if param_labs.empty:
                    exact_matches = False
                    break

            if exact_matches:
                verified_patients.append(person_id)

    return verified_patients

# Verifying CBC parameters for ASMD
verified_asmd_cbc = verify_cbc_parameters(results_asmd, labs_asmd)
# Verifying CBC parameters for Gaucher
verified_gaucher_cbc = verify_cbc_parameters(results_gaucher, labs_gaucher)

# Adding a verification column for Rule 3 and CBC parameters
results_asmd['Rule3_Verified'] = results_asmd['PERSONID'].isin(verified_asmd_rule3)
results_gaucher['Rule3_Verified'] = results_gaucher['PERSONID'].isin(verified_gaucher_rule3)

results_asmd['CBC_Verified'] = results_asmd['PERSONID'].isin(verified_asmd_cbc)
results_gaucher['CBC_Verified'] = results_gaucher['PERSONID'].isin(verified_gaucher_cbc)

# Saving the updated ASMD results to a new CSV file
results_asmd.to_csv('/content/drive/MyDrive/dataset/verified_results_asmd_with_rules.csv', index=False)

# Saving the updated Gaucher results to a new CSV file
results_gaucher.to_csv('/content/drive/MyDrive/dataset/verified_results_gaucher_with_rules.csv', index=False)

# Downloading the verified ASMD results CSV file
from google.colab import files
files.download('/content/drive/MyDrive/dataset/verified_results_asmd_with_rules.csv')

# Downloading the verified Gaucher results CSV file
files.download('/content/drive/MyDrive/dataset/verified_results_gaucher_with_rules.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>