In [12]:
import pandas as pd
import numpy as np
from glob import glob
import os

# Home directory on the notebook server (hard-coded absolute path).
HOME_DIR = "/home/jovyan"
# Input CSVs for this project are read from <HOME_DIR>/arc-sg/data.
DATA_DIR = os.path.join(HOME_DIR, "arc-sg/data")
# Derived artefacts (pickles, filtered CSVs) are written to <HOME_DIR>/arc-sg/outputs.
OUTPUT_DIR = os.path.join(HOME_DIR, "arc-sg/outputs")

## Collate 5 years of lab data
Rerun the cells in this section each time a new batch of files is added; the raw CSVs are loaded in batches because of space constraints.

In [71]:
# Logical field name -> column header used in the source CSVs.
# NOTE(review): the temporary `usecols` list used when loading the lab extract
# reads 'Specimen Received Date' and 'Gender', neither of which is mapped here,
# while DATE points at 'Specimen Collection Date' -- confirm which headers the
# final extract carries before relying on FIELD["DATE"].
FIELD = dict(
    ID="Patient ID",
    RACE="Race",
    DOB="Date of Birth",
    Nationality="Nationality",
    INSTITUTION="Institution Code",
    TEST_NAME="Lab Resulted Order Test Description",
    RESULT="Result Value",
    DATE="Specimen Collection Date",
)

In [72]:
FIELD.values()

dict_values(['Patient ID', 'Race', 'Date of Birth', 'Nationality', 'Institution Code', 'Lab Resulted Order Test Description', 'Result Value', 'Specimen Collection Date'])

In [47]:
def aggregate_csvs(input_dir, curr_df=None, usecols=None):
    """Read every CSV matching a glob pattern and append the rows to an existing frame.

    Parameters
    ----------
    input_dir : str
        Glob pattern (not a bare directory), e.g. ``"/data/batch/*.csv"``.
    curr_df : pandas.DataFrame, optional
        Rows collected so far. They are placed first in the result. ``None``
        (the default) or a completely empty frame contributes nothing.
    usecols : list-like or callable, optional
        Forwarded to ``pandas.read_csv``. ``None`` or an empty list-like means
        "read every column" (an empty list passed straight to pandas would
        select no columns at all).

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index; ``curr_df`` itself is never
        modified or returned. An empty frame is returned when there is
        nothing to combine.
    """
    # Normalise column selection: accept any list-like (e.g. dict.values())
    # and collapse "empty" to None so pandas reads all columns.
    if usecols is not None and not callable(usecols):
        usecols = list(usecols) or None

    frames = []
    # A 0x0 frame adds nothing to the result; skip it so concat only sees real data.
    if curr_df is not None and curr_df.shape != (0, 0):
        frames.append(curr_df)

    # glob() returns files in arbitrary order; sort for a reproducible row order.
    for file in sorted(glob(input_dir)):
        frames.append(pd.read_csv(file, usecols=usecols))

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [45]:
# Glob pattern matching every CSV in the "SGH 2015-2022" data folder.
input_dir = os.path.join(DATA_DIR, "SGH 2015-2022", "*.csv")
# Temporary column subset for the lab extract; the commented-out line below
# shows the intended replacement (all columns named in FIELD).
usecols = ['Patient ID','Gender', 'Date of Birth','Specimen Received Date', 'Lab Resulted Order Test Description','Result Value' ] # temporarily
# usecols = FIELD.values()
# Start from an empty frame; the aggregation cell passes it in as the rows collected so far.
labs_df = pd.DataFrame()

In [62]:
# Append every CSV currently in the folder to the rows already held in labs_df.
# The glob pattern is rebuilt here rather than reusing `input_dir` from the setup cell.
# NOTE(review): because labs_df is passed back in, rerunning this cell without
# swapping out the files on disk would append the same rows again -- confirm the
# batch workflow removes processed files first.
labs_df = aggregate_csvs(os.path.join(DATA_DIR, "SGH 2015-2022", "*.csv"), labs_df, usecols)

In [64]:
# Persist the collated labs so later sections can reload them without re-reading the CSVs.
# NOTE(review): this writes "labs_2015-2022(2).pkl", whereas the exploration
# section reloads "labs_2015-2022.pkl" (no "(2)") -- confirm which file is intended.
labs_path = os.path.join(OUTPUT_DIR, "labs_2015-2022(2).pkl")
labs_df.to_pickle(labs_path)

In [65]:
labs_df

Unnamed: 0,Patient ID,Gender,Date of Birth,Specimen Received Date,Lab Resulted Order Test Description,Result Value
0,7f7a4ad5417e81870f61,FEMALE,1948-05-01,2015-11-07,CA 19-9,120
1,7f7a4ad5417e81870f61,FEMALE,1948-05-01,2015-11-07,BICARBONATE,22.6
2,7f7a4ad5417e81870f61,FEMALE,1948-05-01,2015-11-07,CHLORIDE,101
3,7f7a4ad5417e81870f61,FEMALE,1948-05-01,2015-11-07,CREATININE,61
4,7f7a4ad5417e81870f61,FEMALE,1948-05-01,2015-11-07,GLUCOSE,7.6
...,...,...,...,...,...,...
42295993,7a8e18da55a32ca69917,FEMALE,1928-01-01,2022-09-30,GLUCOSE,8.4
42295994,7a8e18da55a32ca69917,FEMALE,1928-01-01,2022-09-30,POTASSIUM,4.0
42295995,7a8e18da55a32ca69917,FEMALE,1928-01-01,2022-09-30,SODIUM,130
42295996,7a8e18da55a32ca69917,FEMALE,1928-01-01,2022-09-30,UREA,9.9


## Lab test exploration

In [None]:
# Reload previously collated labs from the output folder.
# NOTE(review): the collation section saves to "labs_2015-2022(2).pkl"; this
# reads "labs_2015-2022.pkl" -- confirm the two are meant to differ.
# Only unpickle files produced by this project: read_pickle can execute code
# embedded in an untrusted pickle.
labs_path = os.path.join(OUTPUT_DIR, "labs_2015-2022.pkl")
labs_df = pd.read_pickle(labs_path)

In [74]:
# Rows whose test description mentions both "glucose" and "fasting"
# (case-insensitive; rows with a missing description are excluded).
glucose_mask = (
    labs_df[FIELD["TEST_NAME"]].str.contains("glucose", case=False, na=False)
    & labs_df[FIELD["TEST_NAME"]].str.contains("fasting", case=False, na=False)
)

In [79]:
ldl_mask = labs_df[FIELD["TEST_NAME"]].str.contains("ldl", case=False, na=False) # case-insensitive substring match; missing descriptions count as no match

In [76]:
# Use the fasting-glucose mask built above; the bare name `mask` is never
# defined in this notebook and fails on a fresh kernel.
fasting_glucose = labs_df[glucose_mask]

In [78]:
fasting_glucose[FIELD["TEST_NAME"]].unique()

array(['GLUCOSE,PLASMA FASTING', 'GLUCOSE FASTING'], dtype=object)

In [80]:
# All lab rows whose test description contains "ldl" (see ldl_mask).
ldl = labs_df[ldl_mask]

In [81]:
ldl[FIELD["TEST_NAME"]].unique()

array(['LDL-CHOLESTEROL,CALCULATED', 'CHOLESTEROL,TG,HDL,LDL',
       'LDL-CHOLESTEROL,DIRECT'], dtype=object)

In [85]:
# Inspect the combined lipid-panel order: list the distinct result values recorded
# under the "CHOLESTEROL,TG,HDL,LDL" description.
test = ldl[ldl[FIELD["TEST_NAME"]]=="CHOLESTEROL,TG,HDL,LDL"]
test[FIELD["RESULT"]].unique()

array([nan], dtype=object)

In [86]:
labs_df[labs_df[FIELD["TEST_NAME"]].str.contains("triglycerides", case=False, na=False)]

Unnamed: 0,Patient ID,Gender,Date of Birth,Specimen Received Date,Lab Resulted Order Test Description,Result Value
2819,933608e368815cc46896,MALE,1988-01-01,2015-12-02,TRIGLYCERIDES,1.99
5002,43865135afb39167c7b6,FEMALE,1950-08-01,2015-11-29,TRIGLYCERIDES,2.37
5127,8fac02779c8f35d03a19,FEMALE,1950-04-01,2015-12-01,TRIGLYCERIDES,0.96
5374,fd942746204678bc2b27,MALE,1992-03-01,2015-11-05,TRIGLYCERIDES,0.61
6696,0e9d0d2794f14ab8995c,MALE,1978-04-01,2015-11-28,TRIGLYCERIDES,1.75
...,...,...,...,...,...,...
42290350,f2e95d47f7e2f7211507,FEMALE,1946-07-01,2022-09-21,TRIGLYCERIDES,1.44
42291821,0cf707bc69dca7f313af,MALE,1935-04-01,2022-09-22,TRIGLYCERIDES,0.80
42293177,df5120b92e704b35276b,MALE,1944-06-01,2022-09-10,TRIGLYCERIDES,1.03
42294327,0ba5e7c518f71f9fe6ac,FEMALE,1932-08-01,2022-09-27,TRIGLYCERIDES,0.66


## Lab test filtering

In [90]:
# Fasting-glucose rows: description mentions both "glucose" and "fasting".
glucose_mask = labs_df[FIELD["TEST_NAME"]].str.contains("glucose", case=False, na=False) & labs_df[FIELD["TEST_NAME"]].str.contains("fasting", case=False, na=False)
# Rows for the other tests of interest (regex alternation, case-insensitive).
# NOTE(review): neither mask is referenced by the filtering cell that follows,
# which builds its own pattern -- confirm whether these are still needed.
other_tests_mask = labs_df[FIELD["TEST_NAME"]].str.contains('HBA1C|TRIGLYCERIDES|LDL-CHOLESTEROL,CALCULATED', case=False, na=False)

In [92]:
# Keep only the tests of interest and normalise the two fasting-glucose spellings
# to 'GLUCOSE FASTING'.
# The filter runs first and the rename is applied to a copy, so labs_df is left
# untouched (a plain `filtered_labs_df = labs_df` is only an alias, and assigning
# a column through it rewrites labs_df as well).
# NOTE(review): the substring match also keeps descriptions such as
# 'TRIGLYCERIDES,FLUID' and 'TRIGLYCERIDES,URINE' -- confirm these are wanted.
test_name_col = FIELD["TEST_NAME"]
keep_mask = labs_df[test_name_col].str.contains(
    'HBA1C|TRIGLYCERIDES|LDL-CHOLESTEROL,CALCULATED|GLUCOSE FASTING|GLUCOSE,PLASMA FASTING',
    case=False,
    na=False,
)
filtered_labs_df = labs_df.loc[keep_mask].copy()
filtered_labs_df[test_name_col] = filtered_labs_df[test_name_col].str.replace(
    'GLUCOSE,PLASMA FASTING', 'GLUCOSE FASTING', case=False, regex=True
)

In [97]:
# Export the filtered labs to the output folder; index=False leaves the row index out of the file.
filtered_labs_df.to_csv(os.path.join(OUTPUT_DIR, "filtered_labs.csv"), index=False)

## Counting the number of tests within the window from start (DM dx date) to end (cancer dx date)

In [100]:
filtered_labs_df[FIELD["TEST_NAME"]].unique()

array(['HBA1C', 'GLUCOSE FASTING', 'LDL-CHOLESTEROL,CALCULATED',
       'TRIGLYCERIDES', 'TRIGLYCERIDES,FLUID', 'TRIGLYCERIDES,URINE',
       'HBA1C IFCC', 'HBA1c, blood (dual reporting)'], dtype=object)

In [102]:
# Output column label -> lowercase keyword for that test.
# The labels (plus "<label> Dates") become the keys of the per-patient result dict.
# NOTE(review): the keywords are presumably meant to be matched case-insensitively
# against the test description column -- confirm once the extraction step is written.
tests = {
    "Fasting Glucose": "glucose",
    "HbA1c": "hba1c",
    "Triglycerides": "triglycerides",
    "LDL": "ldl"
}

In [98]:
def get_DM_dx(df):
    """Return the diabetes flag and diabetes-diagnosis date for one patient's lab rows.

    Placeholder implementation: no diagnosis data is consulted yet, so every
    patient is reported as diabetic and the earliest ``FIELD["DATE"]`` value
    among the given rows stands in for the diagnosis date.

    Parameters
    ----------
    df : pandas.DataFrame
        Lab rows for a single patient; must contain the ``FIELD["DATE"]`` column
        and at least one row.

    Returns
    -------
    tuple
        ``(is_diabetic, DM_dx_dt)`` where ``is_diabetic`` is currently always
        ``True`` and ``DM_dx_dt`` is the first date after sorting ascending.
    """
    # Determine if currently has diabetes (TODO: hard-coded until diagnosis data is wired in)
    is_diabetic = True

    # Determine if there are values to be excluded if there is a DM date within the lab values
    sorted_df = df.sort_values(by=FIELD["DATE"])
    # Read from the sorted frame: taking iloc[0] of the unsorted input would
    # return whichever row happens to come first, not the earliest date.
    DM_dx_dt = sorted_df[FIELD["DATE"]].iloc[0]
    return is_diabetic, DM_dx_dt

In [105]:
# Scratch check of the per-patient result layout: a criteria flag followed by
# an empty value list and an empty date list for every entry in `tests`.
result = {"meets_criteria": True}

for key in tests:
    result[key] = []
    result[key + " Dates"] = []

In [106]:
result

{'meets_criteria': True,
 'Fasting Glucose': [],
 'Fasting Glucose Dates': [],
 'HbA1c': [],
 'HbA1c Dates': [],
 'Triglycerides': [],
 'Triglycerides Dates': [],
 'LDL': [],
 'LDL Dates': []}

In [99]:
def get_cancer_dx(df):
    """Placeholder cancer lookup: returns an empty cancer list and an empty date list.

    ``df`` is accepted but not inspected yet.
    """
    return [], []

In [None]:
def agg_pt_details(grp):
    """Summarise one patient's lab rows into a single result Series.

    Intended for ``groupby(...).apply``. For every entry in ``tests`` the result
    holds the result values and dates of matching lab rows that fall between the
    diabetes-diagnosis date (from ``get_DM_dx``) and the earliest cancer-diagnosis
    date (from ``get_cancer_dx``), when one exists.

    Parameters
    ----------
    grp : pandas.DataFrame
        Lab rows for a single patient, with the ``FIELD["TEST_NAME"]``,
        ``FIELD["RESULT"]`` and ``FIELD["DATE"]`` columns.

    Returns
    -------
    pandas.Series
        Keys: ``"meets_criteria"``, ``"CANCER"``, ``"CANCER Date"`` and, for each
        label in ``tests``, ``label`` and ``label + " Dates"`` (lists ordered by date).
    """
    # Initialise dict
    result = {"meets_criteria": True,
              "CANCER": [],
              "CANCER Date": []}
    for key in tests:  # `tests` is a dict; iterate it, do not call it
        result.update({key: [], key + " Dates": []})

    # Check for diabetes, return if no diabetes
    is_diabetic, DM_dx_dt = get_DM_dx(grp)
    if is_diabetic is False:
        result["meets_criteria"] = False
        return pd.Series(result)

    # Check for cancer (uses this patient's rows, i.e. `grp`)
    cancers, cancer_dts = get_cancer_dx(grp)
    result["CANCER"] = cancers
    result["CANCER Date"] = cancer_dts
    earliest_cancer_dt = None
    if len(cancer_dts) > 0:
        earliest_cancer_dt = sorted(cancer_dts)[0]

    # Extract tests within given window
    # NOTE(review): both bounds are treated as inclusive, and the window is left
    # open-ended when no cancer date is known -- confirm against the study design.
    # Assumes the date column compares correctly with the diagnosis dates
    # (e.g. ISO-formatted strings or datetimes) -- TODO confirm.
    end_dt = earliest_cancer_dt
    dates = grp[FIELD["DATE"]]
    in_window = dates >= DM_dx_dt
    if end_dt is not None:
        in_window = in_window & (dates <= end_dt)

    for key, val in tests.items():
        # Case-insensitive keyword match on the test description.
        is_test = grp[FIELD["TEST_NAME"]].str.contains(val, case=False, na=False)
        hits = grp[in_window & is_test].sort_values(by=FIELD["DATE"])
        result[key] = hits[FIELD["RESULT"]].tolist()
        result[key + " Dates"] = hits[FIELD["DATE"]].tolist()

    return pd.Series(result)

In [None]:
# Build one summary row per patient by running agg_pt_details on each
# patient's block of filtered lab rows.
pt_df = (
    filtered_labs_df
    .groupby([FIELD["ID"]])
    .apply(agg_pt_details)
)

## Load and filter diagnosis data

In [67]:
# None tells pandas.read_csv to load every column; an empty list would select
# no columns at all. Replace with an explicit list once the diagnosis columns
# of interest are known.
# Note: this rebinds the notebook-level `usecols` that the lab-loading cells also use.
usecols = None
# Start from an empty frame; the aggregation cell passes it in as the rows collected so far.
diag_df = pd.DataFrame()

In [68]:
# Collate every diagnosis CSV. These files are read from <HOME_DIR>/elpha-data/diagnosis,
# not from DATA_DIR like the lab files.
# NOTE(review): the saved traceback under this cell lists lab column names, which
# suggests it last ran while `usecols` still held the lab column list -- rerun the
# setup cell above first.
diag_df = aggregate_csvs(os.path.join(HOME_DIR, "elpha-data/diagnosis", "*.csv"), diag_df, usecols)

ValueError: Usecols do not match columns, columns expected but not found: ['Lab Resulted Order Test Description', 'Specimen Received Date', 'Result Value']