In [1]:
import pandas as pd
import numpy as np
from glob import glob
import os
from pandas.tseries.offsets import DateOffset

HOME_DIR = "/home/jovyan"
DATA_DIR = os.path.join(HOME_DIR, "arc-sg/data")
OUTPUT_DIR = os.path.join(HOME_DIR, "arc-sg/outputs")

## Collate 5 years of lab data
Rerun as files are added in batches (space issues)

In [2]:
FIELD = {
    "ID": "Patient ID",
    "RACE": "Race",
    "DOB": "Date of Birth",
    "Nationality": "Nationality",
    "INSTITUTION": "Institution Code",
    "TEST_NAME": "Lab Resulted Order Test Description",
    "RESULT": "Result Value",
    "DATE": "Specimen Received Date" # "Specimen Collection Date",
}

In [3]:
FIELD.values()

dict_values(['Patient ID', 'Race', 'Date of Birth', 'Nationality', 'Institution Code', 'Lab Resulted Order Test Description', 'Result Value', 'Specimen Received Date'])

In [4]:
def aggregate_csvs(input_dir, curr_df = pd.DataFrame(), usecols=[]):
    files = glob(input_dir)
    df_list = []
    df_list = [pd.read_csv(file, usecols=usecols) for file in files]
    df_all  = pd.concat([curr_df, *df_list], ignore_index=True)
    return df_all

In [5]:
input_dir = os.path.join(DATA_DIR, "SGH 2015-2022", "*.csv")
usecols = ['Patient ID','Gender', 'Date of Birth','Specimen Received Date', 'Lab Resulted Order Test Description','Result Value' ] # temporarily
# usecols = FIELD.values()
labs_df = pd.DataFrame()

In [6]:
labs_df = aggregate_csvs(os.path.join(DATA_DIR, "SGH 2015-2022", "*.csv"), labs_df, usecols)

In [7]:
labs_path = os.path.join(OUTPUT_DIR, "labs_2015-2022(2).pkl")
labs_df.to_pickle(labs_path)

In [8]:
labs_df

Unnamed: 0,Patient ID,Gender,Date of Birth,Specimen Received Date,Lab Resulted Order Test Description,Result Value
0,e01cba745894ac232087,FEMALE,1948-10-01,2022-04-29,APTT,28.5
1,e01cba745894ac232087,FEMALE,1948-10-01,2022-04-29,APTT,
2,e01cba745894ac232087,FEMALE,1948-10-01,2022-04-29,"GLUCOSE,POCT",13.2
3,e01cba745894ac232087,FEMALE,1948-10-01,2022-04-29,"GLUCOSE,POCT",13.3
4,e01cba745894ac232087,FEMALE,1948-10-01,2022-04-29,"GLUCOSE,POCT",14.7
...,...,...,...,...,...,...
7452178,adcbf7dedb993df1f565,FEMALE,1998-02-01,2022-10-04,RBC DIST WIDTH,16.7
7452179,adcbf7dedb993df1f565,FEMALE,1998-02-01,2022-10-04,TOTAL ABS COUNT,6.67
7452180,adcbf7dedb993df1f565,FEMALE,1998-02-01,2022-10-04,TOTAL CELL COUNT,100.0
7452181,adcbf7dedb993df1f565,FEMALE,1998-02-01,2022-10-04,WBC,6.67


## Lab test exploration

In [9]:
glucose_mask = labs_df[FIELD["TEST_NAME"]].str.contains("glucose", case=False, na=False) & labs_df[FIELD["TEST_NAME"]].str.contains("fasting", case=False, na=False)

In [10]:
ldl_mask = labs_df[FIELD["TEST_NAME"]].str.contains("ldl", case=False, na=False) # 

In [12]:
fasting_glucose = labs_df[glucose_mask]

In [13]:
fasting_glucose[FIELD["TEST_NAME"]].unique()

array(['GLUCOSE,PLASMA FASTING', 'GLUCOSE FASTING'], dtype=object)

In [14]:
ldl = labs_df[ldl_mask]

In [15]:
ldl[FIELD["TEST_NAME"]].unique()

array(['LDL-CHOLESTEROL,CALCULATED', 'CHOLESTEROL,TG,HDL,LDL',
       'LDL-CHOLESTEROL,DIRECT'], dtype=object)

## Lab test filtering

In [16]:
test = ldl[ldl[FIELD["TEST_NAME"]]=="CHOLESTEROL,TG,HDL,LDL"]
test[FIELD["RESULT"]].unique()

array([nan], dtype=object)

In [17]:
labs_df[labs_df[FIELD["TEST_NAME"]].str.contains("triglycerides", case=False, na=False)]

Unnamed: 0,Patient ID,Gender,Date of Birth,Specimen Received Date,Lab Resulted Order Test Description,Result Value
266,5ba30677eb2e5c1c92aa,FEMALE,1957-02-01,2022-04-09,TRIGLYCERIDES,1.54
994,0f01de635ab6d46dd8f7,FEMALE,1955-04-01,2022-05-12,TRIGLYCERIDES,1.17
1046,412161ca4033a5b1e89f,FEMALE,1935-12-01,2022-04-20,TRIGLYCERIDES,1.12
1262,643e24c370bc88a73094,FEMALE,1926-04-01,2022-04-25,TRIGLYCERIDES,0.73
1330,2b77d04e272689d98bbd,FEMALE,1949-08-01,2022-04-10,TRIGLYCERIDES,1.16
...,...,...,...,...,...,...
7443452,a1daf818f86497ea9a13,MALE,1983-09-01,2022-09-26,TRIGLYCERIDES,0.92
7443536,899858769c6500b3189e,FEMALE,1950-05-01,2022-09-26,TRIGLYCERIDES,1.53
7444187,4f57812b1a9231071221,MALE,1939-02-01,2022-10-02,TRIGLYCERIDES,0.79
7445799,b9ce3c1a7b3ff8ba0347,MALE,1943-11-01,2022-09-15,TRIGLYCERIDES,0.46


In [23]:
labs_path = os.path.join(OUTPUT_DIR, "labs_2015-2022.pkl")
labs_df = pd.read_pickle(labs_path)

In [24]:
glucose_mask = labs_df[FIELD["TEST_NAME"]].str.contains("glucose", case=False, na=False) & labs_df[FIELD["TEST_NAME"]].str.contains("fasting", case=False, na=False)
other_tests_mask = labs_df[FIELD["TEST_NAME"]].str.contains('HBA1C|TRIGLYCERIDES|LDL-CHOLESTEROL,CALCULATED|LDL-CHOLESTEROL,DIRECT', case=False, na=False)

In [25]:
filtered_labs_df = labs_df
filtered_labs_df['Lab Resulted Order Test Description'] = labs_df['Lab Resulted Order Test Description'].str.replace('GLUCOSE,PLASMA FASTING', 'GLUCOSE FASTING', case=False, regex=True)
filtered_labs_df = filtered_labs_df[filtered_labs_df['Lab Resulted Order Test Description'].str.contains('HBA1C|TRIGLYCERIDES|LDL-CHOLESTEROL,CALCULATED|LDL-CHOLESTEROL,DIRECT|GLUCOSE FASTING', case=False, na=False)]

In [27]:
##converting all LDL labels to LAD-CHOLESTEROL
filtered_labs_df['Lab Resulted Order Test Description'] = filtered_labs_df['Lab Resulted Order Test Description'].str.replace('LDL-CHOLESTEROL,CALCULATED|LDL-CHOLESTEROL,DIRECT', 'LDL-CHOLESTEROL', case=False,regex=True)
filtered_labs_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_labs_df['Lab Resulted Order Test Description'] = filtered_labs_df['Lab Resulted Order Test Description'].str.replace('LDL-CHOLESTEROL,CALCULATED|LDL-CHOLESTEROL,DIRECT', 'LDL-CHOLESTEROL', case=False,regex=True)


Unnamed: 0,Patient ID,Gender,Date of Birth,Specimen Received Date,Lab Resulted Order Test Description,Result Value
1849,19b1539322147af94e95,FEMALE,1926-03-01,2015-11-23,HBA1C,7.0
2181,c1926d2f0b958145121e,MALE,1965-10-01,2015-11-03,HBA1C,6.2
2230,dff6f178b1dc7a7240d9,FEMALE,1971-06-01,2015-11-27,HBA1C,6.1
2502,1c12adf8538a182f2d24,MALE,1970-01-01,2015-11-25,GLUCOSE FASTING,9.3
2563,f5e124aa01edd126ac74,MALE,1960-10-01,2015-11-24,HBA1C,10.3
...,...,...,...,...,...,...
42295175,dccd5ee362ce9d9a2638,FEMALE,1933-11-01,2022-09-08,HBA1C IFCC,41
42295178,dccd5ee362ce9d9a2638,FEMALE,1933-11-01,2022-09-08,LDL-CHOLESTEROL,2.59
42295179,dccd5ee362ce9d9a2638,FEMALE,1933-11-01,2022-09-08,TRIGLYCERIDES,1.96
42295294,75912433644a57c99679,FEMALE,1939-07-01,2022-09-09,HBA1C,8.1


In [29]:
filtered_labs_df.to_csv(os.path.join(OUTPUT_DIR, "filtered_labs.csv"), index=False)

#Finalising HbA1C and Triglyceride data

In [33]:
filtered_labs_df = pd.read_csv(os.path.join(OUTPUT_DIR, "filtered_labs.csv"))
filtered_labs_df[FIELD["DATE"]] = pd.to_datetime(filtered_labs_df[FIELD["DATE"]])

In [41]:
filtered_labs_df[FIELD["TEST_NAME"]].unique()

array(['HBA1C', 'GLUCOSE FASTING', 'LDL-CHOLESTEROL', 'TRIGLYCERIDES',
       'TRIGLYCERIDES,FLUID', 'TRIGLYCERIDES,URINE', 'HBA1C IFCC',
       'HBA1c, blood (dual reporting)'], dtype=object)

In [67]:
filtered_labs_df = filtered_labs_df[~filtered_labs_df['Lab Resulted Order Test Description'].str.contains('TRIGLYCERIDES,FLUID | TRIGLYCERIDES,URINE', case=False, na=False)]

In [69]:
filtered_labs_df['Lab Resulted Order Test Description'].unique()

array(['HBA1C', 'GLUCOSE FASTING', 'LDL-CHOLESTEROL', 'TRIGLYCERIDES',
       'TRIGLYCERIDES,FLUID', 'TRIGLYCERIDES,URINE', 'HBA1C IFCC',
       'HBA1c, blood (dual reporting)'], dtype=object)

# Convert HbA1C IFCC values into percentage values

In [129]:
filtered_labs_df['Result Value']=pd.to_numeric(filtered_labs_df['Result Value'], errors='coerce')
for index, value in filtered_labs_df['Lab Resulted Order Test Description'].items():
    if value == 'HBA1C IFCC':
        filtered_labs_df.at[index, 'Result Value'] = 0.09148* filtered_labs_df.at[index, 'Result Value']+ 2.152
        filtered_labs_df.at[index, 'Lab Resulted Order Test Description'] = 'HBA1C'
        
filtered_labs_df

Unnamed: 0,Patient ID,Gender,Date of Birth,Specimen Received Date,Lab Resulted Order Test Description,Result Value
0,19b1539322147af94e95,FEMALE,1926-03-01,2015-11-23,HBA1C,7.00000
1,c1926d2f0b958145121e,MALE,1965-10-01,2015-11-03,HBA1C,6.20000
2,dff6f178b1dc7a7240d9,FEMALE,1971-06-01,2015-11-27,HBA1C,6.10000
3,1c12adf8538a182f2d24,MALE,1970-01-01,2015-11-25,GLUCOSE FASTING,9.30000
4,f5e124aa01edd126ac74,MALE,1960-10-01,2015-11-24,HBA1C,10.30000
...,...,...,...,...,...,...
191132,dccd5ee362ce9d9a2638,FEMALE,1933-11-01,2022-09-08,HBA1C,5.90268
191133,dccd5ee362ce9d9a2638,FEMALE,1933-11-01,2022-09-08,LDL-CHOLESTEROL,2.59000
191134,dccd5ee362ce9d9a2638,FEMALE,1933-11-01,2022-09-08,TRIGLYCERIDES,1.96000
191135,75912433644a57c99679,FEMALE,1939-07-01,2022-09-09,HBA1C,8.10000


In [137]:
dff = filtered_labs_df[filtered_labs_df['Lab Resulted Order Test Description']=='HBA1c, blood (dual reporting)']
dff

Unnamed: 0,Patient ID,Gender,Date of Birth,Specimen Received Date,Lab Resulted Order Test Description,Result Value
124476,fa459b06e8b1aa3ee756,MALE,1995-05-01,2020-06-02,"HBA1c, blood (dual reporting)",
124674,c13d5a1581a49e20d93e,FEMALE,1979-06-01,2020-06-30,"HBA1c, blood (dual reporting)",
125612,e276bf2aa93b2b10fed7,MALE,1954-10-01,2020-11-03,"HBA1c, blood (dual reporting)",
125810,611a247fac3331c131e7,FEMALE,1939-10-01,2021-01-02,"HBA1c, blood (dual reporting)",
125923,efe80c62e33e89d79d1b,FEMALE,1994-07-01,2020-12-14,"HBA1c, blood (dual reporting)",
...,...,...,...,...,...,...
189490,36d613ca1aef17d8b2ba,MALE,1953-05-01,2022-07-30,"HBA1c, blood (dual reporting)",
189590,1636a78621c4350fa7df,FEMALE,1940-09-01,2022-07-05,"HBA1c, blood (dual reporting)",
189794,4e9fd2f4fe0c5e8b6cb7,MALE,1953-01-01,2022-07-28,"HBA1c, blood (dual reporting)",
190190,2a567a0b99b2ce479d36,MALE,1951-07-01,2022-07-20,"HBA1c, blood (dual reporting)",


# Drop rows with 'TRIGLYCERIDES,FLUID', 'TRIGLYCERIDES,URINE', 'HBA1c, blood (dual reporting)''

In [138]:
filtered_labs_df = filtered_labs_df[~filtered_labs_df['Lab Resulted Order Test Description'].isin (['TRIGLYCERIDES,FLUID' , 'TRIGLYCERIDES,URINE', 'HBA1c, blood (dual reporting)'])]
filtered_labs_df

Unnamed: 0,Patient ID,Gender,Date of Birth,Specimen Received Date,Lab Resulted Order Test Description,Result Value
0,19b1539322147af94e95,FEMALE,1926-03-01,2015-11-23,HBA1C,7.00000
1,c1926d2f0b958145121e,MALE,1965-10-01,2015-11-03,HBA1C,6.20000
2,dff6f178b1dc7a7240d9,FEMALE,1971-06-01,2015-11-27,HBA1C,6.10000
3,1c12adf8538a182f2d24,MALE,1970-01-01,2015-11-25,GLUCOSE FASTING,9.30000
4,f5e124aa01edd126ac74,MALE,1960-10-01,2015-11-24,HBA1C,10.30000
...,...,...,...,...,...,...
191132,dccd5ee362ce9d9a2638,FEMALE,1933-11-01,2022-09-08,HBA1C,5.90268
191133,dccd5ee362ce9d9a2638,FEMALE,1933-11-01,2022-09-08,LDL-CHOLESTEROL,2.59000
191134,dccd5ee362ce9d9a2638,FEMALE,1933-11-01,2022-09-08,TRIGLYCERIDES,1.96000
191135,75912433644a57c99679,FEMALE,1939-07-01,2022-09-09,HBA1C,8.10000


## Counting number of tests for within start (DM dx date) and end (Cancer dx date)

In [32]:
tests = {
    "Fasting Glucose": "glucose",
    "HbA1c": "hba1c",
    "Triglycerides": "triglycerides",
    "LDL": "ldl"
}

In [None]:
def get_DM_dx(df):
    # Determine if currently has diabetes
    is_diabetic = True
    
    # Determine if there are values to be excluded if there is a DM date within the lab values
    DM_dx_dt = df[FIELD["DATE"]].iloc[0]
    return is_diabetic, DM_dx_dt

In [None]:
def get_cancer_dx(df):
    cancers = []
    dts = []
    return cancers, dts

In [None]:
def agg_pt_details(grp):
    grp = grp.sort_values(by=FIELD["DATE"])
    # Initialise dict
    result = {"meets_criteria": True, 
              "CANCER": [],
              "CANCER Date": []}
    for key in tests:
        result.update({key: [], key + " Dates": []})
    
    # 01 Check for diabetes, return if no diabetes
    
    is_diabetic, DM_dx_dt = get_DM_dx(grp)
    if is_diabetic is False:
        result["meets_criteria"] = False
        return pd.Series(result)

    # 02 Check for cancer
    
    cancers, cancer_dts = get_cancer_dx(grp)
    earliest_cancer_dt = None
    if (len(cancer_dts) > 0):
        earliest_cancer_dt = sorted(cancer_dt)[0]
 
    # 03 Extract tests within given window
    
    date_mask = pd.Series(True, index=grp.index)
    if DM_dx_dt is not None:
        date_mask &= grp[FIELD["DATE"]] >= pd.to_datetime(DM_dx_dt)
    if earliest_cancer_dt is not None:
        date_mask &= grp[FIELD["DATE"]] >= pd.to_datetime(earliest_cancer_dt) - pd.DateOffset(years=1)
    windowed_grp = grp[date_mask]
    
    for key, val in tests.items():
        test_grp = windowed_grp[grp[FIELD["TEST_NAME"].str.contains(val, case=False, na=False)]]
        result["CANCER"] = test_grp[FIELD["RESULT"]].to_numpy()
        result["CANCER date"] = test_grp[FIELD["DATE"]].to_numpy()
        
    return pd.Series(result)

In [None]:
filtered_labs_df

In [None]:
pt_df = filtered_labs_df.groupby([FIELD["ID"]]).apply(agg_pt_details)

## Load and filter diagnosis data

In [None]:
usecols = []
diag_df = pd.DataFrame()

In [None]:
diag_df = aggregate_csvs(os.path.join(HOME_DIR, "elpha-data/diagnosis", "*.csv"), diag_df, usecols)

In [109]:
#WIDE DATA FORMAT

In [275]:
grouped_data = filtered_labs_df.groupby(['Patient ID','Gender','Date of Birth','Lab Resulted Order Test Description'])\
.apply(lambda x: list(zip(pd.to_datetime(x['Specimen Received Date']).dt.strftime('%Y-%m-%d'), x['Result Value']))) \
.reset_index(name = 'date_result')

In [276]:
wide_data = grouped_data.pivot(index=['Patient ID','Gender','Date of Birth'], columns = 'Lab Resulted Order Test Description', values = 'date_result')

In [277]:
for test in ['HBA1C', 'GLUCOSE FASTING', 'LDL-CHOLESTEROL', 'TRIGLYCERIDES']:
    wide_data[f'{test}_name']=test
    wide_data[f'{test}_results'] = wide_data[test]
    wide_data.drop(columns = [test], inplace=True)
    

In [278]:
for test in ['HBA1C', 'GLUCOSE FASTING', 'LDL-CHOLESTEROL', 'TRIGLYCERIDES']:
    wide_data[f'{test}_results'] = wide_data[f'{test}_results'].apply(
             lambda lst: sorted(lst, key=lambda x: pd.to_datetime(x[0])) if isinstance(lst, list) else lst
    )

In [279]:
wide_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Lab Resulted Order Test Description,HBA1C_name,HBA1C_results,GLUCOSE FASTING_name,GLUCOSE FASTING_results,LDL-CHOLESTEROL_name,LDL-CHOLESTEROL_results,TRIGLYCERIDES_name,TRIGLYCERIDES_results
Patient ID,Gender,Date of Birth,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
00031c3262ee7a0c2981,FEMALE,1988-05-01,HBA1C,"[(2015-04-10, 4.7)]",GLUCOSE FASTING,,LDL-CHOLESTEROL,"[(2015-04-10, 2.46)]",TRIGLYCERIDES,"[(2015-04-10, 0.77)]"
0005f0349ba521e1ecb8,MALE,1964-08-01,HBA1C,,GLUCOSE FASTING,,LDL-CHOLESTEROL,"[(2017-07-14, 2.33)]",TRIGLYCERIDES,"[(2017-07-14, 2.88)]"
0009747ecbb93007f1bc,MALE,1946-12-01,HBA1C,"[(2021-04-21, 5.5), (2021-04-21, 5.53676)]",GLUCOSE FASTING,,LDL-CHOLESTEROL,,TRIGLYCERIDES,
000b2a238717215130d6,FEMALE,1956-07-01,HBA1C,"[(2018-09-20, 5.5), (2021-02-14, 6.1), (2021-0...",GLUCOSE FASTING,"[(2018-09-20, 5.6)]",LDL-CHOLESTEROL,"[(2018-09-20, 3.74), (2021-02-14, 2.09)]",TRIGLYCERIDES,"[(2018-09-20, 1.18), (2021-02-14, 1.12)]"
000d121054e0f920e843,FEMALE,1941-11-01,HBA1C,"[(2019-12-13, 5.4)]",GLUCOSE FASTING,,LDL-CHOLESTEROL,,TRIGLYCERIDES,
...,...,...,...,...,...,...,...,...,...,...
fffc7834b1be0778220d,MALE,1930-12-01,HBA1C,"[(2015-03-10, 5.8)]",GLUCOSE FASTING,,LDL-CHOLESTEROL,"[(2015-03-10, 1.5)]",TRIGLYCERIDES,"[(2015-03-10, 0.89)]"
fffdc69ec9e4e8d95613,FEMALE,1937-01-01,HBA1C,"[(2017-11-18, nan)]",GLUCOSE FASTING,"[(2017-11-23, 5.4)]",LDL-CHOLESTEROL,"[(2017-11-23, 2.45)]",TRIGLYCERIDES,"[(2017-11-23, 0.93)]"
ffff3d1e06e7dbc39130,FEMALE,1962-03-01,HBA1C,"[(2017-09-14, 4.9)]",GLUCOSE FASTING,"[(2017-09-15, 6.5)]",LDL-CHOLESTEROL,,TRIGLYCERIDES,
ffff617649bf7d3b0bb9,MALE,2015-09-01,HBA1C,,GLUCOSE FASTING,,LDL-CHOLESTEROL,,TRIGLYCERIDES,"[(2015-09-22, 2.69)]"


# ADDING NEW DIABETIC COLUMN

In [295]:
def check_diabetic(result_list):
    for i, (_,value) in enumerate(result_list):
        if not isinstance (result_list,list):
            return 'Not Diabetic', i
        for _,value in result_list:
            try:
                if float(value) > 6.5:
                    return 'diabetic',i
            except:
                continue
        return 'Non-diabetic',i

In [296]:
wide_data['Diabetic_status', [HBA1C_date_index]] = wide_data['HBA1C_results'].apply(check_diabetic)

TypeError: 'float' object is not iterable

In [282]:
wide_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Lab Resulted Order Test Description,HBA1C_name,HBA1C_results,GLUCOSE FASTING_name,GLUCOSE FASTING_results,LDL-CHOLESTEROL_name,LDL-CHOLESTEROL_results,TRIGLYCERIDES_name,TRIGLYCERIDES_results,Diabetic_status
Patient ID,Gender,Date of Birth,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
00031c3262ee7a0c2981,FEMALE,1988-05-01,HBA1C,"[(2015-04-10, 4.7)]",GLUCOSE FASTING,,LDL-CHOLESTEROL,"[(2015-04-10, 2.46)]",TRIGLYCERIDES,"[(2015-04-10, 0.77)]",Non-diabetic
0005f0349ba521e1ecb8,MALE,1964-08-01,HBA1C,,GLUCOSE FASTING,,LDL-CHOLESTEROL,"[(2017-07-14, 2.33)]",TRIGLYCERIDES,"[(2017-07-14, 2.88)]",Not Diabetic
0009747ecbb93007f1bc,MALE,1946-12-01,HBA1C,"[(2021-04-21, 5.5), (2021-04-21, 5.53676)]",GLUCOSE FASTING,,LDL-CHOLESTEROL,,TRIGLYCERIDES,,Non-diabetic
000b2a238717215130d6,FEMALE,1956-07-01,HBA1C,"[(2018-09-20, 5.5), (2021-02-14, 6.1), (2021-0...",GLUCOSE FASTING,"[(2018-09-20, 5.6)]",LDL-CHOLESTEROL,"[(2018-09-20, 3.74), (2021-02-14, 2.09)]",TRIGLYCERIDES,"[(2018-09-20, 1.18), (2021-02-14, 1.12)]",Non-diabetic
000d121054e0f920e843,FEMALE,1941-11-01,HBA1C,"[(2019-12-13, 5.4)]",GLUCOSE FASTING,,LDL-CHOLESTEROL,,TRIGLYCERIDES,,Non-diabetic
...,...,...,...,...,...,...,...,...,...,...,...
fffc7834b1be0778220d,MALE,1930-12-01,HBA1C,"[(2015-03-10, 5.8)]",GLUCOSE FASTING,,LDL-CHOLESTEROL,"[(2015-03-10, 1.5)]",TRIGLYCERIDES,"[(2015-03-10, 0.89)]",Non-diabetic
fffdc69ec9e4e8d95613,FEMALE,1937-01-01,HBA1C,"[(2017-11-18, nan)]",GLUCOSE FASTING,"[(2017-11-23, 5.4)]",LDL-CHOLESTEROL,"[(2017-11-23, 2.45)]",TRIGLYCERIDES,"[(2017-11-23, 0.93)]",Non-diabetic
ffff3d1e06e7dbc39130,FEMALE,1962-03-01,HBA1C,"[(2017-09-14, 4.9)]",GLUCOSE FASTING,"[(2017-09-15, 6.5)]",LDL-CHOLESTEROL,,TRIGLYCERIDES,,Non-diabetic
ffff617649bf7d3b0bb9,MALE,2015-09-01,HBA1C,,GLUCOSE FASTING,,LDL-CHOLESTEROL,,TRIGLYCERIDES,"[(2015-09-22, 2.69)]",Not Diabetic


In [230]:
#MERGING WITH CANCER DATA

In [231]:
sugar_data = wide_data

In [232]:
diag_data = pd.read_pickle('/home/jovyan/elpha-data/outputs/diagnosis(2).pkl')

In [187]:
diag_data

Unnamed: 0,Institution Code,Patient ID,Date of Birth,Nationality,Race,Diagnosis Code (ICD10),Diagnosis Date
2500,SKH,c8afa51cd7d0c9417072,1935-05-01,Indonesian,Chinese,C20,2022-01-25
2581,SKH,04087333c21b6606a4d7,1943-09-01,Indonesian,Chinese,C61,2022-06-10
2592,SKH,04087333c21b6606a4d7,1943-09-01,Indonesian,Chinese,C61,2022-06-10
3154,SKH,4824ee2f8728269f7462,1944-06-01,Sri Lankan,Sri Lankan,C61,2022-02-28
8535,SKH,7dd967f403bbd4848207,1975-12-01,Chinese,Chinese,C20,2022-05-12
...,...,...,...,...,...,...,...
4014770,SKH,0e4bd6ff271ac541872c,1950-01-01,Indonesian,Indonesian,C61,2022-08-29
4014790,SKH,9bc2413fddcf457375d0,1953-09-01,Indonesian,Indonesian,C61,2022-10-12
4014791,SKH,9bc2413fddcf457375d0,1953-09-01,Indonesian,Indonesian,C61,2022-10-12
4014799,SKH,91ef7cb6afdf2733d68e,1958-12-01,Indonesian,Other Races,C61,2022-12-20


In [188]:
data_combined = pd.merge(sugar_data, diag_data[['Patient ID','Diagnosis Code (ICD10)', 'Diagnosis Date']], on='Patient ID', how='left')
data_combined

Unnamed: 0,Patient ID,HBA1C_name,HBA1C_results,GLUCOSE FASTING_name,GLUCOSE FASTING_results,LDL-CHOLESTEROL_name,LDL-CHOLESTEROL_results,TRIGLYCERIDES_name,TRIGLYCERIDES_results,Diabetic_status,Diagnosis Code (ICD10),Diagnosis Date
0,00031c3262ee7a0c2981,HBA1C,"[(2015-04-10, 4.7)]",GLUCOSE FASTING,,LDL-CHOLESTEROL,"[(2015-04-10, 2.46)]",TRIGLYCERIDES,"[(2015-04-10, 0.77)]",Non-Diabetic,,
1,0005f0349ba521e1ecb8,HBA1C,,GLUCOSE FASTING,,LDL-CHOLESTEROL,"[(2017-07-14, 2.33)]",TRIGLYCERIDES,"[(2017-07-14, 2.88)]",Non-diabeic,,
2,0009747ecbb93007f1bc,HBA1C,"[(2021-04-21, 5.5), (2021-04-21, 5.53676)]",GLUCOSE FASTING,,LDL-CHOLESTEROL,,TRIGLYCERIDES,,Non-Diabetic,,
3,000b2a238717215130d6,HBA1C,"[(2018-09-20, 5.5), (2021-02-14, 6.1), (2021-0...",GLUCOSE FASTING,"[(2018-09-20, 5.6)]",LDL-CHOLESTEROL,"[(2018-09-20, 3.74), (2021-02-14, 2.09)]",TRIGLYCERIDES,"[(2018-09-20, 1.18), (2021-02-14, 1.12)]",Non-Diabetic,,
4,000d121054e0f920e843,HBA1C,"[(2019-12-13, 5.4)]",GLUCOSE FASTING,,LDL-CHOLESTEROL,,TRIGLYCERIDES,,Non-Diabetic,,
...,...,...,...,...,...,...,...,...,...,...,...,...
61903,fffc7834b1be0778220d,HBA1C,"[(2015-03-10, 5.8)]",GLUCOSE FASTING,,LDL-CHOLESTEROL,"[(2015-03-10, 1.5)]",TRIGLYCERIDES,"[(2015-03-10, 0.89)]",Non-Diabetic,,
61904,fffdc69ec9e4e8d95613,HBA1C,"[(2017-11-18, nan)]",GLUCOSE FASTING,"[(2017-11-23, 5.4)]",LDL-CHOLESTEROL,"[(2017-11-23, 2.45)]",TRIGLYCERIDES,"[(2017-11-23, 0.93)]",Non-Diabetic,,
61905,ffff3d1e06e7dbc39130,HBA1C,"[(2017-09-14, 4.9)]",GLUCOSE FASTING,"[(2017-09-15, 6.5)]",LDL-CHOLESTEROL,,TRIGLYCERIDES,,Non-Diabetic,,
61906,ffff617649bf7d3b0bb9,HBA1C,,GLUCOSE FASTING,,LDL-CHOLESTEROL,,TRIGLYCERIDES,"[(2015-09-22, 2.69)]",Non-diabeic,,


In [194]:
#save as a pickle file as 'combined_data'

In [193]:
data_combined.to_pickle('/home/jovyan/elpha-data/outputs/combined_data.pkl')

In [189]:
cases_data = data_combined[~data_combined['Diagnosis Code (ICD10)'].isna()].copy()
control_data = data_combined[data_combined['Diagnosis Code (ICD10)'].isna()].copy()

In [190]:
cases_data

Unnamed: 0,Patient ID,HBA1C_name,HBA1C_results,GLUCOSE FASTING_name,GLUCOSE FASTING_results,LDL-CHOLESTEROL_name,LDL-CHOLESTEROL_results,TRIGLYCERIDES_name,TRIGLYCERIDES_results,Diabetic_status,Diagnosis Code (ICD10),Diagnosis Date
109,0095f6e8f769caa9fe87,HBA1C,"[(2018-06-16, 6.3)]",GLUCOSE FASTING,,LDL-CHOLESTEROL,"[(2018-06-16, 3.31)]",TRIGLYCERIDES,"[(2018-06-16, 2.12)]",Non-Diabetic,C61,2023-02-14
110,0095f6e8f769caa9fe87,HBA1C,"[(2018-06-16, 6.3)]",GLUCOSE FASTING,,LDL-CHOLESTEROL,"[(2018-06-16, 3.31)]",TRIGLYCERIDES,"[(2018-06-16, 2.12)]",Non-Diabetic,C61,2022-12-29
111,0095f6e8f769caa9fe87,HBA1C,"[(2018-06-16, 6.3)]",GLUCOSE FASTING,,LDL-CHOLESTEROL,"[(2018-06-16, 3.31)]",TRIGLYCERIDES,"[(2018-06-16, 2.12)]",Non-Diabetic,C61,2022-08-06
112,0095f6e8f769caa9fe87,HBA1C,"[(2018-06-16, 6.3)]",GLUCOSE FASTING,,LDL-CHOLESTEROL,"[(2018-06-16, 3.31)]",TRIGLYCERIDES,"[(2018-06-16, 2.12)]",Non-Diabetic,C61,2022-08-19
113,0095f6e8f769caa9fe87,HBA1C,"[(2018-06-16, 6.3)]",GLUCOSE FASTING,,LDL-CHOLESTEROL,"[(2018-06-16, 3.31)]",TRIGLYCERIDES,"[(2018-06-16, 2.12)]",Non-Diabetic,C61,2024-02-22
...,...,...,...,...,...,...,...,...,...,...,...,...
61680,ff0034fbb5ab7c6eaae9,HBA1C,,GLUCOSE FASTING,,LDL-CHOLESTEROL,,TRIGLYCERIDES,"[(2021-07-27, 0.53), (2021-08-02, 1.57), (2021...",Non-diabeic,C20,2024-01-24
61681,ff0034fbb5ab7c6eaae9,HBA1C,,GLUCOSE FASTING,,LDL-CHOLESTEROL,,TRIGLYCERIDES,"[(2021-07-27, 0.53), (2021-08-02, 1.57), (2021...",Non-diabeic,C20,2024-01-24
61682,ff0034fbb5ab7c6eaae9,HBA1C,,GLUCOSE FASTING,,LDL-CHOLESTEROL,,TRIGLYCERIDES,"[(2021-07-27, 0.53), (2021-08-02, 1.57), (2021...",Non-diabeic,C20,2022-09-16
61683,ff0034fbb5ab7c6eaae9,HBA1C,,GLUCOSE FASTING,,LDL-CHOLESTEROL,,TRIGLYCERIDES,"[(2021-07-27, 0.53), (2021-08-02, 1.57), (2021...",Non-diabeic,C20,2022-09-21


In [191]:
control_data

Unnamed: 0,Patient ID,HBA1C_name,HBA1C_results,GLUCOSE FASTING_name,GLUCOSE FASTING_results,LDL-CHOLESTEROL_name,LDL-CHOLESTEROL_results,TRIGLYCERIDES_name,TRIGLYCERIDES_results,Diabetic_status,Diagnosis Code (ICD10),Diagnosis Date
0,00031c3262ee7a0c2981,HBA1C,"[(2015-04-10, 4.7)]",GLUCOSE FASTING,,LDL-CHOLESTEROL,"[(2015-04-10, 2.46)]",TRIGLYCERIDES,"[(2015-04-10, 0.77)]",Non-Diabetic,,
1,0005f0349ba521e1ecb8,HBA1C,,GLUCOSE FASTING,,LDL-CHOLESTEROL,"[(2017-07-14, 2.33)]",TRIGLYCERIDES,"[(2017-07-14, 2.88)]",Non-diabeic,,
2,0009747ecbb93007f1bc,HBA1C,"[(2021-04-21, 5.5), (2021-04-21, 5.53676)]",GLUCOSE FASTING,,LDL-CHOLESTEROL,,TRIGLYCERIDES,,Non-Diabetic,,
3,000b2a238717215130d6,HBA1C,"[(2018-09-20, 5.5), (2021-02-14, 6.1), (2021-0...",GLUCOSE FASTING,"[(2018-09-20, 5.6)]",LDL-CHOLESTEROL,"[(2018-09-20, 3.74), (2021-02-14, 2.09)]",TRIGLYCERIDES,"[(2018-09-20, 1.18), (2021-02-14, 1.12)]",Non-Diabetic,,
4,000d121054e0f920e843,HBA1C,"[(2019-12-13, 5.4)]",GLUCOSE FASTING,,LDL-CHOLESTEROL,,TRIGLYCERIDES,,Non-Diabetic,,
...,...,...,...,...,...,...,...,...,...,...,...,...
61903,fffc7834b1be0778220d,HBA1C,"[(2015-03-10, 5.8)]",GLUCOSE FASTING,,LDL-CHOLESTEROL,"[(2015-03-10, 1.5)]",TRIGLYCERIDES,"[(2015-03-10, 0.89)]",Non-Diabetic,,
61904,fffdc69ec9e4e8d95613,HBA1C,"[(2017-11-18, nan)]",GLUCOSE FASTING,"[(2017-11-23, 5.4)]",LDL-CHOLESTEROL,"[(2017-11-23, 2.45)]",TRIGLYCERIDES,"[(2017-11-23, 0.93)]",Non-Diabetic,,
61905,ffff3d1e06e7dbc39130,HBA1C,"[(2017-09-14, 4.9)]",GLUCOSE FASTING,"[(2017-09-15, 6.5)]",LDL-CHOLESTEROL,,TRIGLYCERIDES,,Non-Diabetic,,
61906,ffff617649bf7d3b0bb9,HBA1C,,GLUCOSE FASTING,,LDL-CHOLESTEROL,,TRIGLYCERIDES,"[(2015-09-22, 2.69)]",Non-diabeic,,


In [195]:
#check if there are sufficent test results in cases_data

In [199]:
from dateutil.relativedelta import relativedelta
from datetime import datetime

In [215]:
cases_data['Diagnosis Date'] = pd.to_datetime(cases_data['Diagnosis Date'])

def check_min_values(row, result_columns, min_count=5, years_before=1):
    cutoff = row['Diagnosis Date']- relativedelta(years=years_before)
    
    for col in result_columns:
        result_list = row[col]
        
        if not isinstance(result_list, list):
            return False
        try:
           valid_values = [val for (dt,val) in result_list if pd.to_datetime(dt) <= cutoff]
        except Exception as e:
            print(f"Error processing row: {e}")
            return False
            
        if len(valid_values) < min_count:
            return False
    return True
        

result_columns = ['HBA1C_results','GLUCOSE FASTING_results', 'LDL-CHOLESTEROL_results', 'TRIGLYCERIDES_results' ]
cases_data['meets_criteria'] = cases_data.apply(lambda row: check_min_values(row, result_columns), axis=1)

if (cases_data['meets_criteria']==True):
    print("present")

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [218]:
cases_data[cases_data['meets_criteria']==True]

Unnamed: 0,Patient ID,HBA1C_name,HBA1C_results,GLUCOSE FASTING_name,GLUCOSE FASTING_results,LDL-CHOLESTEROL_name,LDL-CHOLESTEROL_results,TRIGLYCERIDES_name,TRIGLYCERIDES_results,Diabetic_status,Diagnosis Code (ICD10),Diagnosis Date,meets_criteria
