In [None]:
import sqlite3
import pandas as pd
import importlib
from datetime import datetime, timedelta
import numpy as np
import os
from pathlib import Path
import sys
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm
tqdm.pandas()  

sys.path.append(str(Path.cwd().parents[1]))

from config.constants import UKER_DIR

%load_ext autoreload
%autoreload 2

In [None]:
def read_SQL(data_path, query, parameters):
    """Run a SQL query against a SQLite database and return the result as a DataFrame.

    Parameters
    ----------
    data_path : path of the SQLite database file, passed to ``sqlite3.connect``.
    query : SQL text; may contain ``?`` placeholders.
    parameters : sequence of values bound to the placeholders (``[]`` for none).

    Returns
    -------
    pandas.DataFrame holding the rows returned by ``query``.
    """
    # ``with sqlite3.connect(...)`` only commits / rolls back the transaction and
    # leaves the connection open, so close it explicitly to avoid leaking one
    # handle per call.
    conn = sqlite3.connect(data_path)
    try:
        return pd.read_sql(query, conn, params=parameters)
    finally:
        conn.close()

# Extract cohort

Extract cancer cohort

In [None]:
# Cancer cohort: every admission (fid) / patient (pid) pair that carries at
# least one ICD code starting with "C".
query = """
SELECT DISTINCT fall.pid, fall.fid
FROM icd
JOIN fall_icd ON icd.id = fall_icd.icd
JOIN fall ON fall_icd.fid = fall.fid
WHERE icd.code LIKE 'C%'
"""
query_params = []
df = read_SQL(UKER_DIR, query, query_params)

# De-duplicated id lists; the later cohort cells intersect these with the chemo ids.
pids_cancer = df["pid"].unique().tolist()
fids_cancer = df["fid"].unique().tolist()

print('# Cancer Admissions', len(fids_cancer))
print('# Cancer Patients', len(pids_cancer))

### Extract chemo cohort (ICD codes)

In [None]:
# Chemo cohort (diagnosis side): admissions coded with an ICD code starting
# with Z51.1 or Z51.2.
query = """
SELECT DISTINCT fall.pid, fall.fid
FROM icd
JOIN fall_icd ON icd.id = fall_icd.icd
JOIN fall ON fall_icd.fid = fall.fid
WHERE icd.code LIKE 'Z51.1%'
   OR icd.code LIKE 'Z51.2%'
"""

query_params = []
df = read_SQL(UKER_DIR, query, query_params)

# De-duplicated ids of the ICD-identified chemo admissions and patients.
pids_chemo_icd = df["pid"].unique().tolist()
fids_chemo_icd = df["fid"].unique().tolist()

print('# Chemo icd  Admissions', len(fids_chemo_icd))
print('# Chemo icd Patients', len(pids_chemo_icd))

### Extract chemo cohort (procedures)

**OPS chemotherapy codes**

8-54: Cytostatic chemotherapy, immunotherapy and antiretroviral therapy

8-541 Instillation of and locoregional therapy with cytotoxic drugs and immunomodulators

**8-542 Non complex chemotherapy**

**8-543 Moderately complex and intensive block chemotherapy**

**8-544 Highly complex and intensive block chemotherapy**

8-546 Hyperthermic chemotherapy

8-547 Other immunotherapy

8-548 Highly active antiretroviral therapy (HAART)

8-549 Percutaneous isolated organ perfusion for chemotherapy

In [None]:
# Chemo cohort (procedure side): admissions with an OPS code starting with
# 8-542, 8-543 or 8-544 (the three codes highlighted in the list above).
query = """
SELECT DISTINCT fall.pid, fall.fid
FROM ops
JOIN fall_ops ON ops.id = fall_ops.ops
JOIN fall ON fall_ops.fid = fall.fid
WHERE ops.code LIKE '8-542%'
   OR ops.code LIKE '8-543%'
   OR ops.code LIKE '8-544%'
"""

query_params = []
df = read_SQL(UKER_DIR, query, query_params)

# De-duplicated ids of the OPS-identified chemo admissions and patients.
pids_chemo_ops = df["pid"].unique().tolist()
fids_chemo_ops = df["fid"].unique().tolist()

print('# Chemo ops Admissions', len(fids_chemo_ops))
print('# Chemo ops Patients', len(pids_chemo_ops))



In [None]:
# An admission / patient counts as "chemo" when it was picked up by either the
# ICD-based or the OPS-based query.
fids_chemo = set(fids_chemo_icd) | set(fids_chemo_ops)
pids_chemo = set(pids_chemo_icd) | set(pids_chemo_ops)

print('Total chemo admissions: ', len(fids_chemo))
print('Total chemo patients: ', len(pids_chemo))

#### Extract cancer chemo cohort

In [None]:
# Patients that appear in both the cancer cohort and the chemo cohort.
cancer_chemo_patients = list(set(pids_cancer) & set(pids_chemo))

pids = cancer_chemo_patients

# SQLite limits how many "?" parameters one statement may bind (999 by default
# in builds older than 3.32.0). Binding one parameter per patient in a single
# statement fails with "too many SQL variables" for a large cohort, so the
# patient ids are sent in batches and the partial results are concatenated.
SQLITE_PARAM_BATCH_SIZE = 900

cohort_batches = []
# max(..., 1) keeps a single query (with an empty IN list) when there are no
# patients, so an empty frame with the right columns is still produced.
for start in range(0, max(len(pids), 1), SQLITE_PARAM_BATCH_SIZE):
    query_params = pids[start:start + SQLITE_PARAM_BATCH_SIZE]
    placeholders = ','.join(['?'] * len(query_params))
    query = f"""
SELECT f.*, p.sex
FROM fall f
JOIN pat p ON f.pid = p.pid
WHERE f.pid IN ({placeholders});
"""
    cohort_batches.append(read_SQL(UKER_DIR, query, query_params))

# Batches cover disjoint patient ids, so no admission can appear twice.
cancer_chemo_cohort = pd.concat(cohort_batches, ignore_index=True)

# Flag the admissions that are themselves chemo admissions (1) vs. not (0).
cancer_chemo_cohort['chemo'] = cancer_chemo_cohort['fid'].isin(fids_chemo).astype(int)
print('cancer_chemo unique admissions: ',cancer_chemo_cohort.fid.nunique())
print('cancer_chemo unique pts: ',cancer_chemo_cohort.pid.nunique())

### Clean cohort to be compatible with MIMIC-IV

In [None]:
# Rename the UKER columns to their MIMIC-IV counterparts BY NAME. Assigning a
# list to `.columns` renames by position and silently mislabels the data if
# `SELECT f.*` ever returns the `fall` columns in a different order.
cancer_chemo_cohort = cancer_chemo_cohort.rename(columns={
    'fid': 'hadm_id',
    'pid': 'subject_id',
    'aufnahme_alter': 'admittime',
    'entlassung_alter': 'dischtime',
    'sex': 'gender',
})

# Age is taken from the admission-age column (now `admittime`).
cancer_chemo_cohort['age'] = cancer_chemo_cohort['admittime']
# Placeholder column; kept as the literal string 'NaN' so the saved file is unchanged.
cancer_chemo_cohort['dod'] = 'NaN'

# Length of stay = discharge age minus admission age (same unit as those columns).
cancer_chemo_cohort['los'] = cancer_chemo_cohort['dischtime'] - cancer_chemo_cohort['admittime']
cancer_chemo_cohort['hospital_expire_flag'] = 0

# Selecting by name fixes the column order and raises a KeyError if an
# expected column is missing.
new_columns= ['hadm_id','subject_id','admittime' ,'dischtime', 'gender',  'chemo' ,'age','dod','los','hospital_expire_flag']
cancer_chemo_cohort = cancer_chemo_cohort[new_columns]

Save cancer chemo cohort

In [None]:
# Column layout of the exported cohort file.
new_order = [
    'subject_id', 'hadm_id', 'admittime', 'dischtime', 'los',
    'gender', 'age', 'hospital_expire_flag', 'dod', 'chemo',
]
cancer_chemo_cohort = cancer_chemo_cohort.loc[:, new_order]

# Make sure the output folder exists, then write the gzip-compressed CSV.
os.makedirs('../saved_data/cohorts', exist_ok=True)
cancer_chemo_cohort.to_csv(
    "../saved_data/cohorts/uker_cancer_chemo_cohort.csv.gz",
    compression='gzip',
    index=False,
)
print("[SUCCESSFULLY SAVED COHORT DATA]")

# Neutropenic fever occurrence

Choose the target cohort and the number of days in which to look for an NF occurrence

In [None]:
# Reload the saved cancer-chemo cohort file and work on a copy of it.
cancer_chemo_cohort = pd.read_csv(
    '../saved_data/cohorts/uker_cancer_chemo_cohort.csv.gz',
    header=0,
    compression='gzip',
)
target_cohort = cancer_chemo_cohort.copy()

# Look-ahead window after discharge; also used in the output file name.
# NOTE(review): presumably days, i.e. the unit of admittime/dischtime - confirm.
days = 30

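Find neutropenic fever (NF) in the CURRENT admission based on ICD-10 diagnosis codes: an admission counts as NF when it carries both a fever code (R50…) and a neutropenia code (D70…).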

In [None]:
# Admissions with a fever diagnosis (ICD code starting with R50).
query = """
SELECT DISTINCT fall_icd.fid
FROM icd
JOIN fall_icd ON icd.id = fall_icd.icd
WHERE icd.code LIKE 'R50%'
"""
query_params = []
fids_R50 = read_SQL(UKER_DIR, query, query_params)["fid"].unique().tolist()
print('Fever fids', len(fids_R50))


# Admissions with a neutropenia diagnosis (ICD code starting with D70).
query = """
SELECT DISTINCT fall_icd.fid
FROM icd
JOIN fall_icd ON icd.id = fall_icd.icd
WHERE icd.code LIKE 'D70%'
"""
query_params = []
fids_D70 = read_SQL(UKER_DIR, query, query_params)["fid"].unique().tolist()
print('Neutropenia fids', len(fids_D70))


# Neutropenic fever = both diagnoses recorded on the same admission.
fids_NF = set(fids_R50).intersection(fids_D70)
print('Neitropenic fever fids', len(fids_NF))


# 1 when the admission itself is an NF admission, 0 otherwise.
target_cohort['NF'] = target_cohort['hadm_id'].isin(fids_NF).astype(int)

Split Negative and Positive cases for target cohort

In [None]:
def split_neutropenic_fever_cases(x, days,target_cohort,spliting_approach):
    """Label one admission by whether neutropenic fever (NF) follows its chemotherapy.

    Meant to be applied row-wise (``DataFrame.apply(..., axis=1)``).

    Parameters
    ----------
    x : one row of the cohort; reads ``subject_id``, ``hadm_id``, ``dischtime``,
        ``chemo``, ``NF`` and ``hospital_expire_flag``.
    days : length of the look-ahead window, added directly to ``x.dischtime``
        (so it must be in the same unit as ``admittime`` / ``dischtime``).
    target_cohort : DataFrame searched for later admissions of the same patient;
        needs ``subject_id``, ``hadm_id``, ``admittime``, ``chemo`` and ``NF``.
    spliting_approach : ``"only readmissions"`` (an admission without any
        readmission in the window stays undetermined) or
        ``"both readmissions and no admission"`` (no readmission counts as negative).

    Returns
    -------
    int
        0 = not considered / undetermined, 1 = negative, 2 = positive.

    Raises
    ------
    ValueError
        If ``spliting_approach`` is not one of the two supported values
        (previously every row was silently labelled ``None``).
    """
    valid_approaches = ("only readmissions", "both readmissions and no admission")
    if spliting_approach not in valid_approaches:
        raise ValueError(
            f"Unknown spliting_approach {spliting_approach!r}; expected one of {valid_approaches}"
        )

    # if there is no chemo or if both chemo and NF are present the admission is not considered!
    if x.chemo == 0 or (x.chemo == 1 and x.NF == 1):
        return 0

    # Anything that is not a clean "chemo, no NF, no in-hospital death" admission
    # is undetermined. Return 0 explicitly instead of falling through to None.
    if not (x.chemo == 1 and x.NF == 0 and x.hospital_expire_flag == 0):
        return 0

    # Later admissions of the same patient that start within `days` of discharge.
    sub = target_cohort[
        (target_cohort["subject_id"] == x.subject_id) &
        (target_cohort["hadm_id"] != x.hadm_id) &  # avoid selecting the same admission as next where the los for admission is 0
        (target_cohort["admittime"] >= x.dischtime) &
        (target_cohort["admittime"] <= (x.dischtime + days))
    ].sort_values("admittime")
    # NOTE: admissions where the patient died within the window are not
    # excluded here; the dod-based check is disabled.

    # check for other chemotherapy within the window
    if not sub.empty and (sub["chemo"] == 1).any():
        # Position of the first readmission that is itself a chemo admission.
        positive_chemo_index = (sub["chemo"] == 1).argmax()
        readmissions_after_next_chemo = sub.iloc[positive_chemo_index:]
        sub = sub.iloc[:positive_chemo_index]
        if (sub["NF"] == 0).all() or sub.empty:  # no NF before second chemo
            if (readmissions_after_next_chemo["NF"] == 0).all():
                return 1  # all readmissions after first chemo have negative NF
            return 0  # second chemo or admissions after that have at least one positive NF
        # Otherwise NF occurred before the next chemo: `sub` now holds only the
        # readmissions before that chemo and is classified below.

    # cohort 1: check only readmissions
    if spliting_approach == "only readmissions":
        if sub.empty:
            return 0  # no readmission to judge from (was an implicit None)
        if (sub["NF"] == 0).all():
            return 1
        if (sub["NF"] == 1).any():
            return 2
        return 0

    # cohort 2: check both readmissions and no admissions
    if sub.empty or (sub["NF"] == 0).all():
        return 1
    if (sub["NF"] == 1).any():
        return 2
    return 0
                
spliting_approach = "both readmissions and no admission" #"only readmissions"  OR "both readmissions and no admission"

# Use the configured `days` window instead of a hard-coded 30, so the labels
# always match the window that is written into the output file name.
# The column keeps its historical name "NF_in_30_days" because later cells
# select and drop it by that name.
target_cohort["NF_in_30_days"] = target_cohort.progress_apply(
    lambda x: split_neutropenic_fever_cases(x, days, target_cohort, spliting_approach),
    axis=1,
)

In [None]:
# Split by label code: 2 = positive, 1 = negative, 0 = not determined.
pos_case = target_cohort[target_cohort["NF_in_30_days"] == 2]
neg_case= target_cohort[target_cohort["NF_in_30_days"] == 1]
none_case= target_cohort[target_cohort["NF_in_30_days"] == 0]

n_pos = pos_case.hadm_id.nunique()
n_neg = neg_case.hadm_id.nunique()
n_none = none_case.hadm_id.nunique()

print('-----------------------------------')
print('positive admissions',n_pos)
print('-----------------------------------')
print('negative admissions',n_neg)
print('-----------------------------------')
print('not detemined admissions',n_none)
print('-----------------------------------')
# Guard against ZeroDivisionError when no admission was labelled at all.
n_labelled = n_pos + n_neg
positive_pct = n_pos / n_labelled * 100 if n_labelled else float('nan')
print('Psitive percentage: ',positive_pct)

Save cohort

In [None]:
# `assign` builds new frames instead of writing a column into filtered slices
# of `target_cohort` (which triggers pandas' SettingWithCopy warning, hidden
# here by the global warning filter). It also makes this cell safe to re-run.
pos_case = pos_case.assign(label=1)
neg_case = neg_case.assign(label=0)

# File name encodes the look-ahead window, e.g. "uker_cohort_NF_30_days".
cohort_output="uker_cohort" + "_" + "NF" + "_" + str(days) + "_days"
cohort = pd.concat([pos_case, neg_case], axis=0)

# Drop the helper columns that were only needed to derive the label.
cohort = cohort.drop(columns =['hospital_expire_flag','chemo','NF','NF_in_30_days'])

cohort.to_csv("../saved_data/cohorts/"+cohort_output+".csv.gz", index=False, compression='gzip')
print("[ COHORT SUCCESSFULLY SAVED ]")

print(cohort_output)