## Import Libraries

In [7]:
import pandas as pd
import numpy as np
from sklearn.utils import resample

## Import Tables

In [3]:
diagnoses_icd = pd.read_csv('./data/mimic-iv-0.4/hosp/diagnoses_icd.csv.gz', compression='gzip')
d_icd_diagnoses = pd.read_csv('./data/mimic-iv-0.4/hosp/d_icd_diagnoses.csv.gz', compression='gzip')
admissions = pd.read_csv('./data/mimic-iv-0.4/core/admissions.csv.gz', compression='gzip')


## Pull codes related to Congestive Heart Failure

In [4]:
chf_codes = list(map(str, d_icd_diagnoses[(d_icd_diagnoses['long_title'].str.lower().str.contains('congestive')) & (d_icd_diagnoses['long_title'].str.lower().str.contains('heart'))]['icd_code']))


## Start filtering down to only records where CHF is True

In [5]:
diagnoses_icd_chf = diagnoses_icd[diagnoses_icd['icd_code'].isin(chf_codes)]


## Generate new Admissions table with hospital stay duration and CHF flag

In [6]:
admissions_chf = admissions[['subject_id', 'hadm_id', 'admittime', 'dischtime']]

admissions_chf['CHF'] = np.where(admissions_chf['subject_id'].isin(diagnoses_icd_chf.subject_id), 1, 0)
admissions_chf['time_spent'] = pd.to_datetime(admissions_chf['dischtime']) - pd.to_datetime(admissions_chf['admittime'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


## Sample only 5k patients for testing

In [9]:
admissions_chf_reduced = admissions_chf[['subject_id', 'CHF']].drop_duplicates()
admissions_chf_sample = resample(admissions_chf_reduced, n_samples = 5000, replace = False, stratify = admissions_chf_reduced['CHF'], random_state = 0)

admissions_chf[admissions_chf['subject_id'].isin(admissions_chf_sample.subject_id)].groupby('CHF').subject_id.nunique()


CHF
0    4589
1     411
Name: subject_id, dtype: int64