In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
path = "D:\\Bootcamp\\MIMIC IV\\"
admissions = pd.read_csv(path + "core\\admissions.csv.gz", compression='gzip')
patients = pd.read_csv(path + "core\\patients.csv.gz", compression='gzip')
transfers = pd.read_csv(path + "core\\transfers.csv.gz", compression='gzip')

diagnoses = pd.read_csv(path + "hosp\\diagnoses_icd.csv.gz", compression='gzip')
d_diagnoses = pd.read_csv(path + "hosp\\d_icd_diagnoses.csv.gz", compression='gzip')
drg_codes = pd.read_csv(path + "hosp\\drgcodes.csv.gz", compression='gzip')

In [58]:
drg_codes = pd.read_csv(path + "hosp\\drgcodes.csv.gz", compression='gzip')

# Predict Future Diagnoses

Feasbility study to see if we can predict whether a patient will develop a disease (initially CHF) from their patient history leading up to (but not including) their first diagnosis. 

### Outline

1. Identify CHF patients
1. Look at patient histories
1. Look at co-morbidities
1. Identify first CHF diagnosis
1. Identify visit right before CHF diagnosis (< 1 mo)
1. Look for features to predict pre-CHF state (lab tests in history)

In [3]:
sick_codes = d_diagnoses[(d_diagnoses['long_title'].str.lower().str.contains('congestive',)) & (d_diagnoses['long_title'].str.lower().str.contains('heart'))]
sick_codes

Unnamed: 0,icd_code,icd_version,long_title
4620,39891,9,Rheumatic heart failure (congestive)
4793,4280,9,"Congestive heart failure, unspecified"
22226,I5020,10,Unspecified systolic (congestive) heart failure
22227,I5021,10,Acute systolic (congestive) heart failure
22228,I5022,10,Chronic systolic (congestive) heart failure
22229,I5023,10,Acute on chronic systolic (congestive) heart f...
22230,I5030,10,Unspecified diastolic (congestive) heart failure
22231,I5031,10,Acute diastolic (congestive) heart failure
22232,I5032,10,Chronic diastolic (congestive) heart failure
22233,I5033,10,Acute on chronic diastolic (congestive) heart ...


In [81]:
sick_diagnoses = diagnoses[diagnoses['icd_code'].isin(sick_codes.icd_code)]
admissions['sick_subj'] = np.where(admissions.subject_id.isin(sick_diagnoses.subject_id), 1, 0)
admissions['sick_adm'] = np.where(admissions.hadm_id.isin(sick_diagnoses.hadm_id), 1, 0)

admissions['admittime'] = pd.to_datetime(admissions.admittime)
admissions = admissions.sort_values('admittime')
admissions['dischtime'] = pd.to_datetime(admissions.dischtime)
admissions['hadm_time'] = admissions.dischtime - admissions.admittime

admissions = admissions.rename(columns={'hospital_expire_flag':'expire_flag'})

In [82]:
primary_drg = drg_codes.drop('drg_code', axis=1).rename(columns={'description':'drg_code'})
primary_drg = primary_drg[['subject_id', 'hadm_id', 'drg_code']].drop_duplicates()
primary_drg = primary_drg.groupby('hadm_id').first().reset_index()
primary_drg.head()

Unnamed: 0,hadm_id,subject_id,drg_code
0,20000019,10467237,SEPTICEMIA OR SEVERE SEPSIS W/O MV 96+ HOURS W...
1,20000024,16925328,RED BLOOD CELL DISORDERS W/O MCC
2,20000041,18910522,Knee Joint Replacement
3,20000055,11868001,"Neonate, Bwt > 2499g, Normal Newborn Or Neonat..."
4,20000069,14546051,Vaginal Delivery


In [83]:
primary_icd = diagnoses[diagnoses['seq_num']==1].merge(d_diagnoses, how='left', on=['icd_version', 'icd_code'])
primary_icd = primary_icd.drop('icd_code', axis=1).rename(columns={'long_title':'icd_code'})
primary_icd = primary_icd[['subject_id', 'hadm_id', 'icd_code']].drop_duplicates()
primary_icd = primary_icd.groupby('hadm_id').first().reset_index()
primary_icd.head()

Unnamed: 0,hadm_id,subject_id,icd_code
0,20000019,10467237,Unspecified septicemia
1,20000024,16925328,Iron deficiency anemia secondary to blood loss...
2,20000034,19430048,Obstruction of bile duct
3,20000041,18910522,"Osteoarthrosis, localized, not specified wheth..."
4,20000055,11868001,"Single liveborn, born in hospital, delivered b..."


In [84]:
admissions = admissions.merge(primary_drg, how='left').merge(primary_icd, how='left')
admissions.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,marital_status,ethnicity,edregtime,edouttime,expire_flag,sick_subj,sick_adm,hadm_time,drg_code,icd_code
0,16904137,21081215,2105-10-04 17:26:00,2105-10-12 11:11:00,,URGENT,TRANSFER FROM HOSPITAL,HOME,Other,ENGLISH,MARRIED,OTHER,,,0,0,0,7 days 17:45:00,,
1,12024697,20302177,2109-12-14 22:50:00,2110-01-15 14:53:00,,EW EMER.,EMERGENCY ROOM,REHAB,Other,ENGLISH,MARRIED,WHITE,2109-12-14 19:31:00,2109-12-15 01:56:00,0,0,0,31 days 16:03:00,,
2,13308789,22079847,2110-01-11 00:57:00,2110-01-13 12:45:00,,ELECTIVE,,HOME,Other,ENGLISH,,BLACK/AFRICAN AMERICAN,,,0,0,0,2 days 11:48:00,"Neonate, Bwt > 2499g, Normal Newborn Or Neonat...","Single liveborn, born in hospital, delivered w..."
3,15350437,20383396,2110-01-11 08:02:00,2110-01-12 18:45:00,,EU OBSERVATION,TRANSFER FROM HOSPITAL,,Other,ENGLISH,SINGLE,WHITE,2110-01-11 03:43:00,2110-01-11 08:41:00,0,0,0,1 days 10:43:00,,"Sixth [abducent] nerve palsy, bilateral"
4,14742657,29081685,2110-01-11 09:53:00,2110-01-13 13:00:00,,ELECTIVE,,HOME,Other,ENGLISH,,UNABLE TO OBTAIN,,,0,0,0,2 days 03:07:00,,


In [85]:
admissions.shape

(524520, 20)

# Move to EDA Notebook >>>>>>>>>>>>>>

In [25]:
admissions.groupby('ethnicity').agg({'sick_subj':['count', 'mean']})

Unnamed: 0_level_0,sick_subj,sick_subj
Unnamed: 0_level_1,count,mean
ethnicity,Unnamed: 1_level_2,Unnamed: 2_level_2
AMERICAN INDIAN/ALASKA NATIVE,1536,0.152995
ASIAN,24522,0.08421
BLACK/AFRICAN AMERICAN,80526,0.220997
HISPANIC/LATINO,29887,0.16452
OTHER,26844,0.122634
UNABLE TO OBTAIN,3742,0.068145
UNKNOWN,19419,0.099078
WHITE,338044,0.196679


In [16]:
sick_index = admissions.groupby('subject_id').agg(sick_subj=('sick_subj', 'max'))
print(np.sum(sick_index))
print(np.mean(sick_index))

sick_subj    21137
dtype: int64
sick_subj    0.082128
dtype: float64


In [101]:
result = admissions.groupby('sick_subj').agg(total_adm=('hadm_id','count'),
                                             adm_fraction=('hadm_id', lambda x: len(x)/524520),
                                             death_count=('expire_flag', 'sum'),
                                             death_fraction=('expire_flag', 'mean'))
result['chance_of_death'] = result.death_fraction/result.adm_fraction
# ^^^ DOUBLE CHECK THIS LINE FOR BAYSIAN PROBABILITY ^^^
result

Unnamed: 0_level_0,total_adm,adm_fraction,death_count,death_fraction,chance_of_death
sick_subj,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,427550,0.815126,6592,0.015418,0.018915
1,96970,0.184874,2777,0.028638,0.154904


# <<<<<<<<<<<<<<<<<

In [103]:
rows = ['subject_id', 'admittime', 'sick_adm', 'expire_flag', 'drg_code', 'icd_code']
sample_adm = admissions[admissions.sick_subj==1][rows]
sample_adm['admittime'] = sample_adm.admittime.dt.date
for subj in sample_adm.sample(100).subject_id.unique():
    print('Subject: ', subj)
    print('-'*80)
    print(sample_adm[sample_adm.subject_id==subj][rows], '\n')

Subject:  14410684
--------------------------------------------------------------------------------
       subject_id   admittime  sick_adm  expire_flag  \
25075    14410684  2116-04-16         0            0   
25313    14410684  2116-05-04         0            0   
32731    14410684  2117-11-21         1            0   

                                                drg_code  \
25075  Coronary Bypass w/o Cardiac Cath Or Percutaneo...   
25313  Other Respiratory Diagnosis Except Signs, Symp...   
32731                              G.I. HEMORRHAGE W MCC   

                                                icd_code  
25075  Subendocardial infarction, initial episode of ...  
25313                       Unspecified pleural effusion  
32731  Chronic or unspecified duodenal ulcer with hem...   

Subject:  11148580
--------------------------------------------------------------------------------
        subject_id   admittime  sick_adm  expire_flag  \
76159     11148580  2125-11-26         

307016          Hepatic failure, unspecified without coma   

Subject:  19715614
--------------------------------------------------------------------------------
        subject_id   admittime  sick_adm  expire_flag  \
156481    19715614  2138-05-22         1            0   
160430    19715614  2138-12-25         0            0   

                                                 drg_code  \
156481  CARDIAC VALVE & OTH MAJ CARDIOTHORACIC PROC W/...   
160430  OTHER MUSCULOSKELET SYS & CONN TISS O.R. PROC ...   

                                                 icd_code  
156481           Congenital insufficiency of aortic valve  
160430  Spinal stenosis, lumbar region with neurogenic...   

Subject:  12852721
--------------------------------------------------------------------------------
       subject_id   admittime  sick_adm  expire_flag  \
25786    12852721  2116-06-10         0            0   
26059    12852721  2116-07-01         0            0   
35684    12852721  2118-07-04   

        subject_id   admittime  sick_adm  expire_flag  \
454267    14437309  2183-10-26         0            0   
454545    14437309  2183-11-11         0            0   
454893    14437309  2183-12-01         0            0   
455181    14437309  2183-12-16         0            0   
455581    14437309  2184-01-07         0            0   
455728    14437309  2184-01-14         0            0   
455964    14437309  2184-01-27         0            0   
456173    14437309  2184-02-07         0            0   
456336    14437309  2184-02-17         0            0   
456474    14437309  2184-02-24         0            0   
456547    14437309  2184-02-28         0            0   
457966    14437309  2184-05-20         0            0   
461638    14437309  2184-12-22         0            0   
461821    14437309  2185-01-01         0            0   
461945    14437309  2185-01-08         0            0   
462172    14437309  2185-01-20         0            0   
462693    14437309  2185-02-18 

       subject_id   admittime  sick_adm  expire_flag  \
49219    13804408  2121-03-07         0            0   
51175    13804408  2121-07-14         1            0   
66526    13804408  2124-04-24         1            0   

                                       drg_code  \
49219                        CELLULITIS W/O MCC   
51175               Acute Myocardial Infarction   
66526  Head Trauma w/ Coma > 1 Hr or Hemorrhage   

                                                icd_code  
49219         Cellulitis and abscess of leg, except foot  
51175  Subendocardial infarction, initial episode of ...  
66526  Subarachnoid hemorrhage following injury witho...   

Subject:  11648387
--------------------------------------------------------------------------------
        subject_id   admittime  sick_adm  expire_flag  \
400761    11648387  2175-09-03         0            0   
401464    11648387  2175-10-09         0            0   
417476    11648387  2178-03-09         0            0   
4187

        subject_id   admittime  sick_adm  expire_flag  \
465102    11382334  2185-07-01         0            0   
475493    11382334  2187-01-27         1            0   

                                                 drg_code  \
465102                   Extracranial Vascular Procedures   
475493  INTRACRANIAL HEMORRHAGE OR CEREBRAL INFARCTION...   

                                                 icd_code  
465102  Occlusion and stenosis of carotid artery witho...  
475493  Cerebral artery occlusion, unspecified with ce...   

Subject:  18095144
--------------------------------------------------------------------------------
        subject_id   admittime  sick_adm  expire_flag  \
358117    18095144  2169-02-23         1            0   
375282    18095144  2171-10-18         1            0   

                                                 drg_code  \
358117  Cardiac Valve Procedures w/ Cardiac Catheteriz...   
375282  Other Anemia & Disorders Of Blood & Blood Form...   

      

In [117]:
#find patients who had at least visit before CHF

subjects = admissions[admissions.sick_subj==1].groupby('subject_id').agg({'sick_adm':'mean'}).reset_index()
subjects = subjects[subjects.sick_adm < 1].subject_id.reset_index(drop=True)
print('There are', admissions[admissions.sick_subj==1].subject_id.nunique(), 'subjects with CHF, and',
      len(subjects), 'subjects with CHF that had at least one visit before CHF diagnosis.')

There are 21137 subjects with CHF, and 11678 subjects with CHF that had at least one visit before CHF diagnosis.
