In [2]:
# Imports
import pandas as pd
import numpy as np
import psycopg2
import tqdm
from time import gmtime, strftime

#### POSTGRESQL

In [5]:
# Notebook
host = '/tmp'

# Locus
# host = '/var/run/postgresql'

con = psycopg2.connect(dbname ='mimic', user='postgres', password="*2465RMLbru", host=host)
cur = con.cursor()

### Patients table

In [6]:
pat_query = '''SELECT * FROM mimiciii.patients;'''
mimic_patients_df = pd.read_sql_query(pat_query, con)

### Admission table

In [7]:
adm_query = '''SELECT * FROM mimiciii.admissions;'''
mimic_admission_df = pd.read_sql_query(adm_query, con)

#### Merge both tables and calculate true age

In [8]:
# Merge patient and admission tables
mimic_patients_admission_df = pd.merge(mimic_admission_df, mimic_patients_df, on = 'subject_id' , how = 'inner')

In [9]:
# ADMITTIME and DOB objects to datetime
mimic_patients_admission_df["admittime"] = pd.to_datetime(mimic_patients_admission_df["admittime"], format='%Y%m%d %H:%M:%S')
mimic_patients_admission_df["dob"] = pd.to_datetime(mimic_patients_admission_df["dob"], format='%Y%m%d %H:%M:%S')

# True age
mimic_patients_admission_df["ages"] = mimic_patients_admission_df["admittime"].sub(mimic_patients_admission_df["dob"]).dt.days/365.242

# Adult patients only
mimic_patients_admission_df = mimic_patients_admission_df[(mimic_patients_admission_df['ages'] >= 18)]

### ICD-9 Codes table

In [10]:
diagnoses_query = '''SELECT * FROM mimiciii.diagnoses_icd;'''
mimic_diagnoses_df = pd.read_sql_query(diagnoses_query, con)

In [11]:
mimic_patients_diagnoses_df = pd.merge(mimic_patients_admission_df, mimic_diagnoses_df, on = ['subject_id', 'hadm_id'], how = 'inner')

### ICD-9 Descriptions table

In [12]:
diagnoses_descriptions_query = '''SELECT * FROM mimiciii.d_icd_diagnoses;'''
mimic_diagnoses_descriptions_df = pd.read_sql_query(diagnoses_descriptions_query, con)

## Occurrence and Mortality

### By ethnicity

In [13]:
# function by wboeg
def normalize_ethnicity(ethnicity):
    if 'BLACK' in ethnicity:
        return 'BLACK'
    if 'WHITE' in ethnicity:
        return 'WHITE'
    if 'ASIAN' in ethnicity:
        return 'ASIAN'
    if 'LATINO' in ethnicity:
        return 'LATINO'
    if 'HISPANIC' in ethnicity:
        return 'LATINO'
    if 'SOUTH AMERICAN' in ethnicity:
        return 'LATINO'
    if 'CARIBBEAN ISLAND' in ethnicity:
        return 'OTHER'
    if 'AMERICAN INDIAN' in ethnicity:
        return 'OTHER'    
    if 'MIDDLE EASTERN' in ethnicity:
        return 'OTHER'
    if 'MULTI RACE ETHNICITY' in ethnicity:
        return 'OTHER'
    if 'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER' in ethnicity:
        return 'OTHER'
    if 'PORTUGUESE' in ethnicity:
        return 'OTHER'       
    return 'UNKNOWN'

In [14]:
mimic_patients_diagnoses_df['ethnicity'] = mimic_patients_diagnoses_df['ethnicity'].apply(normalize_ethnicity)

In [16]:
# Patients who stayed more than 6 hours (Boag2018)
inds_at_least_6hrs = (mimic_patients_diagnoses_df['dischtime'] - mimic_patients_diagnoses_df['admittime']) > pd.Timedelta(hours=6)
mimic_patients_diagnoses_df = mimic_patients_diagnoses_df.loc[inds_at_least_6hrs]

los = mimic_patients_diagnoses_df['dischtime'] - mimic_patients_diagnoses_df['admittime']
mimic_patients_diagnoses_df['los'] = los.apply(lambda t:t.seconds/3600.)

#### Unique patient with multiple rows with same ICD9

In [48]:
eth_mortality_df = mimic_patients_diagnoses_df.groupby(['icd9_code', 'ethnicity', 'hospital_expire_flag']).size().unstack()
eth_mortality_df = eth_mortality_df.reset_index()
eth_mortality_df.columns.names = [None]
eth_mortality_df.columns = ['icd9_code', 'ethnicity', 'alive', 'dead']
eth_mortality_df.insert(4, 'total', 'NULL')
eth_mortality_df = eth_mortality_df.fillna(0)

# Compute alive, dead and total
for index, row in eth_mortality_df.iterrows():
    eth_mortality_df.at[index, 'total'] = row['alive'] + row['dead']

In [49]:
# We are not considering MULTI RACE ETHNICITY, UNKNOWN or OTHER 
eth_mortality_df = eth_mortality_df[(eth_mortality_df['ethnicity'] != 'OTHER') & (eth_mortality_df['ethnicity'] != 'UNKNOWN')]

In [50]:
# merge mortality with descriptions from each ICD
eth_mortality_df = eth_mortality_df.merge(mimic_diagnoses_descriptions_df, left_on='icd9_code', right_on='icd9_code')

#### Transplanted patients

In [53]:
# Search only for patients with transplant description
transplanted_patients_df = eth_mortality_df[eth_mortality_df['long_title'].str.lower().str.contains('transplant')]

In [54]:
# 1. Mortality significance: only +1 patients dead
transplanted_patients_df = transplanted_patients_df.loc[eth_mortality_df['dead'] > 1]

# 2. Cases with black and white patients
transplanted_patients_df = transplanted_patients_df[transplanted_patients_df['ethnicity'].isin(['WHITE', 'BLACK'])]

# 3. Remove ICD9 codes with only ONE ETHNICITY
for index, row in transplanted_patients_df.iterrows():
    rows = transplanted_patients_df.loc[transplanted_patients_df['icd9_code'] == row['icd9_code']]
    if (len(rows) == 1):
        transplanted_patients_df.drop(rows.index, inplace=True)

In [55]:
transplanted_patients_df = transplanted_patients_df.drop(['row_id', 'short_title'], axis=1)

In [56]:
pd.set_option("display.max_colwidth", 500, 'display.max_rows', 100)
transplanted_patients_df

Unnamed: 0,icd9_code,ethnicity,alive,dead,total,long_title
11136,99681,BLACK,99.0,5.0,104,Complications of transplanted kidney
11138,99681,WHITE,313.0,35.0,348,Complications of transplanted kidney
11140,99682,BLACK,14.0,4.0,18,Complications of transplanted liver
11142,99682,WHITE,112.0,20.0,132,Complications of transplanted liver
11146,99685,BLACK,7.0,2.0,9,Complications of transplanted bone marrow
11148,99685,WHITE,65.0,50.0,115,Complications of transplanted bone marrow
11636,E8780,BLACK,61.0,6.0,67,"Surgical operation with transplant of whole organ causing abnormal patient reaction, or later complication, without mention of misadventure at time of operation"
11638,E8780,WHITE,262.0,29.0,291,"Surgical operation with transplant of whole organ causing abnormal patient reaction, or later complication, without mention of misadventure at time of operation"
12669,V420,BLACK,56.0,5.0,61,Kidney replaced by transplant
12671,V420,WHITE,225.0,16.0,241,Kidney replaced by transplant


##### HOSPITAL ADMISSIONS FOR EACH ICD9

In [57]:
icd9_list = set(transplanted_patients_df['icd9_code'])

icd9_hadm_dict = {}
hadm_ids_list = []

# Only white and black patients and corresponding admissions
for icd9 in icd9_list:
    hadm_ids = mimic_patients_diagnoses_df.loc[(mimic_patients_diagnoses_df["icd9_code"] == icd9)]['hadm_id']
    for hadm_id in hadm_ids:
        icd9_hadm_dict.setdefault(icd9, []).append(hadm_id)
        hadm_ids_list.append(hadm_id)

#### Note Events

In [58]:
# Read Notes
print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

notes_query = 'select distinct n.subject_id, n.hadm_id, n.category, n.text, n.chartdate, n.charttime \
from mimiciii.noteevents n  \
where iserror IS NULL and \
hadm_id in (' + ','.join(map(str, hadm_ids_list)) + ')'

notes = pd.read_sql_query(notes_query, con)
print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

2019-11-08 18:16:55
2019-11-08 18:21:15


In [47]:
# When patient received the transplant ?
words = ['surgical intensive care unit', 'surgical', 'operating room', 'operative', 'procedure']
transplant_ids = set()
for hadm_id, rows in tqdm.tqdm(notes.groupby('hadm_id')):
    for text in rows.text.values:
        if any(w in text.lower() for w in words):
            transplant_ids.add(hadm_id)

100%|██████████| 1507/1507 [00:01<00:00, 1090.15it/s]


In [None]:
# make sure each hadm_id has only died once
assert len(eol_cohort) == len(set(eol_cohort['hadm_id'].values))
print 'eol subjects:', len(set(eol_cohort['hadm_id'].values))

#### Cancer

In [None]:
searchfor = ['neoplasm', 'neoplasms', 'sarcoma', 'carcinoma']
cancer_patients_df = eth_mortality_df[eth_mortality_df['long_title'].str.lower().str.contains('|'.join(searchfor))]

In [None]:
# 1. Mortality significance: only +1 patients dead
cancer_patients_df = cancer_patients_df.loc[cancer_patients_df['dead'] > 1]

# 2. Remove ICD9 codes with only ONE ETHNICITY
for index, row in cancer_patients_df.iterrows():
  rows = cancer_patients_df.loc[cancer_patients_df['icd9_code'] == row['icd9_code']]
  if (len(rows) == 1):
    cancer_patients_df.drop(rows.index, inplace=True)

In [None]:
cancer_patients_df = cancer_patients_df.drop(['row_id', 'short_title'], axis=1)

In [None]:
# Breast cancer
cancer_patients_df[cancer_patients_df['icd9_code'].str.lower().str.contains(r'^174.*')]

In [None]:
#  Prostate cancer
cancer_patients_df[cancer_patients_df['icd9_code'].str.lower().str.contains(r'^185.*')]

In [None]:
# Malignant Neoplasm Of Other And Unspecified Sites
pd.set_option("display.max_colwidth", 500, 'display.max_rows', 200)
cancer_patients_df[cancer_patients_df['icd9_code'].str.lower().str.contains(r'^19.*')]

#### Diabetes

In [51]:
diabetes_patients_df = eth_mortality_df[eth_mortality_df['long_title'].str.lower().str.contains('diabetes')]

In [52]:
diabetes_patients_df

Unnamed: 0,icd9_code,ethnicity,alive,dead,total,row_id,short_title,long_title
1489,24900,ASIAN,1.0,0.0,1,1813,Sec DM wo cmp nt st uncn,"Secondary diabetes mellitus without mention of complication, not stated as uncontrolled, or unspecified"
1490,24900,BLACK,6.0,1.0,7,1813,Sec DM wo cmp nt st uncn,"Secondary diabetes mellitus without mention of complication, not stated as uncontrolled, or unspecified"
1491,24900,LATINO,10.0,1.0,11,1813,Sec DM wo cmp nt st uncn,"Secondary diabetes mellitus without mention of complication, not stated as uncontrolled, or unspecified"
1492,24900,WHITE,65.0,8.0,73,1813,Sec DM wo cmp nt st uncn,"Secondary diabetes mellitus without mention of complication, not stated as uncontrolled, or unspecified"
1493,24901,ASIAN,1.0,0.0,1,1814,Sec DM wo comp uncontrld,"Secondary diabetes mellitus without mention of complication, uncontrolled"
1494,24901,BLACK,1.0,0.0,1,1814,Sec DM wo comp uncontrld,"Secondary diabetes mellitus without mention of complication, uncontrolled"
1495,24901,WHITE,2.0,0.0,2,1814,Sec DM wo comp uncontrld,"Secondary diabetes mellitus without mention of complication, uncontrolled"
1496,24911,BLACK,1.0,0.0,1,1816,Sec DM ketoacd uncntrld,"Secondary diabetes mellitus with ketoacidosis, uncontrolled"
1497,24911,WHITE,1.0,0.0,1,1816,Sec DM ketoacd uncntrld,"Secondary diabetes mellitus with ketoacidosis, uncontrolled"
1498,24950,WHITE,1.0,0.0,1,1823,Sec DM ophth nt st uncn,"Secondary diabetes mellitus with ophthalmic manifestations, not stated as uncontrolled, or unspecified"
