In [1]:
# Imports
import pandas as pd
import numpy as np
import psycopg2
import tqdm
from time import gmtime, strftime

#### POSTGRESQL

In [2]:
# Notebook
host = '/tmp'

# Locus
# host = '/var/run/postgresql'

con = psycopg2.connect(dbname ='mimic', user='postgres', password="*2465RMLbru", host=host)
cur = con.cursor()

### Patients table

In [3]:
pat_query = '''SELECT * FROM mimiciii.patients;'''
mimic_patients_df = pd.read_sql_query(pat_query, con)

### Admission table

In [4]:
adm_query = '''SELECT * FROM mimiciii.admissions;'''
mimic_admission_df = pd.read_sql_query(adm_query, con)

#### Merge both tables and calculate true age

In [5]:
# Merge patient and admission tables
mimic_patients_admission_df = pd.merge(mimic_admission_df, mimic_patients_df, on = 'subject_id' , how = 'inner')

In [6]:
# ADMITTIME and DOB objects to datetime
mimic_patients_admission_df["admittime"] = pd.to_datetime(mimic_patients_admission_df["admittime"], format='%Y%m%d %H:%M:%S')
mimic_patients_admission_df["dob"] = pd.to_datetime(mimic_patients_admission_df["dob"], format='%Y%m%d %H:%M:%S')

# True age
mimic_patients_admission_df["ages"] = mimic_patients_admission_df["admittime"].sub(mimic_patients_admission_df["dob"]).dt.days/365.242

# Adult patients only
mimic_patients_admission_df = mimic_patients_admission_df[(mimic_patients_admission_df['ages'] >= 18)]

### ICD-9 Codes table

In [7]:
diagnoses_query = '''SELECT * FROM mimiciii.diagnoses_icd;'''
mimic_diagnoses_df = pd.read_sql_query(diagnoses_query, con)

In [8]:
mimic_patients_diagnoses_df = pd.merge(mimic_patients_admission_df, mimic_diagnoses_df, on = ['subject_id', 'hadm_id'], how = 'inner')

### ICD-9 Descriptions table

In [9]:
diagnoses_descriptions_query = '''SELECT * FROM mimiciii.d_icd_diagnoses;'''
mimic_diagnoses_descriptions_df = pd.read_sql_query(diagnoses_descriptions_query, con)

## Occurrence and Mortality

In [10]:
# function by wboeg
def normalize_ethnicity(ethnicity):
    if 'BLACK' in ethnicity:
        return 'BLACK'
    if 'WHITE' in ethnicity:
        return 'WHITE'
    if 'ASIAN' in ethnicity:
        return 'ASIAN'
    if 'LATINO' in ethnicity:
        return 'LATINO'
    if 'HISPANIC' in ethnicity:
        return 'LATINO'
    if 'SOUTH AMERICAN' in ethnicity:
        return 'LATINO'
    if 'CARIBBEAN ISLAND' in ethnicity:
        return 'OTHER'
    if 'AMERICAN INDIAN' in ethnicity:
        return 'OTHER'    
    if 'MIDDLE EASTERN' in ethnicity:
        return 'OTHER'
    if 'MULTI RACE ETHNICITY' in ethnicity:
        return 'OTHER'
    if 'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER' in ethnicity:
        return 'OTHER'
    if 'PORTUGUESE' in ethnicity:
        return 'OTHER'       
    return 'UNKNOWN'

def normalize_insurance(ins):
    if ins in ['Government', 'Medicaid', 'Medicare']:
        return 'Public'
    elif ins == 'Private':
        return 'Private'
    else:
        return 'Self-Pay'

In [11]:
mimic_patients_diagnoses_df['ethnicity'] = mimic_patients_diagnoses_df['ethnicity'].apply(normalize_ethnicity)
mimic_patients_diagnoses_df['insurance'] = mimic_patients_diagnoses_df['insurance'].apply(normalize_insurance)

In [12]:
# Patients who stayed more than 12 hours (Boag2018)
inds_at_least_12hrs = (mimic_patients_diagnoses_df['dischtime'] - mimic_patients_diagnoses_df['admittime']) > pd.Timedelta(hours=12)
selected_mimic_patients_diagnoses_df = mimic_patients_diagnoses_df.loc[inds_at_least_12hrs].copy()

los = selected_mimic_patients_diagnoses_df['dischtime'] - selected_mimic_patients_diagnoses_df['admittime']
selected_mimic_patients_diagnoses_df['los'] = los.apply(lambda t:t.seconds/3600.)

#### Unique patient with multiple rows with same ICD9

In [31]:
eth_mortality_df = selected_mimic_patients_diagnoses_df.groupby(['icd9_code', 'ethnicity', 'hospital_expire_flag']).size().unstack()
eth_mortality_df = eth_mortality_df.reset_index()
eth_mortality_df.columns.names = [None]
eth_mortality_df.columns = ['icd9_code', 'ethnicity', 'alive', 'dead']
eth_mortality_df.insert(4, 'total', '0')
eth_mortality_df = eth_mortality_df.fillna(0)
eth_mortality_df['total'] = eth_mortality_df['total'].astype(float)

# Compute alive, dead and total
for index, row in eth_mortality_df.iterrows():
    eth_mortality_df.at[index, 'total'] = row['alive'] + row['dead']
    # percentage
    # if eth_mortality_df.at[index,'total'] > 0:
    #    eth_mortality_df.at[index, 'alive'] = (row['alive']/eth_mortality_df.at[index,'total'])*100
    #    eth_mortality_df.at[index, 'dead'] = (row['dead']/eth_mortality_df.at[index,'total'])*100

In [32]:
# We are not considering MULTI RACE ETHNICITY, UNKNOWN or OTHER 
eth_mortality_df = eth_mortality_df[(eth_mortality_df['ethnicity'] != 'OTHER') & (eth_mortality_df['ethnicity'] != 'UNKNOWN')]

In [33]:
# merge mortality with descriptions from each ICD
eth_mortality_df = eth_mortality_df.merge(mimic_diagnoses_descriptions_df, left_on='icd9_code', right_on='icd9_code')

#### Transplanted patients

In [34]:
# Search only for patients with transplant description
transplanted_patients_df = eth_mortality_df[eth_mortality_df['long_title'].str.lower().str.contains('transplant')].copy()

In [35]:
# 1. Mortality significance: only +1 patients dead
transplanted_patients_df = transplanted_patients_df.loc[eth_mortality_df['dead'] > 1]

# 3. Remove ICD9 codes with only ONE ETHNICITY
for index, row in transplanted_patients_df.iterrows():
    rows = transplanted_patients_df.loc[transplanted_patients_df['icd9_code'] == row['icd9_code']]
    if (len(rows) == 1):
        transplanted_patients_df.drop(rows.index, inplace=True)

In [36]:
# pd.set_option("display.max_colwidth", 500, 'display.max_rows', 100)
transplanted_patients_df = transplanted_patients_df.drop(['row_id', 'short_title'], axis=1)

#### Cancer

In [37]:
searchfor = ['neoplasm', 'neoplasms', 'sarcoma', 'carcinoma']
cancer_patients_df = eth_mortality_df[eth_mortality_df['long_title'].str.lower().str.contains('|'.join(searchfor))].copy()

In [38]:
# 1. Mortality significance: only +1 patients dead
cancer_patients_df = cancer_patients_df.loc[cancer_patients_df['dead'] > 1]

# 2. Remove ICD9 codes with only ONE ETHNICITY
for index, row in cancer_patients_df.iterrows():
    rows = cancer_patients_df.loc[cancer_patients_df['icd9_code'] == row['icd9_code']]
    if (len(rows) == 1):
        cancer_patients_df.drop(rows.index, inplace=True)

In [39]:
cancer_patients_df = cancer_patients_df.drop(['row_id', 'short_title'], axis=1)

In [40]:
# Breast cancer
# cancer_patients_df[cancer_patients_df['icd9_code'].str.lower().str.contains(r'^174.*')]

# Prostate cancer
# cancer_patients_df[cancer_patients_df['icd9_code'].str.lower().str.contains(r'^185.*')]

# Malignant Neoplasm Of Other And Unspecified Sites
# pd.set_option("display.max_colwidth", 500, 'display.max_rows', 200)
# cancer_patients_df[cancer_patients_df['icd9_code'].str.lower().str.contains(r'^19.*')]

#### Diabetes

In [41]:
diabetes_patients_df = eth_mortality_df[eth_mortality_df['long_title'].str.lower().str.contains('diabetes')].copy()

In [42]:
# 1. Mortality significance: only +1 patients dead
diabetes_patients_df = diabetes_patients_df.loc[diabetes_patients_df['dead'] > 1]

# 2. Remove ICD9 codes with only ONE ETHNICITY
for index, row in diabetes_patients_df.iterrows():
    rows = diabetes_patients_df.loc[diabetes_patients_df['icd9_code'] == row['icd9_code']]
    if (len(rows) == 1):
        diabetes_patients_df.drop(rows.index, inplace=True)

In [43]:
diabetes_patients_df = diabetes_patients_df.drop(['row_id', 'short_title'], axis=1)

### Cohort

##### HOSPITAL ADMISSIONS FOR EACH ICD9

In [59]:
icd9_list = set(transplanted_patients_df['icd9_code'])
icd9_list.update(cancer_patients_df['icd9_code'])
icd9_list.update(diabetes_patients_df['icd9_code'])

hadm_ids_list = set()
cohort_df = pd.DataFrame()

for icd9 in icd9_list:
    patients = selected_mimic_patients_diagnoses_df.loc[(selected_mimic_patients_diagnoses_df["icd9_code"] == icd9)].copy()
    cohort_df = cohort_df.append(patients, ignore_index=True)
    for hadm_id in patients['hadm_id']:
        hadm_ids_list.add(hadm_id)

In [60]:
cohort_df = cohort_df[(cohort_df['ethnicity'] != 'OTHER') & (cohort_df['ethnicity'] != 'UNKNOWN')]

### MIMIC III Population characteristics

In [61]:
# Pollard TJ, Johnson AEW, Raffa JD, Mark RG (2018). tableone: An open source
# Python package for producing summary statistics for research papers.
# JAMIA Open, Volume 1, Issue 1, 1 July 2018, Pages 26-31.
# https://doi.org/10.1093/jamiaopen/ooy012
    
# Table One
from tableone import TableOne

# optionally, a categorical variable for stratification
groupby = ['ethnicity']

# columns to be summarized
columns = ['gender', 'insurance','los', 'ages'] 

# columns containing categorical variables
categorical = ['gender', 'insurance']

# non-normal variables
nonnormal = ['ages', 'los']

# alternative labels
labels={'los': 'Length of Stay', 'age': 'Age', 'ethnicity':'Ethnicity',
        'gender':'Gender', 'insurance':'Insurance'}

# combine all information
grouped_df = cohort_df

# create an instance of TableOne with the input arguments
grouped_table = TableOne(grouped_df, columns, categorical, groupby, nonnormal, rename=labels, isnull=False)

# view tableone
grouped_table

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by Ethnicity,Grouped by Ethnicity,Grouped by Ethnicity,Grouped by Ethnicity
Unnamed: 0_level_1,Unnamed: 1_level_1,ASIAN,BLACK,LATINO,WHITE
variable,level,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
n,,789,3596,1091,20580
Gender,F,341 (43.2),1967 (54.7),442 (40.5),8774 (42.6)
Gender,M,448 (56.8),1629 (45.3),649 (59.5),11806 (57.4)
Insurance,Private,165 (20.9),685 (19.0),210 (19.2),6329 (30.8)
Insurance,Public,623 (79.0),2876 (80.0),871 (79.8),14203 (69.0)
Insurance,Self-Pay,1 (0.1),35 (1.0),10 (0.9),48 (0.2)
Length of Stay,,"15.3 [6.2,20.0]","14.1 [5.2,19.8]","13.3 [5.9,19.3]","13.8 [5.5,19.6]"
ages,,"63.5 [55.1,75.2]","64.0 [55.0,73.0]","60.3 [50.3,70.1]","67.7 [58.3,77.0]"


### Oasis score

-- ------------------------------------------------------------------
- Title: Oxford Acute Severity of Illness Score (OASIS)
- This query extracts the Oxford acute severity of illness score.
- This score is a measure of severity of illness for patients in the ICU.
- The score is calculated on the first day of each ICU patients' stay.
-- ------------------------------------------------------------------
- Reference for OASIS:
-    Johnson, Alistair EW, Andrew A. Kramer, and Gari D. Clifford.
-    "A new severity of illness scale using a subset of acute physiology and chronic health evaluation data elements shows comparable predictive accuracy*."
-    Critical care medicine 41, no. 7 (2013): 1711-1718.
-- ------------------------------------------------------------------
- Variables used in OASIS:
-  Heart rate, GCS, MAP, Temperature, Respiratory rate, Ventilation status (sourced from CHARTEVENTS)
-  Urine output (sourced from OUTPUTEVENTS)
-  Elective surgery (sourced from ADMISSIONS and SERVICES)
-  Pre-ICU in-hospital length of stay (sourced from ADMISSIONS and ICUSTAYS)
-  Age (sourced from PATIENTS)

In [77]:
# Get the OASIS scores
oasis_query = 'SELECT distinct hadm_id, max(oasis) as oasis FROM mimiciii.oasis GROUP BY hadm_id'
oasis = pd.read_sql_query(oasis_query, con)

In [None]:
def select(scores):
    return [scores[hadm_id] for hadm_id in hadm_ids_list]

# Low: 0-32
# Medium: 33-40
# High: 41+
selected_patients_oasis_score = select(oasis)

### Cox Regression

In [None]:
# TODO
# lifelines