In [1]:
# Imports
import pandas as pd
import numpy as np
import psycopg2
import tqdm
from time import gmtime, strftime

#### POSTGRESQL

In [2]:
# Notebook
host = '/tmp'

# Locus
# host = '/var/run/postgresql'

con = psycopg2.connect(dbname ='mimic', user='postgres', password="*2465RMLbru", host=host)
cur = con.cursor()

### Patients table

In [3]:
pat_query = '''SELECT * FROM mimiciii.patients;'''
mimic_patients_df = pd.read_sql_query(pat_query, con)

### Admission table

In [4]:
adm_query = '''SELECT * FROM mimiciii.admissions;'''
mimic_admission_df = pd.read_sql_query(adm_query, con)

#### Merge both tables and calculate true age

In [5]:
# Merge patient and admission tables
mimic_patients_admission_df = pd.merge(mimic_admission_df, mimic_patients_df, on = 'subject_id' , how = 'inner')

In [6]:
# ADMITTIME and DOB objects to datetime
mimic_patients_admission_df["admittime"] = pd.to_datetime(mimic_patients_admission_df["admittime"], format='%Y%m%d %H:%M:%S')
mimic_patients_admission_df["dob"] = pd.to_datetime(mimic_patients_admission_df["dob"], format='%Y%m%d %H:%M:%S')

# True age
mimic_patients_admission_df["ages"] = mimic_patients_admission_df["admittime"].sub(mimic_patients_admission_df["dob"]).dt.days/365.242

# Adult patients only
mimic_patients_admission_df = mimic_patients_admission_df[(mimic_patients_admission_df['ages'] >= 18)]

### ICD-9 Codes table

In [7]:
diagnoses_query = '''SELECT * FROM mimiciii.diagnoses_icd;'''
mimic_diagnoses_df = pd.read_sql_query(diagnoses_query, con)

In [8]:
mimic_patients_diagnoses_df = pd.merge(mimic_patients_admission_df, mimic_diagnoses_df, on = ['subject_id', 'hadm_id'], how = 'inner')

### ICD-9 Descriptions table

In [9]:
diagnoses_descriptions_query = '''SELECT * FROM mimiciii.d_icd_diagnoses;'''
mimic_diagnoses_descriptions_df = pd.read_sql_query(diagnoses_descriptions_query, con)

## Occurrence and Mortality

### By ethnicity

In [10]:
# function by wboeg
def normalize_ethnicity(ethnicity):
    if 'BLACK' in ethnicity:
        return 'BLACK'
    if 'WHITE' in ethnicity:
        return 'WHITE'
    if 'ASIAN' in ethnicity:
        return 'ASIAN'
    if 'LATINO' in ethnicity:
        return 'LATINO'
    if 'HISPANIC' in ethnicity:
        return 'LATINO'
    if 'SOUTH AMERICAN' in ethnicity:
        return 'LATINO'
    if 'CARIBBEAN ISLAND' in ethnicity:
        return 'OTHER'
    if 'AMERICAN INDIAN' in ethnicity:
        return 'OTHER'    
    if 'MIDDLE EASTERN' in ethnicity:
        return 'OTHER'
    if 'MULTI RACE ETHNICITY' in ethnicity:
        return 'OTHER'
    if 'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER' in ethnicity:
        return 'OTHER'
    if 'PORTUGUESE' in ethnicity:
        return 'OTHER'       
    return 'UNKNOWN'

def normalize_insurance(ins):
    if ins in ['Government', 'Medicaid', 'Medicare']:
        return 'Public'
    elif ins == 'Private':
        return 'Private'
    else:
        return 'Self-Pay'

In [11]:
mimic_patients_diagnoses_df['ethnicity'] = mimic_patients_diagnoses_df['ethnicity'].apply(normalize_ethnicity)

In [12]:
# Patients who stayed more than 12 hours (Boag2018)
inds_at_least_6hrs = (mimic_patients_diagnoses_df['dischtime'] - mimic_patients_diagnoses_df['admittime']) > pd.Timedelta(hours=12)
mimic_patients_diagnoses_df = mimic_patients_diagnoses_df.loc[inds_at_least_6hrs]

los = mimic_patients_diagnoses_df['dischtime'] - mimic_patients_diagnoses_df['admittime']
mimic_patients_diagnoses_df['los'] = los.apply(lambda t:t.seconds/3600.)

#### Unique patient with multiple rows with same ICD9

In [13]:
eth_mortality_df = mimic_patients_diagnoses_df.groupby(['icd9_code', 'ethnicity', 'hospital_expire_flag']).size().unstack()
eth_mortality_df = eth_mortality_df.reset_index()
eth_mortality_df.columns.names = [None]
eth_mortality_df.columns = ['icd9_code', 'ethnicity', 'alive', 'dead']
eth_mortality_df.insert(4, 'total', '0')
eth_mortality_df = eth_mortality_df.fillna(0)
eth_mortality_df['total'] = eth_mortality_df['total'].astype(float)

# Compute alive, dead and total
for index, row in eth_mortality_df.iterrows():
    eth_mortality_df.at[index, 'total'] = row['alive'] + row['dead']
    # percentage
    if row['total'] != 0:
        eth_mortality_df.at[index, 'alive'] = (row['alive']/row['total'])*100
        eth_mortality_df.at[index, 'dead'] = (row['dead']/row['total'])*100

In [14]:
# We are not considering MULTI RACE ETHNICITY, UNKNOWN or OTHER 
eth_mortality_df = eth_mortality_df[(eth_mortality_df['ethnicity'] != 'OTHER') & (eth_mortality_df['ethnicity'] != 'UNKNOWN')]

In [15]:
# merge mortality with descriptions from each ICD
eth_mortality_df = eth_mortality_df.merge(mimic_diagnoses_descriptions_df, left_on='icd9_code', right_on='icd9_code')

#### Transplanted patients

In [16]:
# Search only for patients with transplant description
transplanted_patients_df = eth_mortality_df[eth_mortality_df['long_title'].str.lower().str.contains('transplant')].copy()

In [17]:
# 1. Mortality significance: only +1 patients dead
transplanted_patients_df = transplanted_patients_df.loc[eth_mortality_df['dead'] > 1]

# 2. Cases with black and white patients
# transplanted_patients_df = transplanted_patients_df[transplanted_patients_df['ethnicity'].isin(['WHITE', 'BLACK'])]

# 3. Remove ICD9 codes with only ONE ETHNICITY
for index, row in transplanted_patients_df.iterrows():
    rows = transplanted_patients_df.loc[transplanted_patients_df['icd9_code'] == row['icd9_code']]
    if (len(rows) == 1):
        transplanted_patients_df.drop(rows.index, inplace=True)

In [18]:
# pd.set_option("display.max_colwidth", 500, 'display.max_rows', 100)
transplanted_patients_df = transplanted_patients_df.drop(['row_id', 'short_title'], axis=1)

#### Note Events

In [19]:
# Read Notes
# print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

# notes_query = 'select distinct n.subject_id, n.hadm_id, n.category, n.text, n.chartdate, n.charttime \
# from mimiciii.noteevents n  \
# where iserror IS NULL and \
# hadm_id in (' + ','.join(map(str, hadm_ids_list)) + ')'

# notes = pd.read_sql_query(notes_query, con)
# print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

In [20]:
# When patient received the transplant ?
# words = ['surgical intensive care unit', 'surgical', 'operating room', 'operative', 'procedure']
# transplant_ids = set()
# for hadm_id, rows in tqdm.tqdm(notes.groupby('hadm_id')):
#     for text in rows.text.values:
#         if any(w in text.lower() for w in words):
#             transplant_ids.add(hadm_id)

#### Cancer

In [21]:
searchfor = ['neoplasm', 'neoplasms', 'sarcoma', 'carcinoma']
cancer_patients_df = eth_mortality_df[eth_mortality_df['long_title'].str.lower().str.contains('|'.join(searchfor))].copy()

In [22]:
# 1. Mortality significance: only +1 patients dead
cancer_patients_df = cancer_patients_df.loc[cancer_patients_df['dead'] > 1]

# 2. Remove ICD9 codes with only ONE ETHNICITY
for index, row in cancer_patients_df.iterrows():
    rows = cancer_patients_df.loc[cancer_patients_df['icd9_code'] == row['icd9_code']]
    if (len(rows) == 1):
        cancer_patients_df.drop(rows.index, inplace=True)

In [23]:
cancer_patients_df = cancer_patients_df.drop(['row_id', 'short_title'], axis=1)

In [24]:
# Breast cancer
# cancer_patients_df[cancer_patients_df['icd9_code'].str.lower().str.contains(r'^174.*')]

# Prostate cancer
# cancer_patients_df[cancer_patients_df['icd9_code'].str.lower().str.contains(r'^185.*')]

# Malignant Neoplasm Of Other And Unspecified Sites
# pd.set_option("display.max_colwidth", 500, 'display.max_rows', 200)
# cancer_patients_df[cancer_patients_df['icd9_code'].str.lower().str.contains(r'^19.*')]

#### Diabetes

In [25]:
diabetes_patients_df = eth_mortality_df[eth_mortality_df['long_title'].str.lower().str.contains('diabetes')].copy()

In [26]:
# 1. Mortality significance: only +1 patients dead
diabetes_patients_df = diabetes_patients_df.loc[diabetes_patients_df['dead'] > 1]

# 2. Remove ICD9 codes with only ONE ETHNICITY
for index, row in diabetes_patients_df.iterrows():
    rows = diabetes_patients_df.loc[diabetes_patients_df['icd9_code'] == row['icd9_code']]
    if (len(rows) == 1):
        diabetes_patients_df.drop(rows.index, inplace=True)

In [27]:
diabetes_patients_df = diabetes_patients_df.drop(['row_id', 'short_title'], axis=1)

### Cohort

##### HOSPITAL ADMISSIONS FOR EACH ICD9

In [28]:
icd9_list = set(transplanted_patients_df['icd9_code'])
icd9_list.update(cancer_patients_df['icd9_code'])
icd9_list.update(diabetes_patients_df['icd9_code'])

hadm_ids_list = []
cohort_df = pd.DataFrame()

for icd9 in icd9_list:
    patients = mimic_patients_diagnoses_df.loc[(mimic_patients_diagnoses_df["icd9_code"] == icd9)].copy()
    cohort_df = cohort_df.append(patients, ignore_index=True)
    for hadm_id in patients['hadm_id']:
        hadm_ids_list.append(hadm_id)

In [29]:
cohort_df = cohort_df[(cohort_df['ethnicity'] != 'OTHER') & (cohort_df['ethnicity'] != 'UNKNOWN')]

In [30]:
cohort_df['ethnicity'] = cohort_df['ethnicity'].apply(normalize_insurance)

### MIMIC III Population characteristics

In [32]:
# Table One
from tableone import TableOne

# optionally, a categorical variable for stratification
groupby = ['ethnicity']

# columns to be summarized
columns = ['gender', 'insurance','los', 'ages'] 

# columns containing categorical variables
categorical = ['gender', 'insurance']

# non-normal variables
nonnormal = ['ages', 'los']

# alternative labels
labels={'los': 'Length of Stay', 'age': 'Age', 'ethnicity':'Ethnicity',
        'gender':'Gender', 'insurance':'Insurance'}

# combine all information
grouped_df = cohort_df

# create an instance of TableOne with the input arguments
grouped_table = TableOne(grouped_df, columns, categorical, groupby, nonnormal, rename=labels, isnull=False)

# view tableone
grouped_table

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by Ethnicity
Unnamed: 0_level_1,Unnamed: 1_level_1,Self-Pay
variable,level,Unnamed: 2_level_2
n,,26056
Gender,F,11524 (44.2)
Gender,M,14532 (55.8)
Insurance,Government,406 (1.6)
Insurance,Medicaid,2000 (7.7)
Insurance,Medicare,16167 (62.0)
Insurance,Private,7389 (28.4)
Insurance,Self Pay,94 (0.4)
Length of Stay,,"13.8 [5.4,19.6]"
ages,,"66.8 [57.3,76.2]"


In [33]:
# TODO
# load oasis severity scores
# oasis_query = 'select hadm_id,max(oasis) as oasis from mimiciii.oasis group by hadm_id'
# oasis_df = pd.read_sql_query(oasis_query, con)
# severities = dict(oasis_df[['hadm_id','oasis']].values)

### Classifiers

In [34]:
# TODO