In [0]:
# Imports
from IPython.display import Image
import pandas as pd
import numpy as np
from scipy.stats import binom_test

In [0]:
PATH = '/content/drive/My Drive/Master/UFMG/Dissertation/Exploration/'

In [3]:
# READ CSV - Method 2 (Drive)
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


## Read tables

### Patients table

In [0]:
mimic_patients_df = pd.read_csv(PATH + 'mimic-III/PATIENTS.csv')

### Admission table

In [0]:
mimic_admission_df = pd.read_csv(PATH + 'mimic-III/ADMISSIONS.csv')

#### Merge both tables and calculate true age

In [0]:
# Merge patient and admission tables
mimic_patients_admission_df = pd.merge(mimic_admission_df, mimic_patients_df, on = 'SUBJECT_ID' , how = 'inner')

In [0]:
# ADMITTIME and DOB objects to datetime
mimic_patients_admission_df["ADMITTIME"] = pd.to_datetime(mimic_patients_admission_df["ADMITTIME"], format='%Y%m%d %H:%M:%S')
mimic_patients_admission_df["DOB"] = pd.to_datetime(mimic_patients_admission_df["DOB"], format='%Y%m%d %H:%M:%S')

# True age
mimic_patients_admission_df["AGES"] = mimic_patients_admission_df["ADMITTIME"].sub(mimic_patients_admission_df["DOB"]).dt.days/365.242

# Adult patients only
mimic_patients_admission_df = mimic_patients_admission_df[(mimic_patients_admission_df.AGES >= 18)]

### ICD-9 Codes table

In [0]:
mimic_diagnoses_df = pd.read_csv(PATH + 'mimic-III/DIAGNOSES_ICD.csv')

In [0]:
mimic_patients_diagnoses_df = pd.merge(mimic_patients_admission_df, mimic_diagnoses_df, on = 'SUBJECT_ID' , how = 'inner')

### ICD-9 Descriptions table

In [0]:
mimic_diagnoses_descriptions_df = pd.read_csv(PATH + 'mimic-III/D_ICD_DIAGNOSES.csv')

## Occurrence and Mortality

### By ethnicity

#### Creating new table with regex


In [0]:
# mimic_eth_mortality_df = mimic_patients_diagnoses_df

# mimic_eth_mortality_df = mimic_eth_mortality_df.replace(to_replace='^BLACK.*', value='BLACK', regex=True)
# mimic_eth_mortality_df = mimic_eth_mortality_df.replace(to_replace='^WHITE.*', value='WHITE', regex=True)
# mimic_eth_mortality_df = mimic_eth_mortality_df.replace(to_replace='^ASIAN.*', value='ASIAN', regex=True)
# mimic_eth_mortality_df = mimic_eth_mortality_df.replace(to_replace='^LATINO.*', value='LATINO', regex=True)
# mimic_eth_mortality_df = mimic_eth_mortality_df.replace(to_replace='^HISPANIC.*', value='LATINO', regex=True)
# mimic_eth_mortality_df = mimic_eth_mortality_df.replace(to_replace='^SOUTH AMERICAN.*', value='LATINO', regex=True)

# mimic_eth_mortality_df = mimic_eth_mortality_df.replace(to_replace='^CARIBBEAN ISLAND.*', value='OTHER', regex=True)
# mimic_eth_mortality_df = mimic_eth_mortality_df.replace(to_replace='^AMERICAN INDIAN.*', value='OTHER', regex=True)
# mimic_eth_mortality_df = mimic_eth_mortality_df.replace(to_replace='^MIDDLE EASTERN.*', value='OTHER', regex=True)
# mimic_eth_mortality_df = mimic_eth_mortality_df.replace(to_replace='^MULTI RACE ETHNICITY.*', value='OTHER', regex=True)
# mimic_eth_mortality_df = mimic_eth_mortality_df.replace(to_replace='^NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER.*', value='OTHER', regex=True)
# mimic_eth_mortality_df = mimic_eth_mortality_df.replace(to_replace='^PORTUGUESE.*', value='OTHER', regex=True)

# mimic_eth_mortality_df = mimic_eth_mortality_df.replace(to_replace='^UNABLE TO OBTAIN.*', value='UNKNOWN', regex=True)
# mimic_eth_mortality_df = mimic_eth_mortality_df.replace(to_replace='^UNKNOWN.*', value='UNKNOWN', regex=True)
# mimic_eth_mortality_df = mimic_eth_mortality_df.replace(to_replace='^PATIENT DECLINED TO ANSWER.*', value='UNKNOWN', regex=True)

# mimic_eth_mortality_df.to_csv('ETHNICITY_MORTALITY.csv', index = None, header=True)
# !cp ETHNICITY_MORTALITY.csv PATH+'mimic-III/'

#### Read table

In [0]:
mimic_eth_mortality_df = pd.read_csv(PATH + 'mimic-III/ETHNICITY_MORTALITY.csv')

In [0]:
eth_mortality_df = mimic_eth_mortality_df.groupby(['ICD9_CODE', 'ETHNICITY', 'HOSPITAL_EXPIRE_FLAG']).size().unstack()
eth_mortality_df = eth_mortality_df.reset_index()
eth_mortality_df.columns.names = [None]
eth_mortality_df.columns = ['ICD9_CODE', 'ETHNICITY', 'ALIVE', 'DEAD']
eth_mortality_df.insert(4, 'TOTAL', 'NULL')
eth_mortality_df = eth_mortality_df.fillna(0)

# Compute alive, dead and total
for index, row in eth_mortality_df.iterrows():
  eth_mortality_df.at[index, 'TOTAL'] = row.ALIVE + row.DEAD

#### Data integrity

In [14]:
# missing codes
print(len(mimic_eth_mortality_df[mimic_eth_mortality_df['ICD9_CODE'].isnull()]))
# total patients from original table
print(len(mimic_patients_diagnoses_df))
# total patients from new table
print(eth_mortality_df["TOTAL"].sum())

97
1259917
1259820.0


In [15]:
# All patients
eth_mortality_group = eth_mortality_df.groupby('ETHNICITY')['TOTAL'].sum()
print(eth_mortality_group.sum())
print(eth_mortality_group)

1259820.0
ETHNICITY
ASIAN       26886.0
BLACK      254918.0
LATINO      49482.0
OTHER       34048.0
UNKNOWN     62554.0
WHITE      831932.0
Name: TOTAL, dtype: float64


In [0]:
eth_mortality_df = eth_mortality_df[(eth_mortality_df.ETHNICITY != 'OTHER') & (eth_mortality_df.ETHNICITY != 'UNKNOWN')]

In [0]:
# 1. Mortality significance: only +2 patients dead
eth_mortality_significance_df = eth_mortality_df.loc[eth_mortality_df.DEAD > 2].copy()

# 2. Remove ICD9 codes with only ONE ETHNICITY
for index, row in eth_mortality_significance_df.iterrows():
  rows = eth_mortality_significance_df.loc[eth_mortality_significance_df.ICD9_CODE == row.ICD9_CODE]
  if (len(rows) == 1):
    eth_mortality_significance_df.drop(rows.index, inplace=True)

#### Binomial test

In [0]:
# 3. Binomial test

icd9_code, ethnicity, p_value_occ, p_value_mort = ([] for i in range(4))
asian_t, black_t, latino_t, white_t = ([] for i in range(4))
asian_d, black_d, latino_d, white_d = ([] for i in range(4))
icd_pass = True
 
for index, row in eth_mortality_significance_df.iterrows():
  if row.ICD9_CODE not in icd9_code: # icd must be evaluate only once
    icd_pass = True

  if icd_pass:
    rows = eth_mortality_significance_df[eth_mortality_significance_df["ICD9_CODE"] == row.ICD9_CODE]
    icd_pass = False
    
    asian_total, black_total, latino_total, white_total = 0, 0, 0, 0
    asian_dead, black_dead, latino_dead, white_dead = 0, 0, 0, 0
    asian_percentage_dead, black_percentage_dead, latino_percentage_dead, white_percentage_dead = 0, 0, 0, 0
    
    if rows.loc[rows.ETHNICITY == "ASIAN"]["TOTAL"].any():
      asian_total = rows.loc[rows.ETHNICITY == "ASIAN"]["TOTAL"].item()
      asian_dead = rows.loc[rows.ETHNICITY == "ASIAN"]["DEAD"].item()
      asian_percentage_dead = asian_dead / asian_total
      
    if rows.loc[rows.ETHNICITY == "BLACK"]["TOTAL"].any():
      black_total = rows.loc[rows.ETHNICITY == "BLACK"]["TOTAL"].item()
      black_dead = rows.loc[rows.ETHNICITY == "BLACK"]["DEAD"].item()
      black_percentage_dead = black_dead / black_total
      
    if rows.loc[rows.ETHNICITY == "LATINO"]["TOTAL"].any():
      latino_total = rows.loc[rows.ETHNICITY == "LATINO"]["TOTAL"].item()
      latino_dead = rows.loc[rows.ETHNICITY == "LATINO"]["DEAD"].item()
      latino_percentage_dead = latino_dead / latino_total
      
    if rows.loc[rows.ETHNICITY == "WHITE"]["TOTAL"].any():
      white_total = rows.loc[rows.ETHNICITY == "WHITE"]["TOTAL"].item()
      white_dead = rows.loc[rows.ETHNICITY == "WHITE"]["DEAD"].item()
      white_percentage_dead = white_dead / white_total
      
    asian_percentage_pop = asian_total/eth_mortality_group.ASIAN
    black_percentage_pop = black_total/eth_mortality_group.BLACK
    latino_percentage_pop = latino_total/eth_mortality_group.LATINO
    white_percentage_pop = white_total/eth_mortality_group.WHITE
    
    # If less common in asian/black/latino/white (considering total population) but they die more
    if ((asian_total != 0 and asian_dead != 0 and (asian_percentage_pop < (black_percentage_pop + latino_percentage_pop + white_percentage_pop)) and (asian_percentage_dead > (black_percentage_dead + latino_percentage_dead + white_percentage_dead))) or 
        (black_total != 0 and black_dead != 0 and (black_percentage_pop < (asian_percentage_pop + latino_percentage_pop + white_percentage_pop)) and (black_percentage_dead > (asian_percentage_dead + latino_percentage_dead + white_percentage_dead))) or
        (latino_total != 0 and latino_dead != 0 and (latino_percentage_pop < (black_percentage_pop + asian_percentage_pop + white_percentage_pop)) and (latino_percentage_dead > (black_percentage_dead + asian_percentage_dead + white_percentage_dead))) or
        (white_total != 0 and white_dead != 0 and (white_percentage_pop < (black_percentage_pop + latino_percentage_pop + asian_percentage_pop)) and (white_percentage_dead > (black_percentage_dead + latino_percentage_dead + asian_percentage_dead)))):
      
      # Values by ethnicity (for comparison)
      asian_t.append(asian_total), black_t.append(black_total), latino_t.append(latino_total), white_t.append(white_total)
      asian_d.append(asian_dead), black_d.append(black_dead), latino_d.append(latino_dead), white_d.append(white_dead)

      # Occurrences
      n_occ = asian_total + black_total + latino_total + white_total
      
      if ((asian_percentage_pop < (black_percentage_pop + latino_percentage_pop + white_percentage_pop)) and 
          (asian_percentage_dead > (black_percentage_dead + latino_percentage_dead + white_percentage_dead))):
        # asian
        x_occ = asian_total
        x_mort = asian_dead
        prob_occ = (n_occ - asian_total)/n_occ
        prob_mort = (black_percentage_dead + latino_percentage_dead + white_percentage_dead)/(n_occ - asian_total)
        ethnicity.append("ASIAN")
        icd9_code.append(rows.iloc[0]['ICD9_CODE'])
      elif ((black_percentage_pop < (asian_percentage_pop + latino_percentage_pop + white_percentage_pop)) and 
            (black_percentage_dead > (asian_percentage_dead + latino_percentage_dead + white_percentage_dead))): 
        # black
        x_occ = black_total
        x_mort = black_dead
        prob_occ = (n_occ - black_total)/n_occ
        prob_mort = (asian_percentage_dead + latino_percentage_dead + white_percentage_dead)/(n_occ - black_total)
        ethnicity.append("BLACK")
        icd9_code.append(rows.iloc[0]['ICD9_CODE'])
      elif ((latino_percentage_pop < (black_percentage_pop + asian_percentage_pop + white_percentage_pop)) and
            (latino_percentage_dead > (black_percentage_dead + asian_percentage_dead + white_percentage_dead))):
        # latino
        x_occ = latino_total
        x_mort = latino_dead
        prob_occ = (n_occ - latino_total)/n_occ
        prob_mort = (asian_percentage_dead + black_percentage_dead + white_percentage_dead)/(n_occ - latino_total)
        ethnicity.append("LATINO")
        icd9_code.append(rows.iloc[0]['ICD9_CODE'])      
      else:
        # white
        x_occ = white_total
        x_mort = white_dead
        prob_occ = (n_occ - white_total)/n_occ
        prob_mort = (asian_percentage_dead + black_percentage_dead + latino_percentage_dead)/(n_occ - white_total)
        ethnicity.append("WHITE")
        icd9_code.append(rows.iloc[0]['ICD9_CODE'])
         
      # p-value: binom test
      p_value_occ.append(binom_test(x = x_occ, n = n_occ, p = prob_occ, alternative = 'two-sided'))
      p_value_mort.append(binom_test(x = x_mort, n = x_occ, p = prob_mort, alternative = 'two-sided'))

In [19]:
dict = {'Ethnicity': ethnicity, 'ICD9': icd9_code, 'Occ p-value': p_value_occ, 
        'Mort p-value': p_value_mort, 'Asian Total': asian_t, 'Asian Dead': asian_d, 
        'Black Total': black_t, 'Black Dead': black_d, 'Latino Total': latino_t,
        'Latino Dead': latino_d, 'White Total': white_t, 'White Dead': white_d}  
    
binom_test_df = pd.DataFrame(dict)
binom_test_df = binom_test_df.merge(mimic_diagnoses_descriptions_df, left_on='ICD9', right_on='ICD9_CODE')
binom_test_df = binom_test_df.drop(['ROW_ID', 'ICD9_CODE', 'SHORT_TITLE'], axis=1)

pd.set_option("display.max_colwidth", 100)

# null hypothesis rejected: p-value < 0.05
binom_test_df[(binom_test_df['Occ p-value'] < 0.05) & (binom_test_df['Mort p-value'] < 0.05)]

Unnamed: 0,Ethnicity,ICD9,Occ p-value,Mort p-value,Asian Total,Asian Dead,Black Total,Black Dead,Latino Total,Latino Dead,White Total,White Dead,LONG_TITLE
0,WHITE,03812,3.714032e-59,4.475976e-57,0.0,0.0,98.0,3.0,0.0,0.0,243.0,26.0,Methicillin resistant Staphylococcus aureus septicemia
1,WHITE,0383,4.748198e-43,3.777449e-49,0.0,0.0,51.0,7.0,0.0,0.0,143.0,31.0,Septicemia due to anaerobes
2,BLACK,0388,1.522034e-239,5.955112e-19,0.0,0.0,42.0,8.0,0.0,0.0,314.0,51.0,Other specified septicemias
3,WHITE,04112,2.596110e-60,4.657196e-58,0.0,0.0,158.0,4.0,0.0,0.0,336.0,25.0,Methicillin resistant Staphylococcus aureus in conditions classified elsewhere and of unspecifie...
4,WHITE,04119,9.996822e-12,1.792265e-75,0.0,0.0,510.0,16.0,0.0,0.0,407.0,28.0,"Staphylococcus infection in conditions classified elsewhere and of unspecified site, other staph..."
5,BLACK,0416,2.142157e-172,1.607681e-26,0.0,0.0,86.0,10.0,0.0,0.0,360.0,19.0,Proteus (mirabilis) (morganii) infection in conditions classified elsewhere and of unspecified site
6,BLACK,0417,0.000000e+00,7.340922e-29,0.0,0.0,148.0,11.0,0.0,0.0,767.0,56.0,Pseudomonas infection in conditions classified elsewhere and of unspecified site
7,ASIAN,04186,0.000000e+00,1.741040e-08,26.0,3.0,231.0,4.0,0.0,0.0,192.0,12.0,Helicobacter pylori [H. pylori]
8,BLACK,05479,6.594264e-30,4.919465e-06,0.0,0.0,9.0,3.0,0.0,0.0,48.0,9.0,Herpes simplex with other specified complications
9,WHITE,07051,1.165464e-27,4.653193e-18,0.0,0.0,64.0,4.0,0.0,0.0,140.0,11.0,Acute hepatitis C without mention of hepatic coma
