In [0]:
# Imports
from IPython.display import Image
import pandas as pd
import numpy as np
from scipy.stats import binom_test

In [0]:
PATH = '/content/drive/My Drive/Master/UFMG/Dissertation/Exploration/'

In [76]:
# READ CSV - Method 2 (Drive)
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Read tables

In [0]:
pd.set_option("display.max_colwidth", 100)

### Patients table

In [0]:
mimic_patients_df = pd.read_csv(PATH + 'mimic-III/PATIENTS.csv')

### Admission table

In [0]:
mimic_admission_df = pd.read_csv(PATH + 'mimic-III/ADMISSIONS.csv')

#### Merge both tables and calculate true age

In [0]:
# Merge patient and admission tables
mimic_patients_admission_df = pd.merge(mimic_admission_df, mimic_patients_df, on = 'SUBJECT_ID' , how = 'inner')

In [0]:
# ADMITTIME and DOB objects to datetime
mimic_patients_admission_df["ADMITTIME"] = pd.to_datetime(mimic_patients_admission_df["ADMITTIME"], format='%Y%m%d %H:%M:%S')
mimic_patients_admission_df["DOB"] = pd.to_datetime(mimic_patients_admission_df["DOB"], format='%Y%m%d %H:%M:%S')

# True age
mimic_patients_admission_df["AGES"] = mimic_patients_admission_df["ADMITTIME"].sub(mimic_patients_admission_df["DOB"]).dt.days/365.242

# Adult patients only
mimic_patients_admission_df = mimic_patients_admission_df[(mimic_patients_admission_df.AGES >= 18)]

### ICD-9 Codes table

In [0]:
mimic_diagnoses_df = pd.read_csv(PATH + 'mimic-III/DIAGNOSES_ICD.csv')

In [0]:
mimic_patients_diagnoses_df = pd.merge(mimic_patients_admission_df, mimic_diagnoses_df, on = 'SUBJECT_ID' , how = 'inner')

### ICD-9 Descriptions table

In [0]:
mimic_diagnoses_descriptions_df = pd.read_csv(PATH + 'mimic-III/D_ICD_DIAGNOSES.csv')

## Occurrence and Mortality

### By race

#### Creating new table with regex


In [0]:
# race_mortality_df = mimic_patients_diagnoses_df

# race_mortality_df = race_mortality_df.replace(to_replace='^BLACK.*', value='BLACK', regex=True)
# race_mortality_df = race_mortality_df.replace(to_replace='^WHITE.*', value='WHITE', regex=True)
# race_mortality_df = race_mortality_df.replace(to_replace='^ASIAN.*', value='ASIAN', regex=True)
# race_mortality_df = race_mortality_df.replace(to_replace='^LATINO.*', value='LATINO', regex=True)
# race_mortality_df = race_mortality_df.replace(to_replace='^HISPANIC.*', value='LATINO', regex=True)
# race_mortality_df = race_mortality_df.replace(to_replace='^SOUTH AMERICAN.*', value='LATINO', regex=True)

# race_mortality_df = race_mortality_df.replace(to_replace='^CARIBBEAN ISLAND.*', value='OTHER', regex=True)
# race_mortality_df = race_mortality_df.replace(to_replace='^AMERICAN INDIAN.*', value='OTHER', regex=True)
# race_mortality_df = race_mortality_df.replace(to_replace='^MIDDLE EASTERN.*', value='OTHER', regex=True)
# race_mortality_df = race_mortality_df.replace(to_replace='^MULTI RACE ETHNICITY.*', value='OTHER', regex=True)
# race_mortality_df = race_mortality_df.replace(to_replace='^NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER.*', value='OTHER', regex=True)
# race_mortality_df = race_mortality_df.replace(to_replace='^PORTUGUESE.*', value='OTHER', regex=True)

# race_mortality_df = race_mortality_df.replace(to_replace='^UNABLE TO OBTAIN.*', value='UNKNOWN', regex=True)
# race_mortality_df = race_mortality_df.replace(to_replace='^UNKNOWN.*', value='UNKNOWN', regex=True)
# race_mortality_df = race_mortality_df.replace(to_replace='^PATIENT DECLINED TO ANSWER.*', value='UNKNOWN', regex=True)

# race_mortality_df.to_csv('RACE_MORTALITY.csv', index = None, header=True)
# !cp RACE_MORTALITY.csv '/content/drive/My Drive/Mestrado/UFMG/Dissertação/Análises Exploratórias/mimic-III/'

#### Read table

In [0]:
mimic_race_mortality_df = pd.read_csv(PATH + 'mimic-III/RACE_MORTALITY.csv')

In [0]:
race_mortality_df = mimic_race_mortality_df.groupby(['ICD9_CODE', 'ETHNICITY', 'HOSPITAL_EXPIRE_FLAG']).size().unstack()
race_mortality_df = race_mortality_df.reset_index()
race_mortality_df.columns.names = [None]
race_mortality_df.columns = ['ICD9_CODE', 'RACE', 'ALIVE', 'DEAD']
race_mortality_df.insert(4, 'TOTAL', 'NULL')


# Compute alive, dead and total
for index, row in race_mortality_df.iterrows():
  race_mortality_df.at[index, 'TOTAL'] = row.ALIVE + row.DEAD
  

race_mortality_group = race_mortality_df.groupby('RACE')['DEAD'].count()

In [0]:
race_mortality_df = race_mortality_df[(race_mortality_df.RACE != 'OTHER') & (race_mortality_df.RACE != 'UNKNOWN')]

In [89]:
# 1. Mortality significance: only +2 patients dead
race_mortality_significance_df = race_mortality_df[race_mortality_df['DEAD'] > 2]


# 2. Remove ICD9 codes with only ONE race
for index, row in race_mortality_significance_df.iterrows():
  rows = race_mortality_significance_df[race_mortality_significance_df["ICD9_CODE"] == row.ICD9_CODE]
  if (len(rows) == 1):
    race_mortality_significance_df.drop(rows.index, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [0]:
# 3. Binomial test

icd9_code, race, p_value_occ, p_value_mort = ([] for i in range(4))
asian_t, black_t, latino_t, white_t = ([] for i in range(4))
asian_d, black_d, latino_d, white_d = ([] for i in range(4))

icd_pass = False
 
for index, row in race_mortality_significance_df.iterrows():
  if not icd_pass:
    rows = race_mortality_significance_df[race_mortality_significance_df["ICD9_CODE"] == row.ICD9_CODE]
    icd_pass = True
    
    asian_total, black_total, latino_total, white_total = 0, 0, 0, 0
    asian_dead, black_dead, latino_dead, white_dead = 0, 0, 0, 0
    asian_percentage_dead, black_percentage_dead, latino_percentage_dead, white_percentage_dead = 0, 0, 0, 0
    
    if rows.loc[rows.RACE == "ASIAN"]["TOTAL"].any():
      asian_total = rows.loc[rows.RACE == "ASIAN"]["TOTAL"].item()
      asian_dead = rows.loc[rows.RACE == "ASIAN"]["DEAD"].item()
      asian_percentage_dead = asian_dead / asian_total
      
    if rows.loc[rows.RACE == "BLACK"]["TOTAL"].any():
      black_total = rows.loc[rows.RACE == "BLACK"]["TOTAL"].item()
      black_dead = rows.loc[rows.RACE == "BLACK"]["DEAD"].item()
      black_percentage_dead = black_dead / black_total
      
    if rows.loc[rows.RACE == "LATINO"]["TOTAL"].any():
      latino_total = rows.loc[rows.RACE == "LATINO"]["TOTAL"].item()
      latino_dead = rows.loc[rows.RACE == "LATINO"]["DEAD"].item()
      latino_percentage_dead = latino_dead / latino_total
      
    if rows.loc[rows.RACE == "WHITE"]["TOTAL"].any():
      white_total = rows.loc[rows.RACE == "WHITE"]["TOTAL"].item()
      white_dead = rows.loc[rows.RACE == "WHITE"]["DEAD"].item()
      white_percentage_dead = white_dead / white_total
      
    asian_percentage_pop = asian_total/race_mortality_group.ASIAN
    black_percentage_pop = black_total/race_mortality_group.BLACK
    latino_percentage_pop = latino_total/race_mortality_group.LATINO
    white_percentage_pop = white_total/race_mortality_group.WHITE
    
    # If less common in asian/black/latino/white (considering total population) but they die more
    if ((asian_total != 0 and asian_dead != 0 and (asian_percentage_pop < (black_percentage_pop + latino_percentage_pop + white_percentage_pop)) and (asian_percentage_dead > (black_percentage_dead + latino_percentage_dead + white_percentage_dead))) or 
        (black_total != 0 and black_dead != 0 and (black_percentage_pop < (asian_percentage_pop + latino_percentage_pop + white_percentage_pop)) and (black_percentage_dead > (asian_percentage_dead + latino_percentage_dead + white_percentage_dead))) or
        (latino_total != 0 and latino_dead != 0 and (latino_percentage_pop < (black_percentage_pop + asian_percentage_pop + white_percentage_pop)) and (latino_percentage_dead > (black_percentage_dead + asian_percentage_dead + white_percentage_dead))) or
        (white_total != 0 and white_dead != 0 and (white_percentage_pop < (black_percentage_pop + latino_percentage_pop + asian_percentage_pop)) and (white_percentage_dead > (black_percentage_dead + latino_percentage_dead + asian_percentage_dead)))):
      
      # Values by race (for comparison)
      asian_t.append(asian_total), black_t.append(black_total), latino_t.append(latino_total), white_t.append(white_total)
      asian_d.append(asian_dead), black_d.append(black_dead), latino_d.append(latino_dead), white_d.append(white_dead)

      # Occurrences
      n_occ = asian_total + black_total + latino_total + white_total
      
      if ((asian_percentage_pop < (black_percentage_pop + latino_percentage_pop + white_percentage_pop)) and 
          (asian_percentage_dead > (black_percentage_dead + latino_percentage_dead + white_percentage_dead))):
        x_occ = asian_total
        x_mort = asian_dead
        prob_occ = asian_total/n_occ
        prob_mort = asian_percentage_dead
        race.append("ASIAN")
        icd9_code.append(rows.iloc[0]['ICD9_CODE'])
      elif (): # black
      elif (): # latino
      else: # white
         
      # p-value: binom test
      p_value_occ.append(binom_test(x = x_occ, n = n_occ, p = prob_occ, alternative = 'two-sided'))
      p_value_mort.append(binom_test(x = x_mort, n = x_occ, p = prob_mort, alternative = 'two-sided'))
  else:
    icd_pass = False

In [0]:
# dict = {'Gender': gender, 'ICD9': icd9_code, 'Occ p-value': p_value_occ, 
#         'Mort p-value': p_value_mort, 'Alive F': alive_f, 'Dead F': dead_f, 
#         'Total F': total_f, 'Alive M': alive_m, 'Dead M': dead_m, 'Total M': total_m,}  
    
# binom_test_df = pd.DataFrame(dict)
# binom_test_df = binom_test_df.merge(mimic_diagnoses_descriptions_df, left_on='ICD9', right_on='ICD9_CODE')
# binom_test_df = binom_test_df.drop(['ROW_ID', 'ICD9_CODE', 'SHORT_TITLE'], axis=1)

# # 5%
# binom_test_df[(binom_test_df['Occ p-value'] < 0.05) & (binom_test_df['Mort p-value'] < 0.05)]