In [0]:
# Imports
from IPython.display import Image
import pandas as pd
import numpy as np
from scipy.stats import binom_test

In [0]:
PATH = '/content/drive/My Drive/Master/UFMG/Dissertation/Exploration/'

In [3]:
# READ CSV - Method 2 (Drive)
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


## Read tables

### Patients table

In [0]:
mimic_patients_df = pd.read_csv(PATH + 'mimic-III/PATIENTS.csv')

### Admission table

In [0]:
mimic_admission_df = pd.read_csv(PATH + 'mimic-III/ADMISSIONS.csv')

#### Merge both tables and calculate true age

In [0]:
# Merge patient and admission tables
mimic_patients_admission_df = pd.merge(mimic_admission_df, mimic_patients_df, on = 'SUBJECT_ID' , how = 'inner')

In [0]:
# ADMITTIME and DOB objects to datetime
mimic_patients_admission_df["ADMITTIME"] = pd.to_datetime(mimic_patients_admission_df["ADMITTIME"], format='%Y%m%d %H:%M:%S')
mimic_patients_admission_df["DOB"] = pd.to_datetime(mimic_patients_admission_df["DOB"], format='%Y%m%d %H:%M:%S')

# True age
mimic_patients_admission_df["AGES"] = mimic_patients_admission_df["ADMITTIME"].sub(mimic_patients_admission_df["DOB"]).dt.days/365.242

# Adult patients only
mimic_patients_admission_df = mimic_patients_admission_df[(mimic_patients_admission_df.AGES >= 18)]

### ICD-9 Codes table

In [0]:
mimic_diagnoses_df = pd.read_csv(PATH + 'mimic-III/DIAGNOSES_ICD.csv')

In [0]:
mimic_patients_diagnoses_df = pd.merge(mimic_patients_admission_df, mimic_diagnoses_df, on = ['SUBJECT_ID', 'HADM_ID'], how = 'inner')

### ICD-9 Descriptions table

In [0]:
mimic_diagnoses_descriptions_df = pd.read_csv(PATH + 'mimic-III/D_ICD_DIAGNOSES.csv')

## Occurrence and Mortality

### By marital status

#### Creating new table with regex


In [0]:
mimic_mt_mortality_df = mimic_patients_diagnoses_df
mimic_mt_mortality_df = mimic_mt_mortality_df.replace(to_replace='^SEPARATED.*', value='DIVORCED', regex=True)
mimic_mt_mortality_df = mimic_mt_mortality_df.replace(to_replace='^LIFE PARTNER.*', value='MARRIED', regex=True)

#### Read table

In [0]:
mt_mortality_df = mimic_mt_mortality_df.groupby(['ICD9_CODE', 'MARITAL_STATUS', 'HOSPITAL_EXPIRE_FLAG']).size().unstack()
mt_mortality_df = mt_mortality_df.reset_index()
mt_mortality_df.columns.names = [None]
mt_mortality_df.columns = ['ICD9_CODE', 'MARITAL_STATUS', 'ALIVE', 'DEAD']
mt_mortality_df.insert(4, 'TOTAL', 'NULL')
mt_mortality_df = mt_mortality_df.fillna(0)

# Compute alive, dead and total
for index, row in mt_mortality_df.iterrows():
  mt_mortality_df.at[index, 'TOTAL'] = row.ALIVE + row.DEAD

#### Data integrity

In [13]:
# missing codes
print("Missing Codes: " + str(len(mimic_mt_mortality_df[mimic_mt_mortality_df['ICD9_CODE'].isnull()])))
print("Missing Status: " + str(len(mimic_mt_mortality_df[mimic_mt_mortality_df['MARITAL_STATUS'].isnull()])))
print("Missing both: " + str(len(mimic_mt_mortality_df[mimic_mt_mortality_df['ICD9_CODE'].isnull() & mimic_mt_mortality_df['MARITAL_STATUS'].isnull()])))
# total patients from original table
print(len(mimic_patients_diagnoses_df))
# total patients from new table
print(mt_mortality_df["TOTAL"].sum())

Missing Codes: 44
Missing Status: 22246
Missing both: 14
569130
546854.0


In [14]:
# All patients
mt_mortality_group = mt_mortality_df.groupby('MARITAL_STATUS')['TOTAL'].sum()
print(mt_mortality_group)

MARITAL_STATUS
DIVORCED              47023.0
MARRIED              272071.0
SINGLE               148403.0
UNKNOWN (DEFAULT)      3503.0
WIDOWED               75854.0
Name: TOTAL, dtype: float64


In [0]:
mt_mortality_df = mt_mortality_df[(mt_mortality_df.MARITAL_STATUS != 'UNKNOWN (DEFAULT)')]

In [0]:
# 1. Mortality significance: only +30 patients dead
mt_mortality_significance_df = mt_mortality_df.loc[mt_mortality_df.DEAD > 30].copy()

# 2. Remove ICD9 codes with only ONE marital status
for index, row in mt_mortality_significance_df.iterrows():
  rows = mt_mortality_significance_df.loc[mt_mortality_significance_df.ICD9_CODE == row.ICD9_CODE]
  if (len(rows) == 1):
    mt_mortality_significance_df.drop(rows.index, inplace=True)

#### Binomial test

In [0]:
# TODO:
# - REFACTORING!!!!! 

# 3. Binomial test
icd9_code, marital_status, p_value_occ, p_value_mort = ([] for i in range(4))
married_t, single_t, divorced_t, widowed_t = ([] for i in range(4))
married_d, single_d, divorced_d, widowed_d = ([] for i in range(4))
icd_pass = True
 
for index, row in mt_mortality_significance_df.iterrows():
  if row.ICD9_CODE not in icd9_code: # icd must be evaluate only once
    icd_pass = True

  if icd_pass:
    rows = mt_mortality_significance_df[mt_mortality_significance_df["ICD9_CODE"] == row.ICD9_CODE]
    icd_pass = False
    
    married_total, single_total, divorced_total, widowed_total = 0, 0, 0, 0
    married_dead, single_dead, divorced_dead, widowed_dead = 0, 0, 0, 0
    married_percentage_dead, single_percentage_dead, divorced_percentage_dead, widowed_percentage_dead = 0, 0, 0, 0
    
    if rows.loc[rows.MARITAL_STATUS == "MARRIED"]["TOTAL"].any():
      married_total = rows.loc[rows.MARITAL_STATUS == "MARRIED"]["TOTAL"].item()
      married_dead = rows.loc[rows.MARITAL_STATUS == "MARRIED"]["DEAD"].item()
      married_percentage_dead = married_dead / married_total
      
    if rows.loc[rows.MARITAL_STATUS == "SINGLE"]["TOTAL"].any():
      single_total = rows.loc[rows.MARITAL_STATUS == "SINGLE"]["TOTAL"].item()
      single_dead = rows.loc[rows.MARITAL_STATUS == "SINGLE"]["DEAD"].item()
      single_percentage_dead = single_dead / single_total
      
    if rows.loc[rows.MARITAL_STATUS == "DIVORCED"]["TOTAL"].any():
      divorced_total = rows.loc[rows.MARITAL_STATUS == "DIVORCED"]["TOTAL"].item()
      divorced_dead = rows.loc[rows.MARITAL_STATUS == "DIVORCED"]["DEAD"].item()
      divorced_percentage_dead = divorced_dead / divorced_total
      
    if rows.loc[rows.MARITAL_STATUS == "WIDOWED"]["TOTAL"].any():
      widowed_total = rows.loc[rows.MARITAL_STATUS == "WIDOWED"]["TOTAL"].item()
      widowed_dead = rows.loc[rows.MARITAL_STATUS == "WIDOWED"]["DEAD"].item()
      widowed_percentage_dead = widowed_dead / widowed_total
      
    married_percentage_pop = married_total/mt_mortality_group.MARRIED
    single_percentage_pop = single_total/mt_mortality_group.SINGLE
    divorced_percentage_pop = divorced_total/mt_mortality_group.DIVORCED
    widowed_percentage_pop = widowed_total/mt_mortality_group.WIDOWED
    
    # If less common in married/single/divorced/widowed (considering total population) but they die more
    if ((married_total != 0 and married_dead != 0 and (married_percentage_pop < (single_percentage_pop + divorced_percentage_pop + widowed_percentage_pop)) and (married_percentage_dead > (single_percentage_dead + divorced_percentage_dead + widowed_percentage_dead))) or 
        (single_total != 0 and single_dead != 0 and (single_percentage_pop < (married_percentage_pop + divorced_percentage_pop + widowed_percentage_pop)) and (single_percentage_dead > (married_percentage_dead + divorced_percentage_dead + widowed_percentage_dead))) or
        (divorced_total != 0 and divorced_dead != 0 and (divorced_percentage_pop < (single_percentage_pop + married_percentage_pop + widowed_percentage_pop)) and (divorced_percentage_dead > (single_percentage_dead + married_percentage_dead + widowed_percentage_dead))) or
        (widowed_total != 0 and widowed_dead != 0 and (widowed_percentage_pop < (single_percentage_pop + divorced_percentage_pop + married_percentage_pop)) and (widowed_percentage_dead > (single_percentage_dead + divorced_percentage_dead + married_percentage_dead)))):
      
      # Values by marital_status (for comparison)
      married_t.append(married_total), single_t.append(single_total), divorced_t.append(divorced_total), widowed_t.append(widowed_total)
      married_d.append(married_dead), single_d.append(single_dead), divorced_d.append(divorced_dead), widowed_d.append(widowed_dead)

      # Occurrences
      n_occ = married_total + single_total + divorced_total + widowed_total
      
      if ((married_percentage_pop < (single_percentage_pop + divorced_percentage_pop + widowed_percentage_pop)) and 
          (married_percentage_dead > (single_percentage_dead + divorced_percentage_dead + widowed_percentage_dead))):
        # married
        x_occ = married_total
        x_mort = married_dead
        prob_occ = (n_occ - married_total)/n_occ
        prob_mort = (single_percentage_dead + divorced_percentage_dead + widowed_percentage_dead)/(n_occ - married_total)
        marital_status.append("MARRIED")
        icd9_code.append(rows.iloc[0]['ICD9_CODE'])
      elif ((single_percentage_pop < (married_percentage_pop + divorced_percentage_pop + widowed_percentage_pop)) and 
            (single_percentage_dead > (married_percentage_dead + divorced_percentage_dead + widowed_percentage_dead))): 
        # single
        x_occ = single_total
        x_mort = single_dead
        prob_occ = (n_occ - single_total)/n_occ
        prob_mort = (married_percentage_dead + divorced_percentage_dead + widowed_percentage_dead)/(n_occ - single_total)
        marital_status.append("SINGLE")
        icd9_code.append(rows.iloc[0]['ICD9_CODE'])
      elif ((divorced_percentage_pop < (single_percentage_pop + married_percentage_pop + widowed_percentage_pop)) and
            (divorced_percentage_dead > (single_percentage_dead + married_percentage_dead + widowed_percentage_dead))):
        # divorced
        x_occ = divorced_total
        x_mort = divorced_dead
        prob_occ = (n_occ - divorced_total)/n_occ
        prob_mort = (married_percentage_dead + single_percentage_dead + widowed_percentage_dead)/(n_occ - divorced_total)
        marital_status.append("DIVORCED")
        icd9_code.append(rows.iloc[0]['ICD9_CODE'])      
      else:
        # widowed
        x_occ = widowed_total
        x_mort = widowed_dead
        prob_occ = (n_occ - widowed_total)/n_occ
        prob_mort = (married_percentage_dead + single_percentage_dead + divorced_percentage_dead)/(n_occ - widowed_total)
        marital_status.append("WIDOWED")
        icd9_code.append(rows.iloc[0]['ICD9_CODE'])
         
      # p-value: binom test
      p_value_occ.append(binom_test(x = x_occ, n = n_occ, p = prob_occ, alternative = 'two-sided'))
      p_value_mort.append(binom_test(x = x_mort, n = x_occ, p = prob_mort, alternative = 'two-sided'))

In [18]:
dict = {'MaritalStatus': marital_status, 'ICD9': icd9_code, 'Occ p-value': p_value_occ, 
        'Mort p-value': p_value_mort, 'Married Total': married_t, 'Married Dead': married_d, 
        'Single Total': single_t, 'Single Dead': single_d, 'Divorced Total': divorced_t,
        'Divorced Dead': divorced_d, 'Widowed Total': widowed_t, 'Widowed Dead': widowed_d}  
    
binom_test_df = pd.DataFrame(dict)
binom_test_df = binom_test_df.merge(mimic_diagnoses_descriptions_df, left_on='ICD9', right_on='ICD9_CODE')
binom_test_df = binom_test_df.drop(['ROW_ID', 'ICD9_CODE', 'SHORT_TITLE'], axis=1)

pd.set_option("display.max_colwidth", 100)

# null hypothesis rejected: p-value < 0.05
binom_test_df[(binom_test_df['Occ p-value'] < 0.05) & (binom_test_df['Mort p-value'] < 0.05)]

Unnamed: 0,MaritalStatus,ICD9,Occ p-value,Mort p-value,Married Total,Married Dead,Single Total,Single Dead,Divorced Total,Divorced Dead,Widowed Total,Widowed Dead,LONG_TITLE
0,MARRIED,03811,1.210405e-21,1.250432e-105,217.0,59.0,129.0,34.0,0,0,0.0,0.0,Methicillin susceptible Staphylococcus aureus septicemia
1,MARRIED,07054,5.618883000000001e-54,6.740641e-125,353.0,48.0,588.0,51.0,0,0,0.0,0.0,Chronic hepatitis C without mention of hepatic coma
2,SINGLE,1970,4.245252e-135,3.4032969999999997e-116,469.0,112.0,169.0,48.0,0,0,0.0,0.0,Secondary malignant neoplasm of lung
3,SINGLE,1977,3.133007e-166,2.425116e-118,484.0,138.0,154.0,49.0,0,0,0.0,0.0,"Malignant neoplasm of liver, secondary"
4,MARRIED,2639,6.703985e-45,5.291523e-201,531.0,89.0,327.0,42.0,0,0,0.0,0.0,Unspecified protein-calorie malnutrition
5,MARRIED,2763,1.482212e-27,2.86885e-130,422.0,60.0,279.0,32.0,0,0,0.0,0.0,Alkalosis
6,MARRIED,2764,5.430634e-06,3.1801629999999998e-136,209.0,66.0,165.0,35.0,0,0,0.0,0.0,Mixed acid-base balance disorder
7,MARRIED,2768,3.132928e-08,5.813547e-163,550.0,65.0,462.0,33.0,0,0,0.0,0.0,Hypopotassemia
8,MARRIED,2867,2.223608e-05,2.140089e-126,197.0,66.0,157.0,48.0,0,0,0.0,0.0,Acquired coagulation factor deficiency
9,MARRIED,2948,0.0002395058,8.049599e-101,279.0,48.0,0.0,0.0,0,0,237.0,34.0,Other persistent mental disorders due to conditions classified elsewhere


In [19]:
print(len(binom_test_df[(binom_test_df['Occ p-value'] < 0.05) & (binom_test_df['Mort p-value'] < 0.05)]))

35


In [20]:
# Only cases with transplanted organs
pd.set_option("display.max_colwidth", 500)
binom_test_df.loc[binom_test_df.iloc[:, -1].str.contains(r'transplant')]

Unnamed: 0,MaritalStatus,ICD9,Occ p-value,Mort p-value,Married Total,Married Dead,Single Total,Single Dead,Divorced Total,Divorced Dead,Widowed Total,Widowed Dead,LONG_TITLE


In [21]:
# Only cases with heart problems - CODE ?
binom_test_df.loc[binom_test_df.iloc[:, -1].str.contains(r'heart')]

Unnamed: 0,MaritalStatus,ICD9,Occ p-value,Mort p-value,Married Total,Married Dead,Single Total,Single Dead,Divorced Total,Divorced Dead,Widowed Total,Widowed Dead,LONG_TITLE
