In [0]:
# Imports
from IPython.display import Image
import pandas as pd
import numpy as np
from scipy.stats import binom_test

In [0]:
# Root folder of the project on the mounted Google Drive; every CSV below is read relative to it
PATH = '/content/drive/My Drive/Master/UFMG/Dissertation/Exploration/'

In [0]:
# READ CSV - Method 2 (Drive): mount Google Drive at /content/drive so the files under PATH are reachable
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Read tables

In [0]:
# Show up to 100 characters per cell when a DataFrame is displayed (default truncation is shorter)
pd.set_option("display.max_colwidth", 100)

### Patients table

In [0]:
# Load the PATIENTS table of the MIMIC-III export stored under PATH
mimic_patients_df = pd.read_csv(f"{PATH}mimic-III/PATIENTS.csv")

### Admission table

In [0]:
# Load the ADMISSIONS table of the MIMIC-III export stored under PATH
mimic_admission_df = pd.read_csv(f"{PATH}mimic-III/ADMISSIONS.csv")

#### Merge both tables and calculate true age

In [0]:
# Join admissions with patients on SUBJECT_ID; the inner join keeps only subjects present in both tables
mimic_patients_admission_df = mimic_admission_df.merge(mimic_patients_df, how='inner', on='SUBJECT_ID')

In [0]:
# Parse the admission timestamp and the date of birth into datetimes.
# NOTE(review): the format string has no separators between year, month and day;
# confirm the CSV really stores timestamps that way (newer pandas versions are
# presumably strict about format mismatches) - verify.
for date_col in ("ADMITTIME", "DOB"):
  mimic_patients_admission_df[date_col] = pd.to_datetime(
      mimic_patients_admission_df[date_col], format='%Y%m%d %H:%M:%S')

# True age at admission, in years (whole days elapsed divided by the mean year length)
# NOTE(review): nanosecond timedeltas cannot represent very large spans (roughly three
# centuries); if any DOB lies that far before ADMITTIME the age may be invalid - confirm.
time_since_birth = mimic_patients_admission_df["ADMITTIME"] - mimic_patients_admission_df["DOB"]
mimic_patients_admission_df["AGES"] = time_since_birth.dt.days/365.242

# Adult patients only (age of 18 or more at admission)
is_adult = mimic_patients_admission_df["AGES"] >= 18
mimic_patients_admission_df = mimic_patients_admission_df[is_adult]

### ICD-9 Codes table

In [0]:
# Load the DIAGNOSES_ICD table of the MIMIC-III export stored under PATH
mimic_diagnoses_df = pd.read_csv(f"{PATH}mimic-III/DIAGNOSES_ICD.csv")

In [0]:
# Inner-join the adult admissions with the diagnosis rows of the same patient.
# NOTE(review): the only join key is SUBJECT_ID, so every admission of a patient is
# paired with every diagnosis row of that patient. If DIAGNOSES_ICD also carries the
# admission id (presumably HADM_ID), joining on both keys would tie each diagnosis to
# the stay it was recorded in - confirm before trusting the per-code counts.
mimic_patients_diagnoses_df = mimic_patients_admission_df.merge(mimic_diagnoses_df, how='inner', on='SUBJECT_ID')

### ICD-9 Descriptions table

In [0]:
# Load the D_ICD_DIAGNOSES table (code descriptions) of the MIMIC-III export stored under PATH
mimic_diagnoses_descriptions_df = pd.read_csv(f"{PATH}mimic-III/D_ICD_DIAGNOSES.csv")

## Occurrence and Mortality

### By gender

In [0]:
# New table: one row per (ICD9_CODE, GENDER) with the number of rows that ended
# alive / dead according to HOSPITAL_EXPIRE_FLAG.
# fill_value=0 stores a real zero (instead of NaN) when an outcome never occurs for a
# code/gender pair, so the counts stay integers and TOTAL is always defined.
gender_mortality_df = (
    mimic_patients_diagnoses_df
    .groupby(['ICD9_CODE', 'GENDER', 'HOSPITAL_EXPIRE_FLAG'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)
# The two flag columns are renamed by position: first flag value -> ALIVE, second -> DEAD
gender_mortality_df.columns = ['ICD9_CODE', 'GENDER', 'ALIVE', 'DEAD']


# Compute total (vectorised; appended as the last column)
gender_mortality_df['TOTAL'] = gender_mortality_df['ALIVE'] + gender_mortality_df['DEAD']


# 1. Mortality significance: only +30 patients dead
gender_mortality_df = gender_mortality_df[gender_mortality_df['DEAD'] > 30]


# 2. Remove rows without pair (F and M)
# After step 1 a code has one row per remaining gender, so a code whose ICD9_CODE is
# not duplicated has lost its counterpart and is dropped. Building a new frame avoids
# dropping rows from the frame while it is being iterated.
has_gender_pair = gender_mortality_df.duplicated(subset='ICD9_CODE', keep=False)
gender_mortality_df = gender_mortality_df[has_gender_pair]

In [0]:
# 3. Binomial test
#
# For every ICD-9 code that has a row for both genders, keep the codes where one gender
# has fewer occurrences (TOTAL) but more deaths (DEAD) than the other, and run two
# two-sided binomial tests for that gender:
#   * occurrence: its TOTAL out of both genders' TOTAL, against the other gender's share
#   * mortality:  its DEAD out of its own TOTAL, against the other gender's death rate
# Results are accumulated in the parallel lists below (one entry per selected code).
#
# NOTE(review): scipy.stats.binom_test is deprecated in recent SciPy releases in favour
# of scipy.stats.binomtest(...).pvalue - confirm against the installed SciPy version.

icd9_code, gender, p_value_occ, p_value_mort = ([] for _ in range(4))
alive_m, dead_m, total_m, alive_f, dead_f, total_f = ([] for _ in range(6))

# Group by code instead of re-filtering the whole frame for every row; sort=False keeps
# the codes in the order in which they appear in gender_mortality_df.
for _, rows_gender in gender_mortality_df.groupby('ICD9_CODE', sort=False):
  # The comparison needs exactly one row per gender; skip anything else rather than
  # mis-pairing rows.
  if len(rows_gender) != 2:
    continue

  # Make the row order explicit: sorted by GENDER, the first row is treated as women
  # and the second as men (assumes the labels sort as 'F' < 'M').
  rows_gender = rows_gender.sort_values('GENDER')
  women, men = rows_gender.iloc[0], rows_gender.iloc[1]

  # If less common in women/men but they die more
  women_rarer_but_deadlier = (women["TOTAL"] < men["TOTAL"]) and (women["DEAD"] > men["DEAD"])
  men_rarer_but_deadlier = (men["TOTAL"] < women["TOTAL"]) and (men["DEAD"] > women["DEAD"])
  if not (women_rarer_but_deadlier or men_rarer_but_deadlier):
    continue

  n_occ = women["TOTAL"] + men["TOTAL"]

  alive_f.append(women["ALIVE"])
  dead_f.append(women["DEAD"])
  total_f.append(women["TOTAL"])

  alive_m.append(men["ALIVE"])
  dead_m.append(men["DEAD"])
  total_m.append(men["TOTAL"])

  # `tested` is the rarer-but-deadlier gender, `reference` supplies the null probabilities
  if women_rarer_but_deadlier:
    tested, reference, label = women, men, 'F'
  else:
    tested, reference, label = men, women, 'M'

  x_occ = tested["TOTAL"]
  x_mort = tested["DEAD"]
  gender.append(label)
  icd9_code.append(tested['ICD9_CODE'])
  prob_occ = reference["TOTAL"]/n_occ
  prob_mort = reference["DEAD"]/reference["TOTAL"]

  # p-value: binom test
  p_value_occ.append(binom_test(x = x_occ, n = n_occ, p = prob_occ, alternative = 'two-sided'))
  p_value_mort.append(binom_test(x = x_mort, n = x_occ, p = prob_mort, alternative = 'two-sided'))

In [0]:
# Collect the per-code results into a table (one column per accumulated list).
# The mapping is deliberately not named `dict`, which would shadow the builtin for
# every later cell of the notebook.
binom_test_columns = {'Gender': gender, 'ICD9': icd9_code, 'Occ p-value': p_value_occ,
                      'Mort p-value': p_value_mort, 'Alive F': alive_f, 'Dead F': dead_f,
                      'Total F': total_f, 'Alive M': alive_m, 'Dead M': dead_m, 'Total M': total_m}

binom_test_df = pd.DataFrame(binom_test_columns)

# Attach the code description; the inner merge keeps only codes found in the descriptions table
binom_test_df = binom_test_df.merge(mimic_diagnoses_descriptions_df, left_on='ICD9', right_on='ICD9_CODE')
binom_test_df = binom_test_df.drop(['ROW_ID', 'ICD9_CODE', 'SHORT_TITLE'], axis=1)

# 5%: show the codes where both the occurrence and the mortality test are significant
binom_test_df[(binom_test_df['Occ p-value'] < 0.05) & (binom_test_df['Mort p-value'] < 0.05)]

Unnamed: 0,Gender,ICD9,Occ p-value,Mort p-value,Alive F,Dead F,Total F,Alive M,Dead M,Total M,LONG_TITLE
1,M,03842,2.84614e-05,0.0001703138,447.0,34.0,481.0,367.0,51.0,418.0,Septicemia due to escherichia coli [E. coli]
4,M,2753,1.08417e-07,0.009097393,609.0,36.0,645.0,507.0,46.0,553.0,Disorders of phosphorus metabolism
8,M,27801,3.262044e-85,3.505908e-12,1621.0,61.0,1682.0,1070.0,93.0,1163.0,Morbid obesity
9,F,2809,1.589314e-05,0.001395984,1390.0,62.0,1452.0,1527.0,44.0,1571.0,"Iron deficiency anemia, unspecified"
10,M,28529,1.829638e-101,3.965927e-10,1879.0,77.0,1956.0,1242.0,103.0,1345.0,Anemia of other chronic disease
12,F,34290,6.354911e-09,0.001280155,236.0,50.0,286.0,320.0,40.0,360.0,"Hemiplegia, unspecified, affecting unspecified side"
13,F,34590,0.0008781956,0.00683055,1298.0,75.0,1373.0,1404.0,58.0,1462.0,"Epilepsy, unspecified, without mention of intractable epilepsy"
15,F,42821,2.686172e-41,2.74893e-05,239.0,43.0,282.0,428.0,36.0,464.0,Acute systolic heart failure
17,F,49121,5.458153e-86,5.928849e-05,1343.0,118.0,1461.0,1927.0,113.0,2040.0,Obstructive chronic bronchitis with (acute) exacerbation
18,M,49320,4.372009e-149,1.99463e-06,1027.0,49.0,1076.0,507.0,52.0,559.0,"Chronic obstructive asthma, unspecified"
