In [0]:
# Imports
from IPython.display import Image
import pandas as pd
import numpy as np
from scipy.stats import binom_test

In [0]:
# Root folder of the project data on the mounted Google Drive; the read_csv
# calls in this notebook build their file paths by appending to it.
PATH = '/content/drive/My Drive/Master/UFMG/Dissertation/Exploration/'

In [3]:
# READ CSV - Method 2 (Drive)
# Mount Google Drive at /content/drive (the prefix PATH starts with) so the
# CSV files can be read from it. google.colab is only importable on Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Read tables

In [0]:
# Show up to 100 characters per cell when rendering DataFrames.
pd.options.display.max_colwidth = 100

### Patients table

In [0]:
# Load the PATIENTS table from the project data folder.
patients_csv_path = PATH + 'mimic-III/PATIENTS.csv'
mimic_patients_df = pd.read_csv(patients_csv_path)

### Admission table

In [0]:
# Load the ADMISSIONS table from the project data folder.
admissions_csv_path = PATH + 'mimic-III/ADMISSIONS.csv'
mimic_admission_df = pd.read_csv(admissions_csv_path)

#### Merge both tables and calculate true age

In [0]:
# Attach patient attributes to every admission row (inner join on SUBJECT_ID).
mimic_patients_admission_df = mimic_admission_df.merge(
    mimic_patients_df,
    on='SUBJECT_ID',
    how='inner',
)

In [0]:
# ADMITTIME and DOB objects to datetime.
# The pattern spells out the dash-separated date layout
# ('YYYY-MM-DD HH:MM:SS'). NOTE(review): this assumes the standard MIMIC-III
# CSV export format -- confirm against the files. A pattern without date
# separators ('%Y%m%d ...') is rejected for such values by the strict format
# matching used in pandas >= 2.0.
TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S'
for timestamp_column in ("ADMITTIME", "DOB"):
  mimic_patients_admission_df[timestamp_column] = pd.to_datetime(
      mimic_patients_admission_df[timestamp_column], format=TIMESTAMP_FORMAT
  )

# True age at admission, in years (elapsed days / 365.242).
# NOTE(review): if DOB values lie far in the past (e.g. de-identification
# shifts for very old patients), the datetime subtraction may exceed the
# ~292-year range of timedelta64[ns] -- confirm on the full dataset.
mimic_patients_admission_df["AGES"] = (
    mimic_patients_admission_df["ADMITTIME"]
    .sub(mimic_patients_admission_df["DOB"])
    .dt.days / 365.242
)

# Adult patients only. Take a copy so that re-running this cell assigns the
# columns above on an independent frame rather than on a filtered slice.
mimic_patients_admission_df = mimic_patients_admission_df[
    mimic_patients_admission_df.AGES >= 18
].copy()

### ICD-9 Codes table

In [0]:
# Load the DIAGNOSES_ICD table from the project data folder.
diagnoses_csv_path = PATH + 'mimic-III/DIAGNOSES_ICD.csv'
mimic_diagnoses_df = pd.read_csv(diagnoses_csv_path)

In [0]:
# Attach the diagnosis rows to each adult admission; rows are matched on both
# the patient (SUBJECT_ID) and the admission (HADM_ID).
diagnosis_join_keys = ['SUBJECT_ID', 'HADM_ID']
mimic_patients_diagnoses_df = mimic_patients_admission_df.merge(
    mimic_diagnoses_df,
    on=diagnosis_join_keys,
    how='inner',
)

### Patients with multiple rows for the same ICD-9 code

In [0]:
# mimic_patients_unique_diagnoses_df = mimic_patients_diagnoses_df
# mimic_patients_unique_diagnoses_df.drop_duplicates(["SUBJECT_ID", "ICD9_CODE"], inplace=True)

In [0]:
# Patient with duplicated icd9
# mimic_patients_unique_diagnoses_df.loc[(mimic_patients_unique_diagnoses_df["SUBJECT_ID"] == 5882) & (mimic_patients_unique_diagnoses_df["ICD9_CODE"] == "99682")]

### ICD-9 Descriptions table

In [0]:
# Load the D_ICD_DIAGNOSES table (ICD-9 code descriptions) from the project data folder.
descriptions_csv_path = PATH + 'mimic-III/D_ICD_DIAGNOSES.csv'
mimic_diagnoses_descriptions_df = pd.read_csv(descriptions_csv_path)

## Occurrence and Mortality

### By gender

In [0]:
# New table: number of diagnosis rows per (ICD-9 code, gender), with one
# column per HOSPITAL_EXPIRE_FLAG value.
gender_mortality_df = (
    mimic_patients_diagnoses_df
    .groupby(['ICD9_CODE', 'GENDER', 'HOSPITAL_EXPIRE_FLAG'])
    .size()
    .unstack()
)

gender_mortality_df = gender_mortality_df.reset_index()
gender_mortality_df.columns.names = [None]
# Assumes the flag takes exactly two values, ordered survived / died.
gender_mortality_df.columns = ['ICD9_CODE', 'GENDER', 'ALIVE', 'DEAD']
# Combinations that never occurred come out of unstack() as NaN.
gender_mortality_df = gender_mortality_df.fillna(0)

# Compute the total as a numeric column in one vectorized step (no
# placeholder values, no row-by-row loop).
gender_mortality_df['TOTAL'] = gender_mortality_df['ALIVE'] + gender_mortality_df['DEAD']

# 1. Mortality significance: keep only rows with more than MIN_DEATHS deaths
MIN_DEATHS = 30
gender_mortality_df = gender_mortality_df.loc[gender_mortality_df.DEAD > MIN_DEATHS].copy()

# 2. Remove rows without pair (F and M): after the filter above, a code that
# appears on a single row has lost its counterpart for the other gender.
has_gender_pair = gender_mortality_df.duplicated('ICD9_CODE', keep=False)
gender_mortality_df = gender_mortality_df.loc[has_gender_pair].copy()

#### Binomial test

In [0]:
# 3. Binomial test
def _rarer_but_deadlier_tests(pairs_df):
  """Run the occurrence and mortality binomial tests per ICD-9 code.

  A code is tested only when one gender has fewer rows (TOTAL) than the other
  yet more deaths (DEAD). For such a code the flagged gender is tested against
  the other gender's proportions:

  * occurrence: x = flagged TOTAL, n = TOTAL of both genders,
    p = other gender's share of the combined TOTAL;
  * mortality: x = flagged DEAD, n = flagged TOTAL,
    p = other gender's DEAD / TOTAL.

  Parameters
  ----------
  pairs_df : pandas.DataFrame
      Frame with columns ICD9_CODE, GENDER ('F' / 'M'), ALIVE, DEAD and TOTAL,
      holding one row per (code, gender).

  Returns
  -------
  list of dict
      One record per tested code, in order of first appearance in `pairs_df`.
  """
  records = []
  # Group by code instead of relying on rows strictly alternating F, M.
  for _, code_rows in pairs_df.groupby("ICD9_CODE", sort=False):
    by_gender = code_rows.set_index("GENDER")
    if "F" not in by_gender.index or "M" not in by_gender.index:
      continue  # no F/M pair to compare
    women, men = by_gender.loc["F"], by_gender.loc["M"]

    # If less common in women/men but they die more
    women_flagged = (women["TOTAL"] < men["TOTAL"]) and (women["DEAD"] > men["DEAD"])
    men_flagged = (men["TOTAL"] < women["TOTAL"]) and (men["DEAD"] > women["DEAD"])
    if not (women_flagged or men_flagged):
      continue

    if women_flagged:
      flagged_gender, flagged, reference = 'F', women, men
    else:
      flagged_gender, flagged, reference = 'M', men, women

    n_occ = women["TOTAL"] + men["TOTAL"]
    prob_occ = reference["TOTAL"] / n_occ
    prob_mort = reference["DEAD"] / reference["TOTAL"]

    # p-value: binom test
    # H0: the flagged gender follows the other gender's proportion
    records.append({
        'icd9_code': flagged["ICD9_CODE"],
        'gender': flagged_gender,
        'p_value_occ': binom_test(x=flagged["TOTAL"], n=n_occ, p=prob_occ,
                                  alternative='two-sided'),
        'p_value_mort': binom_test(x=flagged["DEAD"], n=flagged["TOTAL"], p=prob_mort,
                                   alternative='two-sided'),
        'alive_f': women["ALIVE"], 'dead_f': women["DEAD"], 'total_f': women["TOTAL"],
        'alive_m': men["ALIVE"], 'dead_m': men["DEAD"], 'total_m': men["TOTAL"],
    })
  return records


_binomial_records = _rarer_but_deadlier_tests(gender_mortality_df)

# Column-wise lists consumed by the cell that builds the results table.
_RECORD_KEYS = ('icd9_code', 'gender', 'p_value_occ', 'p_value_mort',
                'alive_f', 'dead_f', 'total_f', 'alive_m', 'dead_m', 'total_m')
(icd9_code, gender, p_value_occ, p_value_mort,
 alive_f, dead_f, total_f, alive_m, dead_m, total_m) = (
    [record[key] for record in _binomial_records] for key in _RECORD_KEYS
)

In [16]:
# Assemble the per-code test results into a table. The mapping gets its own
# name so the built-in `dict` is not shadowed for the rest of the session.
binom_test_columns = {
    'Gender': gender, 'ICD9': icd9_code, 'Occ p-value': p_value_occ,
    'Mort p-value': p_value_mort, 'Alive F': alive_f, 'Dead F': dead_f,
    'Total F': total_f, 'Alive M': alive_m, 'Dead M': dead_m, 'Total M': total_m,
}

binom_test_df = pd.DataFrame(binom_test_columns)
# Add the description of each code; the default inner join keeps only codes
# present in the descriptions table.
binom_test_df = binom_test_df.merge(mimic_diagnoses_descriptions_df, left_on='ICD9', right_on='ICD9_CODE')
binom_test_df = binom_test_df.drop(['ROW_ID', 'ICD9_CODE', 'SHORT_TITLE'], axis=1)

# null hypothesis rejected: p-value < 0.05
# occ_women != p(occ_men) and mort_women != p(mort_men)
binom_test_df[(binom_test_df['Occ p-value'] < 0.05) & (binom_test_df['Mort p-value'] < 0.05)]

Unnamed: 0,Gender,ICD9,Occ p-value,Mort p-value,Alive F,Dead F,Total F,Alive M,Dead M,Total M,LONG_TITLE
2,M,27801,3.576236e-13,2.5e-05,603.0,42.0,645.0,461.0,60.0,521.0,Morbid obesity
4,F,34290,0.001330791,0.000223,155.0,48.0,203.0,204.0,33.0,237.0,"Hemiplegia, unspecified, affecting unspecified side"
5,F,4240,8.273525e-08,0.025959,1183.0,107.0,1290.0,1334.0,96.0,1430.0,Mitral valve disorders
7,F,45829,4.610106e-26,0.030672,836.0,55.0,891.0,1076.0,52.0,1128.0,Other iatrogenic hypotension
8,F,5121,6.719299e-22,0.007318,273.0,45.0,318.0,408.0,43.0,451.0,Iatrogenic pneumothorax
10,F,53081,2.802944e-35,9e-06,2479.0,222.0,2701.0,2983.0,193.0,3176.0,Esophageal reflux
11,F,6826,3.386256e-15,0.000176,247.0,46.0,293.0,361.0,35.0,396.0,"Cellulitis and abscess of leg, except foot"
16,M,V5865,9.691058e-08,4.7e-05,420.0,47.0,467.0,324.0,65.0,389.0,Long-term (current) use of steroids


In [17]:
# Number of codes for which both null hypotheses are rejected (p-value < 0.05).
occurrence_rejected = binom_test_df['Occ p-value'] < 0.05
mortality_rejected = binom_test_df['Mort p-value'] < 0.05
both_rejected_df = binom_test_df.loc[occurrence_rejected & mortality_rejected]
print(len(both_rejected_df))

8


In [18]:
# Only cases with transplanted organs and heart problems.
# Match on the description column by name rather than by position, ignore
# letter case so titles such as "Heart ..." are found too, and treat missing
# titles as non-matching.
binom_test_df.loc[
    binom_test_df['LONG_TITLE'].str.contains(r'transplant|heart', case=False, na=False)
]

Unnamed: 0,Gender,ICD9,Occ p-value,Mort p-value,Alive F,Dead F,Total F,Alive M,Dead M,Total M,LONG_TITLE
