In [1]:
""" 
Goals: Find the candidate features (ICD codes) and response (ICD code)
Ideally, we want features to have different frequencies for different domains

Explore in the following way:
- Target/source features by (frequency, ratio, description)
- Correlation matrix (target vs source feature correlation)

"""

' \nGoals: Find the candidate features (ICD codes) and response (ICD code)\nIdeally, we want features to have different frequencies for different domains\n\nExplore in the following way:\n- Target/source features by (frequency, ratio, description)\n- Correlation matrix (target vs source feature correlation)\n\n'

In [2]:
import sys
sys.path.append("/home/wanxinli/EHR-OT/")

import collections
from common import *
from ast import literal_eval
# %matplotlib notebook
import matplotlib.pyplot as plt
import math
from mpl_toolkits import mplot3d
import numpy as np
import pandas as pd
import random
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import precision_score, recall_score, accuracy_score


In [3]:
""" 
Read in dataset
"""

# Load the admission-level diagnosis table.
# The first CSV column is used as the index, and the 'ICD codes' column is
# run through literal_eval so each cell becomes a Python object (the rendered
# table shows one list of codes per admission) instead of raw text.
admid_diagnosis_df = pd.read_csv(
    "../../outputs/mimic/ADMID_DIAGNOSIS.csv",
    header=0,
    index_col=0,
    converters={'ICD codes': literal_eval},
)
admid_diagnosis_df

Unnamed: 0_level_0,adm_type,gender,ICD codes,duration,diagnosis,label
admid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
172335,EMERGENCY,F,"[40301, 486, 58281, 5855, 4254, 2762, 7100, 27...",530460.0,LEG PAIN,0
173633,EMERGENCY,F,"[40301, 5856, 58381, 7100, 5589, 2875, 28521, ...",574560.0,ABDOMINAL PAIN,0
174105,EMERGENCY,M,"[53100, 41071, 2859, 41401, 725]",425460.0,GASTROINTESTINAL BLEED,0
109976,EMERGENCY,M,"[1915, 3314, 53081]",1284240.0,HYDROCEPHALUS,0
178393,ELECTIVE,M,"[41401, 4111, 48283, 2859, 2720, 3051]",485280.0,USA/SDA,0
...,...,...,...,...,...,...
172304,ELECTIVE,F,"[20280, 3485, 7843, 4019, 2720, 53081]",449700.0,LEFT BRAIN TUMOR/SDA,0
152542,EMERGENCY,M,"[566, 25062, 3572, V5867, 42731, 4019, 4280, 5...",415740.0,PERIRECTAL ABSCESS,0
161999,EMERGENCY,M,"[43411, 3485, 3484, 430, 34830, 99731, 51883, ...",692940.0,STROKE;TELEMETRY,0
189314,EMERGENCY,F,"[34680, 7843, 7455, 78194, 36840, 7813, 7820, ...",150060.0,STROKE;TELEMETRY;TRANSIENT ISCHEMIC ATTACK,0


In [4]:
""" 
Correlation matrix of codes of females 
Correlation matrix of codes of males

Question: What do we want to get from the correlation matrices?
"""


' \nCorrelation matrix of codes of females \nCorrelation matrix of codes of males\n\nQuestion: What do we want to get from the correlation matrices?\n'

In [5]:
""" 
Target/source features by (frequency, ratio, description)

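Keys are features (ICD codes, used as the index)
Values are frequency in female, frequency in male, ICD description
(this cell only loads the ICD descriptions; the frequency columns are added in a later cell)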

"""
# Load the ICD-9 diagnosis dictionary and index it by code.
# The code column is read explicitly as text: codes such as "01166" carry
# leading zeros, and leaving the dtype to inference risks turning purely
# numeric codes into integers. Later lookups use the code as a dictionary key
# (presumably the string codes parsed from ADMID_DIAGNOSIS.csv -- verify), so
# the index must stay a string for those lookups to match.
summary_df = (
    pd.read_csv(
        "~/EHR-OT/mimic_exp/mimiciii/D_ICD_DIAGNOSES.csv",
        index_col=None,
        header=0,
        dtype={'ICD9_CODE': str},
    )
    # ROW_ID is dropped; only the code and its two titles are kept.
    .drop(columns=['ROW_ID'])
    .rename(columns={'ICD9_CODE': 'ICD code', 'SHORT_TITLE': 'short title', 'LONG_TITLE': 'long title'})
    .set_index(['ICD code'])
)
summary_df

Unnamed: 0_level_0,short title,long title
ICD code,Unnamed: 1_level_1,Unnamed: 2_level_1
01166,TB pneumonia-oth test,"Tuberculous pneumonia [any form], tubercle bac..."
01170,TB pneumothorax-unspec,"Tuberculous pneumothorax, unspecified"
01171,TB pneumothorax-no exam,"Tuberculous pneumothorax, bacteriological or h..."
01172,TB pneumothorx-exam unkn,"Tuberculous pneumothorax, bacteriological or h..."
01173,TB pneumothorax-micro dx,"Tuberculous pneumothorax, tubercle bacilli fou..."
...,...,...
V7399,Scrn unspcf viral dis,Special screening examination for unspecified ...
V740,Screening for cholera,Screening examination for cholera
V741,Screening-pulmonary TB,Screening examination for pulmonary tuberculosis
V742,Screening for leprosy,Screening examination for leprosy (Hansen's di...


In [13]:
""" 
Calculates ICD code frequencies for target and source
"""


# Split the admissions by gender and tally how often each ICD code appears.
# The *_freq_dict values are raw occurrence counts; turning them into
# per-admission rates is left to the code that fills in the rate columns.
# NOTE(review): a code listed more than once within a single admission is
# counted every time -- confirm that per-admission de-duplication is not wanted.

female_admid_diagnosis_df = admid_diagnosis_df.loc[admid_diagnosis_df['gender'] == 'F']
print("female diagnose shape is:", female_admid_diagnosis_df.shape)
# Walk the 'ICD codes' column directly: iterrows() would build a full Series
# for every admission just to read this one field.
female_all_codes = [
    code
    for codes in female_admid_diagnosis_df['ICD codes']
    for code in codes
]
female_freq_dict = dict(collections.Counter(female_all_codes))

male_admid_diagnosis_df = admid_diagnosis_df.loc[admid_diagnosis_df['gender'] == 'M']
print("male diagnose shape is:", male_admid_diagnosis_df.shape)
male_all_codes = [
    code
    for codes in male_admid_diagnosis_df['ICD codes']
    for code in codes
]
male_freq_dict = dict(collections.Counter(male_all_codes))

female diagnose shape is: (26026, 6)
male diagnose shape is: (32950, 6)


In [14]:
""" 
Add target, source rate to summary_df
"""

# Attach per-gender counts and rates to the ICD summary table.
#   '<gender> freq' : raw occurrence count of the code for that gender
#   '<gender> rate' : freq divided by the number of admissions of that gender
#   'male to female rate ratio' : male rate / female rate
#
# Index.map looks up every code in the frequency dictionary in one vectorized
# step; codes absent from a dictionary come back as NaN. astype(float) keeps
# the columns float even if every code happens to be present, so the CSV
# number formatting does not depend on the data.
summary_df['male freq'] = summary_df.index.map(male_freq_dict).astype(float)
summary_df['female freq'] = summary_df.index.map(female_freq_dict).astype(float)

# Normalise counts by the number of admissions in each gender group.
summary_df['male rate'] = summary_df['male freq'] / male_admid_diagnosis_df.shape[0]
summary_df['female rate'] = summary_df['female freq'] / female_admid_diagnosis_df.shape[0]
summary_df['male to female rate ratio'] = summary_df['male rate'] / summary_df['female rate']

# Keep only codes observed in both genders: a NaN rate means the code never
# appeared for that gender, so the ratio is undefined.
summary_df = summary_df.dropna(subset=['male rate', 'female rate'])
summary_df.to_csv("../../outputs/mimic/summary_mimic.csv", index=True, header=True)