In [2]:
""" 
Goals: Find the candidate features (ICD codes) and response (ICD code)
Ideally, we want features to have different frequencies for different domains

Explore in the following way:
- Target/source features by (frequency, ratio, description)
- Correlation matrix (target vs source feature correlation)

"""

' \nGoals: Find the candidate features (ICD codes) and response (ICD code)\nIdeally, we want features to have different frequencies for different domains\n\nExplore in the following way:\n- Target/source features by (frequency, ratio, description)\n- Correlation matrix (target vs source feature correlation)\n\n'

In [3]:
import sys
sys.path.append("/home/wanxinli/deep_patient/")

import collections
from common import *
from ast import literal_eval
# %matplotlib notebook
import matplotlib.pyplot as plt
import math
from mpl_toolkits import mplot3d
import numpy as np
import pandas as pd
import random
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import precision_score, recall_score, accuracy_score


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
""" 
Read in dataset
"""

admid_diagnosis_df = pd.read_csv("../../outputs/mimic/ADMID_DIAGNOSIS.csv", index_col=0, header=0, converters={'ICD codes': literal_eval})
admid_diagnosis_df

Unnamed: 0_level_0,adm_type,gender,ICD codes,duration,diagnosis,label
admid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
172335,EMERGENCY,F,"[403.01, 486, 582.81, 585.5, 425.4, 276.2, 710...",530460.0,LEG PAIN,0
173633,EMERGENCY,F,"[403.01, 585.6, 583.81, 710.0, 558.9, 287.5, 2...",574560.0,ABDOMINAL PAIN,0
174105,EMERGENCY,M,"[531.00, 410.71, 285.9, 414.01, 725]",425460.0,GASTROINTESTINAL BLEED,0
109976,EMERGENCY,M,"[191.5, 331.4, 530.81]",1284240.0,HYDROCEPHALUS,0
178393,ELECTIVE,M,"[414.01, 411.1, 482.83, 285.9, 272.0, 305.1]",485280.0,USA/SDA,0
...,...,...,...,...,...,...
172304,ELECTIVE,F,"[202.80, 348.5, 784.3, 401.9, 272.0, 530.81]",449700.0,LEFT BRAIN TUMOR/SDA,0
152542,EMERGENCY,M,"[566, 250.62, 357.2, V58.67, 427.31, 401.9, 42...",415740.0,PERIRECTAL ABSCESS,0
161999,EMERGENCY,M,"[434.11, 348.5, 348.4, 430, 348.30, 997.31, 51...",692940.0,STROKE;TELEMETRY,0
189314,EMERGENCY,F,"[346.80, 784.3, 745.5, 781.94, 368.40, 781.3, ...",150060.0,STROKE;TELEMETRY;TRANSIENT ISCHEMIC ATTACK,0


In [5]:
""" 
Correlation matrix of codes of females 
Correlation matrix of codes of males

Question: What do we want to get from the correlation matrices?
"""


' \nCorrelation matrix of codes of females \nCorrelation matrix of codes of males\n\nQuestion: What do we want to get from the correlation matrices?\n'

In [6]:
""" 
Target/source features by (frequency, ratio, description)

Keys are features
Values are frequency in female, frequency in male, ICD description

"""
summary_df = pd.read_csv("~/deep_patient/mimic_exp/mimiciii/D_ICD_DIAGNOSES.csv", index_col=None, header=0)
summary_df = summary_df.drop(['ROW_ID'], axis=1)
summary_df = summary_df.rename(columns={'ICD9_CODE': 'ICD code', 'SHORT_TITLE': 'short title', 'LONG_TITLE': 'long title'})
summary_df = summary_df.set_index(['ICD code'])
summary_df

Unnamed: 0_level_0,short title,long title
ICD code,Unnamed: 1_level_1,Unnamed: 2_level_1
01166,TB pneumonia-oth test,"Tuberculous pneumonia [any form], tubercle bac..."
01170,TB pneumothorax-unspec,"Tuberculous pneumothorax, unspecified"
01171,TB pneumothorax-no exam,"Tuberculous pneumothorax, bacteriological or h..."
01172,TB pneumothorx-exam unkn,"Tuberculous pneumothorax, bacteriological or h..."
01173,TB pneumothorax-micro dx,"Tuberculous pneumothorax, tubercle bacilli fou..."
...,...,...
V7399,Scrn unspcf viral dis,Special screening examination for unspecified ...
V740,Screening for cholera,Screening examination for cholera
V741,Screening-pulmonary TB,Screening examination for pulmonary tuberculosis
V742,Screening for leprosy,Screening examination for leprosy (Hansen's di...


In [7]:
def divide_dataset(divide_feature, group_1, group_2):
    """ 
    Divide dataset into group 1 (key value group_1) and group 2 (key value group_2) \
        by dividing feature divide_feature
    """
    group_1_df = admid_diagnosis_df.loc[admid_diagnosis_df[divide_feature] == group_1]
    group_1_codes = []
    for _, row in group_1_df.iterrows():
        group_1_codes.extend(row['ICD codes'])
    group_1_freq_dict = dict(collections.Counter(group_1_codes))
    group_1_total = sum(group_1_freq_dict.values())
    group_1_freq_dict = {k: v / group_1_total for k, v in group_1_freq_dict.items()}

    group_2_df = admid_diagnosis_df.loc[admid_diagnosis_df[divide_feature] == group_2]
    group_2_codes = []
    for _, row in group_2_df.iterrows():
        group_2_codes.extend(row['ICD codes'])
    group_2_freq_dict = dict(collections.Counter(group_2_codes))
    group_2_total = sum(group_2_freq_dict.values())
    group_2_freq_dict = {k: v / group_2_total for k, v in group_2_freq_dict.items()}

    return group_1_freq_dict, group_2_freq_dict, group_1_total, group_2_total


In [8]:
""" 
Calculates ICD code frequencies for target and source
"""
male_freq_dict, female_freq_dict, male_total, female_total = divide_dataset('gender', 'M', 'F')
print("male_total is:", male_total)
print("female_total is:", female_total)

male_total is: 362773
female_total is: 288274


In [9]:
def find_differ_codes(dict_1, dict_2, diff_percent):
    """ 
    Find codes in dictionary 1 (dict_1) and and dictionary 2 (dict 2) \
        with relative frequencies differing by diff_percent, 
    
    Returns: the codes differing by diff_percent
    """
    diff_codes = []
    for key, value in dict_1.items():
        # print(key, value)
        if key in dict_2:
            if abs(dict_2[key] - value) > diff_percent:
                diff_codes.append(key)
                print(f"code {key}: dictionary 1 frequency - {value}, dictionary 2 frequency - {dict_2[key]}")
        elif value > diff_percent:
            diff_codes.append
            print(f"code {key}: dictionary 1 frequency - {value}, dictionary 2 frequency - 0")

    for key, value in dict_2.items():
        if key not in dict_1 and value > diff_percent:
            diff_codes.append(key)
            print(f"code {key}: dictionary 1 frequency - 0, dictionary 2 frequency - {value}")
    return diff_codes


In [16]:
all_codes = list(male_freq_dict)
female_codes = list(female_freq_dict)
all_codes.extend(female_codes)
num_codes = sum(male_freq_dict.values())*male_total + sum(female_freq_dict.values())*female_total
print("number of codes is:", num_codes)
all_codes = list(set(all_codes))
num_unique_codes = len(all_codes)
print("number of unique codes is:", num_unique_codes)
avg_percent = 1/num_unique_codes
print("avg percent is:", avg_percent)

number of codes is: 651047.0000000468
number of unique codes is: 6985
avg percent is: 0.00014316392269148176


In [19]:
# Find frequencies in female and male dictionaries differing by diff_percent
diff_codes = find_differ_codes(male_freq_dict, female_freq_dict, 10*avg_percent)
print("number of differing codes is:", len(diff_codes))

# report the statistics in REB application

code 285.9: dictionary 1 frequency - 0.007379270232349155, dictionary 2 frequency - 0.009466687942721161
code 414.01: dictionary 1 frequency - 0.022347308096247517, dictionary 2 frequency - 0.014992680574730985
code 411.1: dictionary 1 frequency - 0.003299584037400799, dictionary 2 frequency - 0.0016165176186544745
code 305.1: dictionary 1 frequency - 0.0058080397383487746, dictionary 2 frequency - 0.004339621332482291
code 599.0: dictionary 1 frequency - 0.006968545068128003, dictionary 2 frequency - 0.013969348605840277
code 274.9: dictionary 1 frequency - 0.004217513431264179, dictionary 2 frequency - 0.0019148449045005792
code V10.46: dictionary 1 frequency - 0.003327149484663963, dictionary 2 frequency - 0
code 428.33: dictionary 1 frequency - 0.0012156362243055574, dictionary 2 frequency - 0.002702290182257158
code 428.0: dictionary 1 frequency - 0.01930959580784678, dictionary 2 frequency - 0.021181237295073436
code V45.81: dictionary 1 frequency - 0.005954136608843546, dictiona

In [12]:
""" 
Add target, source rate to summary_df
"""

# Add column names
summary_df['male freq'] = [np.nan]*summary_df.shape[0]
summary_df['female freq'] = [np.nan]*summary_df.shape[0]
summary_df['male rate'] = [np.nan]*summary_df.shape[0]
summary_df['female rate'] = [np.nan]*summary_df.shape[0]

# Add frequencies
for index, row in summary_df.iterrows():
    if index in male_freq_dict:
        summary_df.at[index, 'male freq'] = male_freq_dict[index]
        summary_df.at[index, 'male rate'] = male_freq_dict[index]
    if index in female_freq_dict:
        summary_df.at[index, 'female freq'] = female_freq_dict[index]
        summary_df.at[index, 'female rate'] = female_freq_dict[index]

summary_df['male rate'] = summary_df.apply(lambda row: row['male rate']/male_admid_diagnosis_df.shape[0], axis=1)
summary_df['female rate'] = summary_df.apply(lambda row: row['female rate']/female_admid_diagnosis_df.shape[0], axis=1)
summary_df['male to female rate ratio'] = summary_df.apply(lambda row: row['male rate']/row['female rate'], axis=1)

# delete rows that have NaN

drop_indices = []
for index, row in summary_df.iterrows():
    if  math.isnan(row['male rate']) or math.isnan(row['female rate']):
        drop_indices.append(index)
summary_df = summary_df.drop(drop_indices)
# summary_df.to_csv("../../outputs/mimic/summary_mimic.csv", index=True, header=True)

NameError: name 'male_admid_diagnosis_df' is not defined