In [29]:
import pandas as pd
import numpy as np
import json
import sys

In [49]:
def read_files(
    data_path='../data/labeled_clustered_data_with_ids.csv',
    patient_icd9_path='/Users/Aansh/Documents/Brown /Research/AIMed/data/patient_icd9.csv',
    diagnoses_path='/Users/Aansh/Documents/Brown /Research/AIMed/data/DIAGNOSES_ICD.csv',
    codes_path='/Users/Aansh/Documents/Brown /Research/AIMed/icd9/codes.json',
):
    """
    Loads the three CSV tables and the ICD9 code JSON used by this notebook.

    All paths default to the locations originally hard-coded here, so calling
    read_files() with no arguments behaves as before; pass explicit paths to run
    the notebook on another machine.

    Returns a tuple (data, X_ICD9, icd9_d, codes_file, codes):
      data       -- DataFrame read from data_path
      X_ICD9     -- DataFrame read from patient_icd9_path
      icd9_d     -- DataFrame read from diagnoses_path
      codes_file -- the file object codes were read from; it is CLOSED on return
                    and is only kept in the tuple for backward compatibility
      codes      -- the parsed JSON content of codes_path
    """
    data = pd.read_csv(data_path)
    X_ICD9 = pd.read_csv(patient_icd9_path)
    icd9_d = pd.read_csv(diagnoses_path)
    # Use a context manager so the handle is released even if parsing fails.
    with open(codes_path) as codes_file:
        codes = json.load(codes_file)
    return data, X_ICD9, icd9_d, codes_file, codes
def map_disease_to_parent(codes, depth=1):
    """
    Maps each disease to parent in ICD9 hierarchy. In absence of a parent, maps to None.

    codes is a sequence of groupings; each grouping is a sequence of dicts that
    carry at least a 'code' and a 'depth' key. Within a grouping, the parent is
    the 'code' of the entry whose 'depth' equals the requested depth (the last
    such entry wins if there are several). Every entry of the grouping,
    including the parent entry itself, is mapped to that parent.

    depth selects which hierarchy level is used as the parent; the default of 1
    reproduces the original hard-coded behaviour.

    Returns (parent_to_child, node_to_parent):
      parent_to_child -- dict of parent code (or None) -> list of codes under it
      node_to_parent  -- dict of code -> parent code (or None)
    """
    node_to_parent = {}
    parent_to_child = {}
    for grouping in codes:
        parent = None
        for entry in grouping:
            if entry['depth'] == depth:
                parent = entry['code']
        for entry in grouping:
            code = entry['code']
            node_to_parent[code] = parent
            parent_to_child.setdefault(parent, []).append(code)
    return parent_to_child, node_to_parent

def generate_intervals(parent_to_child):
    """
    Generates intervals for ICD9 codes to group codes in.

    Every truthy key of parent_to_child that looks like "<lower>-<upper>" is
    turned into ((lower, upper), key), where lower and upper are the
    whitespace-stripped strings on either side of the hyphen. Keys without a
    usable hyphen (and the None key) are skipped. The result keeps the
    iteration order of parent_to_child.
    """
    def split_intervals(key):
        # Split at the first hyphen that leaves a non-empty string on both sides.
        for i, char in enumerate(key):
            if char == "-":
                # key[:i] is everything before the hyphen. The previous slice
                # key[:i-1] also dropped the last character of the lower bound
                # (e.g. "140-239" produced "14" instead of "140").
                lower, upper = key[:i].strip(), key[i + 1:].strip()
                if lower and upper:
                    return lower, upper
        return None

    intervals = []
    for key in parent_to_child:
        if key:
            key_interval = split_intervals(key)
            if key_interval:
                intervals.append((key_interval, key))
    return intervals

def get_parent_cluster(code, intervals):
    """
    Returns the parent cluster for a particular code.
    case 1: code begins with an E -> parent is None
    case 2: code begins with an V -> parent is None
    case 3: code is purely numerical then should exist within intervals

    intervals is a sequence of ((lower, upper), parent_cluster) entries as
    produced by generate_intervals. The first three characters of code are
    compared, as an integer, against each inclusive [lower, upper] range, and
    the parent of the first matching range is returned.

    Intervals whose bounds are not integers are skipped, since they cannot
    contain a numeric code. If nothing matches, or the code prefix is empty or
    not numeric, "Bad Code:<prefix>" is printed and None is returned.
    """
    # startswith with a tuple also copes with an empty string (returns False).
    if code.startswith(("E", "V")):
        return None
    prefix = code[:3]
    try:
        # Parse once instead of on every loop iteration.
        code_num = int(prefix)
    except ValueError:
        code_num = None
    if code_num is not None:
        for interval in intervals:
            try:
                beginning = int(interval[0][0])
                end = int(interval[0][1])
            except ValueError:
                # Non-numeric range bounds cannot match a numeric code; skip them.
                continue
            if beginning <= code_num <= end:
                return interval[1]
    print("Bad Code:" + prefix)
    return None

def get_all_subject_IDs(X_ICD9):
    """
    Returns set of all subject IDs in the dataset.

    X_ICD9 must have a 'SUBJECT_ID' column; the distinct values of that column
    are returned as a Python set.
    """
    # Read the column directly rather than walking the frame row by row with
    # iterrows(), which builds a Series per row just to pick one value.
    return set(X_ICD9['SUBJECT_ID'].unique().tolist())
        
def get_patient_clusters(icd9_d, X_ICD9, parent_to_child):
    """
    Assigns a parent ICD9 cluster to the SEQ_NUM == 1 diagnosis of each admission.

    Only rows of icd9_d whose SUBJECT_ID appears in X_ICD9 and whose SEQ_NUM
    equals 1.0 are used. For each such row the ICD9_CODE is mapped to its
    parent cluster with get_parent_cluster (which may yield None).

    Returns (patients_in_clusters, dict_to_df):
      patients_in_clusters -- dict of parent cluster -> number of selected rows
      dict_to_df           -- {"hadm_id": [...], "cluster": [...]}, two parallel
                              lists with one entry per selected row, in the
                              row order of icd9_d
    """
    intervals = generate_intervals(parent_to_child)
    all_subjects = get_all_subject_IDs(X_ICD9)
    patients_in_clusters = {}
    hadm_ids = []
    clusters = []
    # Iterate the four needed columns in lockstep; this applies the same
    # per-row checks as before without the per-row Series that iterrows() builds.
    rows = zip(icd9_d['SUBJECT_ID'], icd9_d['HADM_ID'], icd9_d['SEQ_NUM'], icd9_d['ICD9_CODE'])
    for subject_id, hadm_id, seq_num, severe_code in rows:
        if subject_id not in all_subjects:
            continue
        if seq_num == 1.0:
            icd9_parent = get_parent_cluster(severe_code, intervals)
            hadm_ids.append(hadm_id)
            clusters.append(icd9_parent)
            patients_in_clusters[icd9_parent] = patients_in_clusters.get(icd9_parent, 0) + 1
    dict_to_df = {"hadm_id": hadm_ids, "cluster": clusters}
    return patients_in_clusters, dict_to_df

def add_cluster_to_dataframe(X, dict_to_df):
    """
    Joins per-admission cluster assignments onto X.

    dict_to_df is a dict of column name -> list (it must provide 'hadm_id');
    it is turned into a DataFrame and merged with X on 'hadm_id' using
    pandas' default merge type. The merged DataFrame is returned.
    """
    cluster_frame = pd.DataFrame(dict_to_df)
    return pd.merge(X, cluster_frame, left_on='hadm_id', right_on='hadm_id')

def create_cluster_csv(X_with_clusters, patients_in_clusters):
    """
    Writes one feature CSV and one label CSV per distinct value of 'cluster'.

    For each distinct cluster value c, the matching rows of X_with_clusters are
    written to the current working directory as:
      X_cluster_<c>.csv -- all columns except 'label'
      Y_cluster_<c>.csv -- the 'label' column only

    Rows whose cluster is missing (None/NaN) are selected with isna(), because
    an equality comparison against a missing value never matches and would
    leave those files empty.

    patients_in_clusters is accepted for backward compatibility and is not used.
    Always returns True.
    """
    cluster_col = X_with_clusters['cluster']
    for c in cluster_col.unique():
        if pd.isna(c):
            mask = cluster_col.isna()
        else:
            mask = cluster_col == c
        # Filter once and derive both outputs from the same subset.
        subset = X_with_clusters[mask]
        X_file_name = 'X_cluster_{0}.csv'.format(c)
        Y_file_name = 'Y_cluster_{0}.csv'.format(c)
        subset.drop(['label'], axis=1).to_csv(X_file_name, sep=',', index=False)
        subset['label'].to_csv(Y_file_name, sep=',', index=False)
    return True

In [50]:
# Load the input tables and the ICD9 code hierarchy (X is the first table returned by read_files).
X, X_ICD9, icd9_d, codes_file, codes = read_files()
# Build the parent -> children and node -> parent lookups from the ICD9 codes.
parent_to_child, node_to_parent = map_disease_to_parent(codes)
# Map each selected admission's SEQ_NUM == 1 diagnosis to its parent cluster and count rows per cluster.
patients_in_clusters, dict_to_df = get_patient_clusters(icd9_d, X_ICD9, parent_to_child)
# Join the per-admission cluster onto X by hadm_id; later cells read X_with_clusters.
X_with_clusters = add_cluster_to_dataframe(X, dict_to_df)

In [51]:
def gini(array):
    """Calculate the Gini coefficient of a numpy array.

    array may be any array-like of numbers (integer or float, any shape); it is
    converted to a flat float copy, so the caller's data is never modified.
    If the minimum is negative, all values are shifted so the minimum becomes 0,
    and a small constant (1e-7) is added to every value to avoid zeros.

    Raises ValueError if array is empty.
    """
    # based on bottom eq:
    # http://www.statsdirect.com/help/generatedimages/equations/equation154.svg
    # from:
    # http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm
    # All values are treated equally, arrays must be 1d. Converting to float
    # first lets the in-place additions below work for integer input too.
    values = np.asarray(array, dtype=float).flatten()
    if values.size == 0:
        raise ValueError("gini() requires at least one value")
    lowest = np.amin(values)
    if lowest < 0:
        # Values cannot be negative:
        values -= lowest
    # Values cannot be 0:
    values += 0.0000001
    # Values must be sorted:
    values = np.sort(values)
    # Number of array elements:
    n = values.shape[0]
    # Index per array element:
    index = np.arange(1, n + 1)
    # Gini coefficient:
    return (np.sum((2 * index - n - 1) * values)) / (n * np.sum(values))

In [52]:
def generate_cluster_ids(icd9_ids):
    """
    Assigns consecutive integer ids (starting at 0) to the distinct codes in
    icd9_ids, in order of first appearance.

    Returns (id_map, count) where id_map maps code -> id and count is the
    number of distinct codes, i.e. the first id not yet in use.
    """
    # dict.fromkeys drops repeats while keeping first-seen order.
    distinct_codes = dict.fromkeys(icd9_ids)
    id_map = {code: position for position, code in enumerate(distinct_codes)}
    return id_map, len(id_map)

def convert_list_to_id_map(id_map, codes, max_id):
    """
    Converts a sequence of codes into a float numpy array of integer ids.

    Codes present in id_map get their mapped id. A code that is not in id_map
    gets a new id, starting at max_id and counting upwards; every occurrence of
    the same unknown code receives the same id (previously each occurrence got
    a different one).

    Neither id_map nor codes is modified; use the returned array.
    """
    extra_ids = {}      # ids handed out here for codes missing from id_map
    next_id = max_id
    converted = []
    for code in codes:
        if code in id_map:
            converted.append(id_map[code])
            continue
        if code not in extra_ids:
            extra_ids[code] = next_id
            next_id += 1
        converted.append(extra_ids[code])
    return np.array(converted, dtype=float)

def filter_cluster_lists(cluster):
    """
    Replaces every falsy entry of cluster (e.g. None or an empty string) with
    the string 'None'. The sequence is edited in place and also returned.
    """
    for position, entry in enumerate(cluster):
        if entry:
            continue
        cluster[position] = 'None'
    return cluster

In [53]:
# To generate results for a different hierarchy depth, change the depth that
# map_disease_to_parent compares against (1 or 2) and re-run the cells above;
# the output file name below ('gini_depth_1.csv') should then be changed to match.
code_ids, max_id = generate_cluster_ids(parent_to_child.keys())
# One id array per value of 'cluster_num' (0, 1, 2): the ICD9 parent cluster of
# each row in that group, encoded as a numeric id.
cluster_0 = convert_list_to_id_map(code_ids, list(X_with_clusters[X_with_clusters['cluster_num']==0]['cluster']), max_id)
cluster_1 = convert_list_to_id_map(code_ids, list(X_with_clusters[X_with_clusters['cluster_num']==1]['cluster']), max_id)
cluster_2 = convert_list_to_id_map(code_ids, list(X_with_clusters[X_with_clusters['cluster_num']==2]['cluster']), max_id)
# NOTE(review): gini() is applied to the per-row id values themselves, not to
# per-cluster counts -- confirm this is the intended inequality measure.
g_0 = gini(cluster_0)
g_1 = gini(cluster_1)
g_2 = gini(cluster_2)
print(g_0, g_1, g_2)
# Context manager closes the file even if the write fails.
with open('results/kmodes_clusters/gini_depth_1.csv', 'w') as f:
    f.write('cluster_0,cluster_1,cluster_2 \n {0},{1},{2}'.format(g_0,g_1,g_2))

0.24594422815275316 0.37793898955408056 0.2839875692282057
