In [None]:
from collections import defaultdict
import pandas as pd
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import os
from scipy.stats import chi2_contingency, mannwhitneyu
from math import log10, log2
import itertools

import networkx as nx
from tqdm import tqdm
import pickle

import warnings

# Display limits: show up to 500 DataFrame rows and summarise numpy arrays
# once they exceed 500 elements.
pd.options.display.max_rows = 500
np.set_printoptions(threshold=500)

# Turn off pandas' chained-assignment warning (the pandas default is 'warn').
pd.set_option('mode.chained_assignment', None)

# Show DeprecationWarnings, hide FutureWarnings.
warnings.simplefilter("default", category=DeprecationWarning)
warnings.simplefilter("ignore", category=FutureWarning)

In [None]:
# Project paths are resolved relative to the current working directory;
# input CSV files are read from its 'Data' sub-directory.
basedir = os.getcwd()
datadir = os.path.join(basedir,'Data')
# Diagnosis grouping columns visible while setupfunc.py runs below.
# NOTE(review): `diagkeys` is reassigned with different column names in the
# parameters cell further down — confirm which spelling setupfunc.py expects.
diagkeys = ['DiagnosisName','Level3_Category','Level2_Category']

# `-i` executes setupfunc.py inside this notebook's namespace, so it can read
# the variables defined above and the names it defines become available here.
# Later cells call ICD10_code_to_chapter and countPtsDiagnosis_Dict, which are
# not defined in this notebook — presumably they come from this script.
%run -i setupfunc.py

In [None]:
# ---- Analysis parameters ----
total_alz = 8804     # total number of Alzheimer patients
total_con = 17608    # total number of control patients

# Diagnosis columns available as network levels.
diagkeys = ['DiagnosisName', 'level3_diagnosis', 'level2_diagnosis']

# Fraction of a cohort; multiplied by the cohort size to threshold edge
# patient counts when edge attributes are built.
cutoff = 0.01

# Alzheimer Patients

In [None]:
# Load the diagnosis records of the Alzheimer cohort.
alzdiag = pd.read_csv(os.path.join(datadir, 'ad_diagnosis.csv'))

# ICD-10 chapter of every record, looked up from the first three characters
# of its code.
alzdiag['ValueL'] = alzdiag['Value'].apply(
    lambda code: ICD10_code_to_chapter(str(code)[:3])
)

# Patient counts per diagnosis level (helper is defined outside this notebook).
alzdiagcount = countPtsDiagnosis_Dict(alzdiag, total_alz)

# Number of distinct (PatientID, Sex) pairs per sex; used as the denominators
# of the pFemale / pMale percentages.
numsexalz = {
    sex: len(alzdiag.loc[alzdiag['Sex'] == sex, ['PatientID', 'Sex']].drop_duplicates())
    for sex in ('Female', 'Male')
}

In [None]:
# Diagnosis column used as the node label when building the network.
# Options listed by the author: 'DiagnosisName', 'level3_diagnosis', 'level2_diagnosis'.
n = 'DiagnosisName'

In [None]:
# Node labels: the `n` column of the count table stored under key `n`.
alz_nodes = alzdiagcount[n][n]

# Undirected graph with one node per entry of alz_nodes.
AlzGraph = nx.Graph()
AlzGraph.add_nodes_from(alz_nodes)

In [None]:
# Build the node attributes of AlzGraph: for every diagnosis label, the number
# of unique patients, the female/male counts and percentages, plus the columns
# merged in from the pickled male/female statistics.
print('Count number of patients per node...')
diagtemp = alzdiag[['PatientID', n, 'Sex']].drop_duplicates()
diagtemp = pd.pivot_table(
    diagtemp, values=['PatientID', 'Sex'], index=n,
    aggfunc={
        'PatientID': lambda x: len(x.unique()),      # unique patients per diagnosis
        'Sex': lambda x: dict(x.value_counts()),     # {sex: row count} per diagnosis
    },
)
diagtemp = diagtemp.sort_values('PatientID', ascending=False)

# Add sex information
print('Set Sex as Attributes...')
# Take an explicit copy so the column assignments below act on an independent
# frame instead of relying on the globally silenced chained-assignment warning.
alz_node_attr = diagtemp[diagtemp.index.isin(alz_nodes)].copy()

# A sex that is missing from the per-diagnosis dict counts as 0 patients / 0 %.
alz_node_attr['Females'] = alz_node_attr['Sex'].apply(lambda counts: counts.get('Female', 0))
alz_node_attr['pFemale'] = alz_node_attr['Sex'].apply(
    lambda counts: counts['Female']*100/numsexalz['Female'] if 'Female' in counts else 0)
alz_node_attr['Males'] = alz_node_attr['Sex'].apply(lambda counts: counts.get('Male', 0))
alz_node_attr['pMale'] = alz_node_attr['Sex'].apply(
    lambda counts: counts['Male']*100/numsexalz['Male'] if 'Male' in counts else 0)

alz_node_attr = alz_node_attr.drop('Sex', axis=1)
alz_node_attr = alz_node_attr.rename(columns={'PatientID': 'PtCount'})

print('Adding MF stats information...')
# SECURITY: pickle.load can execute arbitrary code — only load a file produced
# by a trusted run of this project.
with open('alzcon_diagnosis_MF_stats.pickle', 'rb') as handle:
    statsMF = pickle.load(handle)
alz_node_attr = alz_node_attr.merge(statsMF[n], left_index=True, right_index=True, how='left')

# expand into 'ValueL2' if diagnosis is part of multiple blocks
# (the 'ValueL' column is expected to come from the merge above).
if (alz_node_attr['ValueL'].apply(lambda x: len(x) if isinstance(x, list) else 1).sum()) == alz_node_attr.shape[0]:
    # Every entry holds a single value: unwrap one-element lists.
    alz_node_attr['ValueL'] = alz_node_attr['ValueL'].apply(lambda x: x[0] if isinstance(x, list) else x)
else:
    # First element stays in 'ValueL'; the remaining ones are comma-joined into 'ValueL2'.
    # NOTE(review): in this branch non-list entries become NaN, whereas the branch
    # above keeps them unchanged — confirm that non-list entries are always missing values.
    alz_node_attr['ValueL2'] = alz_node_attr['ValueL'].apply(
        lambda x: ",".join(x[1:]) if (isinstance(x, list) and len(x) > 1) else np.nan)
    alz_node_attr['ValueL'] = alz_node_attr['ValueL'].apply(lambda x: x[0] if isinstance(x, list) else np.nan)

alz_node_attr = alz_node_attr.to_dict(orient='index')  # {node: {attribute: value}}
nx.set_node_attributes(AlzGraph, alz_node_attr)

In [None]:
# Build the edge list of AlzGraph: one row per (diagnosis pair, patient) for
# every pair of diagnoses recorded for the same patient.
print('Create all edges...')

# make a dataframe of edges
diagtemp = alzdiag[['PatientID', n, 'Sex']].drop_duplicates()
diagtemp = diagtemp[diagtemp[n].isin(alz_nodes)]
grouped = diagtemp.groupby('PatientID')

edge_col = n + 'Combo'
edge_records = []
# Iterate the groupby lazily (no list(grouped) copy of every group) and collect
# plain records, so a single DataFrame is built at the end instead of one small
# DataFrame per patient followed by a concat.
for _, pt in tqdm(grouped, total=grouped.ngroups):
    patient_id = pt['PatientID'].values[0]
    patient_sex = pt['Sex'].values[0]
    # Sorting first gives each pair a fixed orientation; dict.fromkeys drops
    # repeated pairs while keeping first-seen order.
    pairs = dict.fromkeys(itertools.combinations(pt[n].sort_values(), r=2))
    edge_records.extend((pair, patient_id, patient_sex) for pair in pairs)

# Also valid when no patient has two or more diagnoses (empty frame).
alz_edges = pd.DataFrame(edge_records, columns=[edge_col, 'PatientID', 'Sex'])

# Add to graph
AlzGraph.add_edges_from(alz_edges[edge_col])

In [None]:
# Collapse the per-patient edge rows into one row per diagnosis pair, holding
# the number of unique patients and a {sex: count} dict.
# Author's note: this can take at least an hour to run.
diagtemp = pd.pivot_table(
    alz_edges,
    index=n + 'Combo',
    values=['PatientID', 'Sex'],
    aggfunc={
        'PatientID': lambda ids: ids.nunique(dropna=False),
        'Sex': lambda sexes: dict(sexes.value_counts()),
    },
)

print('sorting...')
diagtemp = diagtemp.sort_values(by='PatientID', ascending=False)

In [None]:
# Attach per-edge attributes (patient count, female/male counts and
# percentages) to AlzGraph and write the graph to GraphML.
# Work on a copy so the aggregated table in `diagtemp` is not modified.
alz_edge_attr = diagtemp.copy()

# A sex that is missing from the per-edge dict counts as 0 patients / 0 %.
alz_edge_attr['Females'] = alz_edge_attr['Sex'].apply(lambda counts: counts.get('Female', 0))
alz_edge_attr['pFemale'] = alz_edge_attr['Sex'].apply(
    lambda counts: counts['Female']*100/numsexalz['Female'] if 'Female' in counts else 0)
alz_edge_attr['Males'] = alz_edge_attr['Sex'].apply(lambda counts: counts.get('Male', 0))
alz_edge_attr['pMale'] = alz_edge_attr['Sex'].apply(
    lambda counts: counts['Male']*100/numsexalz['Male'] if 'Male' in counts else 0)

alz_edge_attr = alz_edge_attr.drop('Sex', axis=1)
alz_edge_attr = alz_edge_attr.rename(columns={'PatientID': 'PtCount'})

print('Keeping edges with ',cutoff*100,'% of patients.')
# NOTE(review): this filters the attribute table only. Edges at or below the
# cutoff were already added to AlzGraph and are not removed here, so they stay
# in the graph (and in the GraphML file) without attributes — confirm intent.
alz_edge_attr = alz_edge_attr[alz_edge_attr['PtCount'] > (total_alz * cutoff)]

# Make dictionary: {(node_a, node_b): {attribute: value}}
alz_edge_attr = alz_edge_attr.to_dict(orient='index')
nx.set_edge_attributes(AlzGraph, alz_edge_attr)

# nx.info() was removed in networkx 3.0; print a summary that works on every version.
print('Graph with', AlzGraph.number_of_nodes(), 'nodes and', AlzGraph.number_of_edges(), 'edges')

nx.write_graphml(AlzGraph, n+'graph.graphml')

# Control patients

In [None]:
# Load the diagnosis records of the control cohort.
condiag = pd.read_csv(os.path.join(datadir, 'control_diagnosis.csv'))

# ICD-10 chapter of every record, looked up from the first three characters
# of its code.
condiag['ValueL'] = condiag['Value'].apply(
    lambda code: ICD10_code_to_chapter(str(code)[:3])
)

# Patient counts per diagnosis level (helper is defined outside this notebook).
condiagcount = countPtsDiagnosis_Dict(condiag, total_con)

# Number of distinct (PatientID, Sex) pairs per sex; used as the denominators
# of the pFemale / pMale percentages.
numsexcon = {
    sex: len(condiag.loc[condiag['Sex'] == sex, ['PatientID', 'Sex']].drop_duplicates())
    for sex in ('Female', 'Male')
}

In [None]:
# Diagnosis column used as the node label for the control network.
# Options listed by the author: 'DiagnosisName', 'level3_diagnosis', 'level2_diagnosis'.
n = 'DiagnosisName'

In [None]:
# Node labels: the `n` column of the count table stored under key `n`.
con_nodes = condiagcount[n][n]

# Undirected graph with one node per entry of con_nodes.
ConGraph = nx.Graph()
ConGraph.add_nodes_from(con_nodes)

In [None]:
# Build the node attribute table of ConGraph: for every diagnosis label, the
# number of unique patients, the female/male counts and percentages, plus the
# columns merged in from the male/female statistics.
print('Count number of patients per node...')
diagtemp = condiag[['PatientID', n, 'Sex']].drop_duplicates()
diagtemp = pd.pivot_table(
    diagtemp, values=['PatientID', 'Sex'], index=n,
    aggfunc={
        'PatientID': lambda x: len(x.unique()),      # unique patients per diagnosis
        'Sex': lambda x: dict(x.value_counts()),     # {sex: row count} per diagnosis
    },
)
diagtemp = diagtemp.sort_values('PatientID', ascending=False)

# Node Attributes
print('Set Sex as Attributes...')
# Take an explicit copy so the column assignments below act on an independent
# frame instead of relying on the globally silenced chained-assignment warning.
con_node_attr = diagtemp[diagtemp.index.isin(con_nodes)].copy()

# A sex that is missing from the per-diagnosis dict counts as 0 patients / 0 %.
con_node_attr['Females'] = con_node_attr['Sex'].apply(lambda counts: counts.get('Female', 0))
con_node_attr['pFemale'] = con_node_attr['Sex'].apply(
    lambda counts: counts['Female']*100/numsexcon['Female'] if 'Female' in counts else 0)
con_node_attr['Males'] = con_node_attr['Sex'].apply(lambda counts: counts.get('Male', 0))
con_node_attr['pMale'] = con_node_attr['Sex'].apply(
    lambda counts: counts['Male']*100/numsexcon['Male'] if 'Male' in counts else 0)
con_node_attr = con_node_attr.drop('Sex', axis=1)
con_node_attr = con_node_attr.rename(columns={'PatientID': 'PtCount'})

print('Adding MF stats information')
# `statsMF` is not loaded in this cell: it must already be in the namespace
# (it is unpickled in the Alzheimer node-attribute cell earlier in the notebook).
con_node_attr = con_node_attr.merge(statsMF[n], left_index=True, right_index=True, how='left')

# Expand diagnosis to ValueL2 if part of multiple categories
# (the 'ValueL' column is expected to come from the merge above).
if (con_node_attr['ValueL'].apply(lambda x: len(x) if isinstance(x, list) else 1).sum()) == con_node_attr.shape[0]:
    # Every entry holds a single value: unwrap one-element lists.
    con_node_attr['ValueL'] = con_node_attr['ValueL'].apply(lambda x: x[0] if isinstance(x, list) else x)
else:
    # First element stays in 'ValueL'; the remaining ones are comma-joined into 'ValueL2'.
    # NOTE(review): in this branch non-list entries become NaN, whereas the branch
    # above keeps them unchanged — confirm that non-list entries are always missing values.
    con_node_attr['ValueL2'] = con_node_attr['ValueL'].apply(
        lambda x: ",".join(x[1:]) if (isinstance(x, list) and len(x) > 1) else np.nan)
    con_node_attr['ValueL'] = con_node_attr['ValueL'].apply(lambda x: x[0] if isinstance(x, list) else np.nan)

In [None]:
# Turn the attribute table into {node: {attribute: value}} and attach it to
# the control graph. `con_node_attr` is rebound from a DataFrame to a dict here.
con_node_attr = con_node_attr.to_dict('index')
nx.set_node_attributes(ConGraph, values=con_node_attr)

In [None]:
# Build the edge list of ConGraph: one row per (diagnosis pair, patient) for
# every pair of diagnoses recorded for the same patient.
print('Create all edges...')
# make a dataframe of edges
diagtemp = condiag[['PatientID', n, 'Sex']].drop_duplicates()
diagtemp = diagtemp[diagtemp[n].isin(con_nodes)]
grouped = diagtemp.groupby('PatientID')

edge_col = n + 'Combo'
edge_records = []
# Iterate the groupby lazily (no list(grouped) copy of every group) and collect
# plain records, so a single DataFrame is built at the end instead of one small
# DataFrame per patient followed by a concat.
for _, pt in tqdm(grouped, total=grouped.ngroups):
    patient_id = pt['PatientID'].values[0]
    patient_sex = pt['Sex'].values[0]
    # Sorting first gives each pair a fixed orientation; dict.fromkeys drops
    # repeated pairs while keeping first-seen order.
    pairs = dict.fromkeys(itertools.combinations(pt[n].sort_values(), r=2))
    edge_records.extend((pair, patient_id, patient_sex) for pair in pairs)

# Also valid when no patient has two or more diagnoses (empty frame).
con_edges = pd.DataFrame(edge_records, columns=[edge_col, 'PatientID', 'Sex'])

# Add to graph
ConGraph.add_edges_from(con_edges[edge_col])

In [None]:
# Collapse the per-patient edge rows into one row per diagnosis pair, holding
# the number of unique patients and a {sex: count} dict.
diagtemp = pd.pivot_table(
    con_edges,
    index=n + 'Combo',
    values=['PatientID', 'Sex'],
    aggfunc={
        'PatientID': lambda ids: ids.nunique(dropna=False),
        'Sex': lambda sexes: dict(sexes.value_counts()),
    },
)

print('sorting...')
diagtemp = diagtemp.sort_values(by='PatientID', ascending=False)

In [None]:
# Attach per-edge attributes (patient count, female/male counts and
# percentages) to ConGraph and write the graph to GraphML.
# Work on a copy so the aggregated table in `diagtemp` is not modified.
con_edge_attr = diagtemp.copy()

# A sex that is missing from the per-edge dict counts as 0 patients / 0 %.
con_edge_attr['Females'] = con_edge_attr['Sex'].apply(lambda counts: counts.get('Female', 0))
con_edge_attr['pFemale'] = con_edge_attr['Sex'].apply(
    lambda counts: counts['Female']*100/numsexcon['Female'] if 'Female' in counts else 0)
con_edge_attr['Males'] = con_edge_attr['Sex'].apply(lambda counts: counts.get('Male', 0))
con_edge_attr['pMale'] = con_edge_attr['Sex'].apply(
    lambda counts: counts['Male']*100/numsexcon['Male'] if 'Male' in counts else 0)

con_edge_attr = con_edge_attr.drop('Sex', axis=1)
con_edge_attr = con_edge_attr.rename(columns={'PatientID': 'PtCount'})

print('Keeping edges with ',cutoff*100,'% of patients.')
# NOTE(review): this filters the attribute table only. Edges at or below the
# cutoff were already added to ConGraph and are not removed here, so they stay
# in the graph (and in the GraphML file) without attributes — confirm intent.
con_edge_attr = con_edge_attr[con_edge_attr['PtCount'] > (total_con * cutoff)]

# Make dictionary: {(node_a, node_b): {attribute: value}}
con_edge_attr = con_edge_attr.to_dict(orient='index')
# Add to attributes
nx.set_edge_attributes(ConGraph, con_edge_attr)

# nx.info() was removed in networkx 3.0; print a summary that works on every version.
print('Graph with', ConGraph.number_of_nodes(), 'nodes and', ConGraph.number_of_edges(), 'edges')
nx.write_graphml(ConGraph, n+'graph_control.graphml')