### Network Features (Layer 2)

In [81]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
from statistics import mode

import sys
sys.path.append('..')

In [73]:
from apparent.utils import load_graphs
from apparent.curvature import forman_curvature
from apparent.build_networks import build_network,process_row

In [74]:
# Relative path to the folder whose *.pkl files are loaded further down.
graphs_folder = '../outputs/all_graphs/'

In [75]:
def compute_stats(values, label):
    """Summarise a collection of numbers as a flat dict of labelled statistics.

    Parameters
    ----------
    values : sequence or array of numbers
        The observations to summarise.
    label : str
        Prefix used for every key of the returned dict.

    Returns
    -------
    dict
        Seven entries, in this order: ``{label}_mean``, ``{label}_stdev``
        (``np.std``), ``{label}_median``, ``{label}_mode``
        (``statistics.mode``) and ``{label}_quantile_25/50/75``
        (``np.percentile``).
    """
    summary = {}
    summary[f'{label}_mean'] = np.mean(values)
    summary[f'{label}_stdev'] = np.std(values)
    summary[f'{label}_median'] = np.median(values)
    summary[f'{label}_mode'] = mode(values)

    # The 50th percentile is kept as its own column even though it repeats the median.
    lower_q, middle_q, upper_q = np.percentile(values, [25, 50, 75])
    summary[f'{label}_quantile_25'] = lower_q
    summary[f'{label}_quantile_50'] = middle_q
    summary[f'{label}_quantile_75'] = upper_q
    return summary

In [94]:
def unpack_data(data):
    """Flatten one loaded graph record into a single row of scalar features.

    The input dict is left untouched; a new dict is returned. (The previous
    version popped entries from the caller's dict while building the row.)

    Parameters
    ----------
    data : dict
        Must contain a ``'graph'`` entry exposing ``nodes()`` and ``edges()``
        (presumably a networkx graph -- confirm against the code that writes
        the pickles). Every other entry is handled by type:

        * ``np.ndarray`` -> replaced by its ``compute_stats`` summary,
        * ``dict``       -> its values are summarised with ``compute_stats``,
        * anything else  -> copied through unchanged.

    Returns
    -------
    dict
        The pass-through entries in their original order, followed by
        ``nnodes``, ``nedges`` and the ``{key}_*`` summary statistics.
        The graph object itself is not included.

    Raises
    ------
    KeyError
        If ``data`` has no ``'graph'`` entry.
    """
    graph = data['graph']

    # Derived features are collected separately so they land after the
    # pass-through columns, matching the original column order.
    derived = {
        "nnodes": len(graph.nodes()),
        "nedges": len(graph.edges()),
    }
    row = {}
    for key, value in data.items():
        if key == 'graph':
            # The graph object is not a tabular feature; only its sizes are kept.
            continue
        if isinstance(value, np.ndarray):
            derived.update(compute_stats(value, key))
        elif isinstance(value, dict):
            derived.update(compute_stats(list(value.values()), key))
        else:
            row[key] = value

    row.update(derived)
    return row

In [95]:
# Build one feature row per pickled graph record found in graphs_folder.
# sorted() makes the row order reproducible, since os.listdir returns
# directory entries in arbitrary order.
# NOTE(security): pickle.load can execute arbitrary code while loading, so
# graphs_folder must only contain files from a trusted source.
data_list = []
for FILE in sorted(os.listdir(graphs_folder)):
    if FILE.endswith('.pkl'):
        # The file handle is only needed while unpickling.
        with open(os.path.join(graphs_folder, FILE), 'rb') as f:
            data = unpack_data(pickle.load(f))
        data_list.append(data)

In [96]:
# One row per loaded graph record; columns come from the keys of the dicts in data_list.
df = pd.DataFrame(data_list)

In [97]:
# Lower-case every column name (presumably so they line up with the lower-case
# 'hsa'/'year' keys used in the merges further down -- confirm).
df.columns = df.columns.str.lower()
df

Unnamed: 0,hsa,year,assortativity,average_clustering,density,nnodes,nedges,forman_mean,forman_stdev,forman_median,...,forman_quantile_25,forman_quantile_50,forman_quantile_75,centrality_mean,centrality_stdev,centrality_median,centrality_mode,centrality_quantile_25,centrality_quantile_50,centrality_quantile_75
0,13030,2016,-0.107466,0.446798,0.140592,44,133,-2.330827,6.694989,-2.0,...,-6.00,-2.0,2.0,0.140592,0.139090,0.069767,0.023256,0.023256,0.069767,0.209302
1,4002,2017,-0.590909,0.300000,0.333333,7,7,-0.428571,1.916630,0.0,...,-2.00,0.0,0.5,0.333333,0.218218,0.333333,0.166667,0.166667,0.333333,0.333333
2,14026,2014,-0.641791,0.542857,0.476190,7,10,1.100000,1.920937,2.0,...,0.00,2.0,2.0,0.476190,0.258638,0.500000,0.166667,0.250000,0.500000,0.666667
3,15054,2016,-0.192771,0.484392,0.287582,18,44,-1.863636,3.507077,-1.0,...,-4.00,-1.0,0.0,0.287582,0.160364,0.235294,0.235294,0.235294,0.235294,0.352941
4,17037,2017,-0.234387,0.808016,0.290472,47,314,-1.968153,12.181442,1.0,...,-11.00,1.0,6.0,0.290472,0.202666,0.239130,0.434783,0.152174,0.239130,0.402174
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13488,5234,2016,-0.153023,0.519787,0.077148,59,132,-3.424242,9.199902,-0.5,...,-8.25,-0.5,3.0,0.077148,0.084546,0.034483,0.017241,0.017241,0.034483,0.103448
13489,23042,2017,0.061726,0.592018,0.075720,196,1447,-8.722184,20.632640,-6.0,...,-22.00,-6.0,7.0,0.075720,0.082795,0.041026,0.005128,0.010256,0.041026,0.119231
13490,20001,2014,-0.351251,0.755854,0.145236,171,2111,-29.800095,43.244776,-20.0,...,-53.00,-20.0,1.0,0.145236,0.156633,0.088235,0.011765,0.041176,0.088235,0.211765
13491,5136,2016,-0.426011,0.328911,0.038924,80,123,-23.024390,20.095893,-22.0,...,-45.00,-22.0,-2.0,0.038924,0.077742,0.025316,0.012658,0.012658,0.025316,0.037975


In [111]:
# Save the network-only feature table; index=False keeps the DataFrame index out of the CSV.
df.to_csv('../outputs/IPLDS_all_network_features.csv',index=False)

# Adding Metadata

In [87]:
# Load the std-prices table (gzip-compressed CSV).
# NOTE(review): the file is read from the user's ~/Downloads folder, so this
# cell only runs on a machine where that file has been placed there.
std_pricing = pd.read_csv('~/Downloads/physician_network_curvature_hsa_stdprices_ffs_wide.csv.gz')
std_pricing.head()

Unnamed: 0,hsa,year,ar_pmt_dme,ar_pmt_hha,ar_pmt_hospice,ar_pmt_hospitalsnf,ar_pmt_outpt,ar_pmt_physician,ar_pmt_total,ar_stdprice_dme,...,ptl_pmt_outpt,ptl_pmt_physician,ptl_pmt_total,ptl_stdprice_dme,ptl_stdprice_hha,ptl_stdprice_hospice,ptl_stdprice_hospitalsnf,ptl_stdprice_outpt,ptl_stdprice_physician,ptl_stdprice_total
0,1001,2011,362.54,872.74,567.81,4372.02,900.36,2941.67,9982.6,358.98,...,5.5,85.6,69.8,93.7,85.8,90.8,73.5,11.9,88.0,82.8
1,1001,2012,323.59,781.49,604.79,3936.28,1053.18,3046.06,9719.56,318.44,...,8.0,87.1,64.6,84.2,84.5,91.0,63.1,16.4,90.8,80.3
2,1001,2013,235.42,749.33,536.04,3828.31,1089.98,2965.23,9375.26,241.04,...,4.2,86.5,59.0,70.3,84.2,87.4,64.5,10.8,90.1,77.3
3,1001,2014,195.77,666.51,601.8,3750.91,1154.1,2957.52,9292.71,212.55,...,3.4,86.0,54.9,66.2,81.3,93.0,62.5,9.5,88.7,74.1
4,1001,2015,187.0,660.63,641.12,3883.66,1188.22,3120.97,9649.21,206.05,...,3.4,87.5,58.8,55.6,81.6,95.2,75.9,10.4,91.2,82.4


In [88]:
# Load the HEDIS table (gzip-compressed CSV).
# NOTE(review): the file is read from the user's ~/Downloads folder, so this
# cell only runs on a machine where that file has been placed there.
hedis = pd.read_csv('~/Downloads/physician_network_curvature_hsa_hedis_6575ffs_wide.csv.gz')
hedis.head()

Unnamed: 0,hsa,year,obd_overall_diab_bloodlip,obd_nonblk_diab_bloodlip,obd_blk_diab_bloodlip,obd_blk_diab_eye,obd_overall_diab_eye,obd_nonblk_diab_eye,obd_overall_diab_hgb,obd_blk_diab_hgb,...,ptl_nonblk_diab_eye,ptl_overall_diab_hgb,ptl_blk_diab_hgb,ptl_nonblk_diab_hgb,ptl_overall_mam_mam6769,ptl_nonblk_mam_mam6769,ptl_blk_mam_mam6769,ptl_nonblk_ptbjune_amcare2,ptl_blk_ptbjune_amcare2,ptl_overall_ptbjune_amcare2
0,1001,2008,764.0,713.0,51.0,50.0,643.0,593.0,823.0,60.0,...,0.434,0.658,0.508,0.697,0.598,0.56,0.786,0.677,0.595,0.65
1,1001,2009,790.0,729.0,61.0,55.0,671.0,616.0,817.0,66.0,...,0.613,0.544,0.473,0.566,0.555,0.547,0.424,0.73,0.628,0.706
2,1001,2010,834.0,765.0,69.0,54.0,679.0,625.0,857.0,73.0,...,0.496,0.454,0.325,0.48,0.612,0.607,0.361,0.708,0.693,0.694
3,1001,2011,882.0,809.0,73.0,61.0,720.0,659.0,908.0,73.0,...,0.588,0.619,0.365,0.66,0.559,0.561,0.218,0.742,0.683,0.739
4,1001,2012,887.0,827.0,60.0,47.0,731.0,684.0,910.0,64.0,...,0.717,0.728,0.787,0.721,0.634,0.614,0.608,0.636,0.369,0.611


In [91]:
# Load the postdis table (gzip-compressed CSV).
# NOTE(review): the file is read from the user's ~/Downloads folder, so this
# cell only runs on a machine where that file has been placed there.
postdis = pd.read_csv('~/Downloads/physician_network_curvature_hsa_postdis_6599ffs_wide.csv.gz')
postdis.head()

Unnamed: 0,hsa,year,obd_pt_asr_ami_anyac,obd_pt_asr_ami_aoervis,obd_pt_asr_ami_mdac,obd_pt_asr_ami_readmit,obd_pt_asr_chf_anyac,obd_pt_asr_chf_aoervis,obd_pt_asr_chf_mdac,obd_pt_asr_chf_readmit,...,ptl_pt_asr_med_aoervis,ptl_pt_asr_med_mdac,ptl_pt_asr_med_readmit,ptl_pt_asr_pn_anyac,ptl_pt_asr_pn_aoervis,ptl_pt_asr_pn_mdac,ptl_pt_asr_pn_readmit,ptl_pt_asr_sur_aoervis,ptl_pt_asr_sur_mdac,ptl_pt_asr_sur_readmit
0,1001,2009,,,,,28.0,,,,...,0.408,0.51,0.149,0.103,,0.269,,0.917,0.515,0.897
1,1001,2010,,,,,36.0,,,,...,0.435,0.305,0.296,0.772,,0.81,,0.405,0.35,0.419
2,1001,2011,22.0,,,,21.0,,,,...,0.353,0.366,0.291,0.225,,0.421,,0.859,0.161,0.812
3,1001,2012,,,,,25.0,,,,...,0.085,0.382,0.252,0.117,,,,0.756,0.169,0.832
4,1001,2013,,,,,42.0,,31.0,,...,0.147,0.338,0.404,0.766,,0.773,,0.449,0.103,0.756


In [103]:
# Load the census table and expose its 'hsanum' column under the name 'hsa',
# the key used by the merges on ['hsa', 'year'] further down.
# The new 'hsa' column is appended as the last column before 'hsanum' is dropped.
census = (
    pd.read_csv('~/Downloads/census_data.csv.gz')
    .assign(hsa=lambda frame: frame["hsanum"])
    .drop(columns=["hsanum"])
)
census.head()

Unnamed: 0,year,race_total_pop,race_black_pop,hispanic_total_pop,hispanic_pop,median_hh_income,employment_pop,employment_unemployed_pop,education_pop,education_nohs_pop,hsa
0,2014,8870,3798,8870,3585,707713.0,55371,4504,75280,12481,1001
1,2015,111839,15326,111839,9512,625475.0,54125,3888,73975,12092,1001
2,2016,114166,15320,114166,8913,691244.0,89256,3405,75983,11698,1001
3,2017,113846,15725,113846,8875,665537.0,53481,2985,75356,11063,1001
4,2014,806,571,806,130,33497.0,8786,926,14443,3529,1002


In [104]:
# Combine the four metadata tables on (hsa, year). Inner joins keep only the
# hsa/year pairs that are present in every one of the tables.
meta = (
    std_pricing
    .merge(hedis, on=['hsa', 'year'], how='inner')
    .merge(postdis, on=['hsa', 'year'], how='inner')
    .merge(census, on=['hsa', 'year'], how='inner')
)
meta.head()

Unnamed: 0,hsa,year,ar_pmt_dme,ar_pmt_hha,ar_pmt_hospice,ar_pmt_hospitalsnf,ar_pmt_outpt,ar_pmt_physician,ar_pmt_total,ar_stdprice_dme,...,ptl_pt_asr_sur_readmit,race_total_pop,race_black_pop,hispanic_total_pop,hispanic_pop,median_hh_income,employment_pop,employment_unemployed_pop,education_pop,education_nohs_pop
0,1001,2014,195.77,666.51,601.8,3750.91,1154.1,2957.52,9292.71,212.55,...,0.746,8870,3798,8870,3585,707713.0,55371,4504,75280,12481
1,1001,2015,187.0,660.63,641.12,3883.66,1188.22,3120.97,9649.21,206.05,...,0.945,111839,15326,111839,9512,625475.0,54125,3888,73975,12092
2,1001,2016,177.9,710.86,709.31,4099.82,1260.06,3370.97,10291.57,182.45,...,0.927,114166,15320,114166,8913,691244.0,89256,3405,75983,11698
3,1001,2017,214.62,690.54,639.84,4173.35,1475.76,3482.0,10659.04,209.84,...,0.923,113846,15725,113846,8875,665537.0,53481,2985,75356,11063
4,1002,2014,323.51,596.86,518.81,3078.68,2060.83,2519.96,9092.06,294.78,...,,806,571,806,130,33497.0,8786,926,14443,3529


In [107]:
# Attach the metadata to the network features. The left join keeps every
# network row even when no metadata exists for its hsa/year (those columns
# become NaN). Rows are then ordered by year and hsa with a fresh 0..n-1 index.
main_table = (
    df
    .merge(meta, on=['hsa', 'year'], how='left')
    .sort_values(by=['year', 'hsa'], ignore_index=True)
)
main_table

Unnamed: 0,hsa,year,assortativity,average_clustering,density,nnodes,nedges,forman_mean,forman_stdev,forman_median,...,ptl_pt_asr_sur_readmit,race_total_pop,race_black_pop,hispanic_total_pop,hispanic_pop,median_hh_income,employment_pop,employment_unemployed_pop,education_pop,education_nohs_pop
0,1001,2014,-0.429306,0.757948,0.308016,98,1464,-2.997268,29.105001,-2.0,...,0.746,8870,3798,8870,3585,707713.0,55371,4504,75280,12481
1,1002,2014,-0.276127,0.853843,0.620120,37,413,13.883777,12.185454,16.0,...,,806,571,806,130,33497.0,8786,926,14443,3529
2,1003,2014,-0.350290,0.708825,0.476667,25,143,3.727273,7.111678,4.0,...,0.026,2129,828,2129,320,269485.0,11093,1103,17478,3138
3,1004,2014,-0.410506,0.802295,0.356764,117,2421,4.125568,36.906872,10.0,...,0.546,7236,3430,7236,2019,538347.0,46212,6074,69954,13880
4,1006,2014,-0.458333,0.595833,0.500000,8,14,2.214286,2.540488,3.0,...,,1217,397,1217,241,103134.0,5044,576,8356,1995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13488,53021,2017,-0.232525,0.840009,0.606531,50,743,21.923284,15.779803,25.0,...,0.039,29043,299,29043,1259,400825.0,14658,470,20385,968
13489,53023,2017,-0.283019,0.952381,0.944444,9,34,7.558824,1.479093,7.0,...,,4702,0,4702,92,94219.0,2357,97,3574,270
13490,53024,2017,,1.000000,1.000000,7,21,7.000000,0.000000,7.0,...,,12861,9,12861,1358,213259.0,6265,409,9083,800
13491,53025,2017,,1.000000,1.000000,10,45,10.000000,0.000000,10.0,...,,8847,3,8847,678,189767.0,4485,110,6399,479


In [108]:
# Save the combined network + metadata table; index=False keeps the DataFrame index out of the CSV.
main_table.to_csv('../outputs/IPLDS_all_network_features_with_meta.csv',index=False)

In [109]:
# List the distinct years present in the combined table.
main_table.year.unique()

array([2014, 2015, 2016, 2017])

In [110]:
# One sub-table per year, selected with a boolean mask on the 'year' column.
# Each slice keeps the row labels it had in main_table.
main2014, main2015, main2016, main2017 = (
    main_table[main_table["year"] == yr] for yr in (2014, 2015, 2016, 2017)
)