In [1]:
import io
import multiprocessing as mul
import os
from pathlib import Path
import requests
import zipfile

from IPython.core.interactiveshell import InteractiveShell
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly import tools
import plotly.figure_factory as figf
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import synapseclient
from synapseclient import Activity, Project, Folder, File, Table, Schema, as_table_columns

# Local working directory: a hidden `.gscap` folder inside the user's home directory.
data_dir = str(Path.home() / '.gscap')
if not os.path.exists(data_dir):
    os.mkdir(data_dir)

# Open a Synapse session; login() is called without explicit credentials.
syn = synapseclient.Synapse()
syn.login()

# Notebook display setup: Plotly in connected mode, and IPython configured to
# echo every top-level expression of a cell rather than only the last one.
init_notebook_mode(connected=True)
InteractiveShell.ast_node_interactivity = 'all'

def isnum(x):
    """Return True when ``x`` can be converted with ``float()``, else False.

    Parameters
    ----------
    x : object
        Any value: numbers, numeric strings, ``None``, containers, ...

    Returns
    -------
    bool
        True if ``float(x)`` succeeds, False if it raises ``ValueError``
        (e.g. ``'abc'``) or ``TypeError`` (e.g. ``None`` or a list).
    """
    try:
        float(x)
    except (TypeError, ValueError):
        # float() signals unparseable text with ValueError, but unsupported
        # types such as None or list with TypeError; both mean "not a number".
        return False
    return True

def isstr(x):
    """Report whether ``x`` survives conversion with ``str()``.

    Returns True if ``str(x)`` completes, and False if the conversion raises
    ``ValueError``. Note that this is not an ``isinstance(x, str)`` check.
    """
    try:
        str(x)
    except ValueError:
        return False
    else:
        return True

Welcome, Luke Waninger!



In [2]:
# Geographic reference table; its rows are tagged by `summary_level`.
fips = pd.read_csv(syn.get('syn16975424').path)

# --- states: name / abbreviation pairs joined to their FIPS code ---
state_abbr_syn = syn.get('syn16816613')
states = pd.read_csv(state_abbr_syn.path)
states.columns = ['state', 'stabbr']

state_codes = fips.loc[fips.summary_level == 'state', ['name', 'state_fips']]
states = states.merge(state_codes, how='left', left_on='state', right_on='name')
states = states.drop(columns='name')
states = states.rename(columns={'state_fips': 'fips'})

# --- counties: county-level rows joined to their state's name/abbreviation ---
county_rows = fips.loc[fips.summary_level == 'county']
counties = county_rows.merge(states, left_on='state_fips', right_on='fips')
counties = counties.drop(
    columns=['county_subdivision_fips', 'place_fips', 'city_fips', 'summary_level', 'fips']
)

# Build the combined code: state part padded to 2 characters, county part to 3.
counties['fips'] = [
    str(state_code).zfill(2) + str(county_code).zfill(3)
    for state_code, county_code in zip(counties.state_fips, counties.county_fips)
]
counties = counties.drop(columns=['state_fips', 'county_fips'])

In [3]:
# Fetch the rankings workbook from Synapse and load the four sheets used below;
# `hr` maps each sheet name to its DataFrame.
hr_syn = syn.get('syn16971220')
sheet_names = [
    'Outcomes & Factors Rankings',
    'Outcomes & Factors SubRankings',
    'Ranked Measure Data',
    'Additional Measure Data',
]
hr = pd.read_excel(hr_syn.path, sheet_name=sheet_names)

# Move each sheet's index into an ordinary leading column, in place.
for sheet_name in sheet_names:
    hr[sheet_name].reset_index(inplace=True)

### Outcomes & Factors Rankings

In [None]:
# Names applied positionally to the 'Outcomes & Factors Rankings' sheet.
cols = [
    'fips', 'state', 'county', 'num_of_ranked_counties',
    'health_outcomes_rank', 'health_outcomes_quartile',
    'health_factors_rank', 'health_factors_quartile'
]
rankings_sheet = hr['Outcomes & Factors Rankings']
rankings_sheet.columns = cols

# Keep rows labelled 1 and up, and drop the sheet's own state/county labels
# before joining onto `counties`.
rankings_keep = [c for c in cols if c not in ('state', 'county')]
hr['Outcomes & Factors Rankings'] = rankings_sheet.loc[1:, rankings_keep]

county_rankings = counties.merge(hr['Outcomes & Factors Rankings'], how='left', on='fips')

In [None]:
# Upload the summarized rankings as a Synapse table under parent 'syn16816579'.
rankings_schema = Schema(
    name='County Health Rankings (Summarized)',
    columns=as_table_columns(county_rankings),
    parent='syn16816579'
)
stored_rankings = syn.store(Table(rankings_schema, county_rankings))

# Attach a provenance record that points back to the public data source.
outcomes_and_factors_syn = syn.setProvenance(
    stored_rankings,
    activity=Activity(
        name='County Health Rankings',
        description='Overall inner-state county health rankings.',
        used=[
            {
                'name': 'County Health Rankings and Roadmaps',
                'url': 'http://www.countyhealthrankings.org/explore-health-rankings/rankings-data-documentation'
            }
        ]
    )
)

### Outcomes & Factors SubRankings

In [None]:
# Names applied positionally to the 'Outcomes & Factors SubRankings' sheet:
# identifier columns first, then a rank/quartile pair per sub-category.
cols = ['fips', 'state', 'county', 'num_of_ranked_counties']
cols += ['length_of_life_rank', 'length_of_life_quartile']
cols += ['quality_of_life_rank', 'quality_of_life_quartile']
cols += ['health_behaviors_rank', 'health_behaviors_quartile']
cols += ['clinical_care_rank', 'clinical_care_quartile']
cols += ['social_and_economic_factors_rank', 'social_and_economic_factors_quartile']
cols += ['physical_environment_rank', 'physical_environment_quartile']

subrankings_sheet = hr['Outcomes & Factors SubRankings']
subrankings_sheet.columns = cols

# Keep rows labelled 1 and up, and drop the sheet's own state/county labels
# before joining onto `counties`.
subrankings_keep = [c for c in cols if c not in ('state', 'county')]
hr['Outcomes & Factors SubRankings'] = subrankings_sheet.loc[1:, subrankings_keep]

county_rankings_sub = counties.merge(hr['Outcomes & Factors SubRankings'], how='left', on='fips')

In [None]:
# Upload the sub-measure rankings as a Synapse table under parent 'syn16816579'.
subrankings_schema = Schema(
    name='County Health Rankings (SubMeasures)',
    columns=as_table_columns(county_rankings_sub),
    parent='syn16816579'
)
stored_subrankings = syn.store(Table(subrankings_schema, county_rankings_sub))

# Attach a provenance record that points back to the public data source.
subrankings = syn.setProvenance(
    stored_subrankings,
    activity=Activity(
        name='Parse Into Synapse Table',
        description='Extract Excel sheets from original data source into Synapse table.',
        used=[
            {
                'name': 'County Health Rankings and Roadmaps',
                'url': 'http://www.countyhealthrankings.org/explore-health-rankings/rankings-data-documentation'
            }
        ]
    )
)

### Ranked Measure Data

In [4]:
# Column names for the 'Ranked Measure Data' sheet, split into thematic lists.
# The concatenation `all_cols` is assigned to the sheet's columns positionally,
# so the order of every entry below must match the sheet's column order.
# Suffix conventions (presumably: `_cilow`/`_ciup` = confidence-interval bounds,
# `_quartile` = within-state quartile -- TODO confirm against the source sheet).

# Identifier columns.
base_cols = [
    'fips', 'state', 'county'
]
# Health-related measures.
# NOTE(review): 'alchohol' is misspelled in several names below; left as-is
# because these strings become output column names.
health_cols = [
    'premature_death_yopllr', 'premature_death_yopllr_cilow', 'premature_death_yopllr_ciup', 'premature_death_yopllr_quartile',
    'premature_death_yopllr_black', 'premature_death_yopllr_hispanic', 'premature_death_yopllr_white',
    'poor_or_fair_health', 'poor_or_fair_health_cilow', 'poor_or_fair_health_ciup', 'poor_or_fair_health_quartile',
    'physically_unhealthy_days', 'physically_unhealthy_days_cilow', 'physically_unhealthy_days_ciup', 'physically_unhealthy_days_quartile',
    'mentally_unhealthy_days', 'mentally_unhealthy_days_cilow', 'mentally_unhealthy_days_ciup', 'mentally_unhealthy_days_quartile',
    'low_birthweight_unreliable', 'low_birthweight', 'low_birthweight_cilow', 'low_birthweight_ciup', 'low_birthweight_quartile',
    'low_birthweight_black', 'low_birthweight_hispanic', 'low_birthweight_white',
    'adult_smokers', 'adult_smokers_cilow', 'adult_smokers_ciup', 'adult_smokers_quartile',
    'adult_obesity', 'adult_obesity_cilow', 'adult_obesity_ciup', 'adult_obesity_quartile',
    'food_environment_index', 'food_environment_index_quartile',
    'physically_inactive', 'physically_inactive_cilow', 'physically_inactive_ciup', 'physically_inactive_quartile',
    'access_to_exercise', 'access_to_exercise_quartile',
    'excessive_drinking', 'excessive_drinking_cilow', 'excessive_drinking_ciup', 'excessive_drinking_quartile',
    'num_alchohol_impaired_driving_deaths', 'num_driving_deaths', 'perc_alchohol_impaired', 'perc_alchohol_impaired_cilow', 'perc_alchohol_impaired_ciup', 'perc_alchohol_impaired_quartile',
    'num_chlamydia_cases', 'chlamydia_rate', 'chlamydia_quartile',
    'teen_birth_rate', 'teen_birth_rate_cilow', 'teen_birth_rate_ciup', 'teen_birth_rate_quartile', 
    'teen_birth_rate_black', 'teen_birth_rate_white', 'teen_birth_rate_hispanic',
    'num_uninsured', 'perc_uninsured', 'perc_uninsured_cilow', 'perc_uninsured_ciup', 'perc_uninsured_quartile',
    'num_primary_care_physicians', 'pcp_rate', 'pcp_ratio', 'pcp_quartile',
    'num_dentists', 'dentist_rate', 'dentist_ratio', 'dentist_quartile',
    'num_mental_health_providers', 'mhp_rate', 'mhp_ratio', 'mhp_quartile',
    'preventable_hospital_rate_num_medicare_enrollees', 'preventable_hospital_rate', 'preventable_hospital_rate_cilow', 'preventable_hospital_rate_ciup', 'preventable_hospital_rate_quartile',
    'num_diabetics', 'perc_of_diabetics_receiving_hba1c', 'perc_of_diabetics_receiving_hba1c_cilow', 'perc_of_diabetics_receiving_hba1c_ciup', 'perc_of_diabetics_receiving_hba1c_quartile',
    'perc_of_diabetics_receiving_hba1c_black', 'perc_of_diabetics_receiving_hba1c_white',
    'mammography_screening_num_medicare_enrollees', 'perc_mammography', 'perc_mammography_cilow', 'perc_mammography_ciup', 'perc_mammography_quartile',
    'perc_mammography_black', 'perc_mammography_white'
]
# Education measures (high-school graduation and some-college columns).
education_cols = [
    'high_school_grad_cohort_size', 'high_school_grad_rate', 'high_school_grad_quartile',
    'num_some_college', 'population', 'perc_some_college', 'perc_some_college_cilow', 'perc_some_college_ciup', 'perc_some_college_quartile',
]
# Social and economic measures. Despite the list's name it also carries the
# air-quality, drinking-water, housing and commuting columns at the end.
# NOTE(review): 'income_inquality_quartile' is misspelled (cf. 'income_inequality_ratio');
# left as-is because these strings become output column names.
social_factor_cols = [
    'num_unemployed', 'labor_force', 'perc_unemployed', 'perc_unemployed_quartile',
    'perc_children_in_poverty', 'perc_children_in_poverty_cilow', 'perc_children_in_poverty_ciup', 'perc_children_in_poverty_quartile',
    'perc_children_in_poverty_black', 'perc_children_in_poverty_hispanic', 'perc_children_in_poverty_white',
    '80th_percentile_income', '20th_percentile_income', 'income_inequality_ratio', 'income_inquality_quartile',
    'num_single_parent_households', 'num_households', 'perc_single_parent_households', 'perc_single_parent_households_cilow', 'perc_single_parent_households_ciup', 'perc_single_parent_households_quartile',
    'num_social_associations', 'social_association_rate', 'social_association_quartile',
    'num_violent_crimes', 'violent_crime_rate', 'violent_crime_quartile',
    'num_injury_deaths', 'injury_death_rate', 'injury_death_rate_cilow', 'injury_death_rate_ciup', 'injury_death_rate_quartile',
    'average_daily_pm2p5', 'average_daily_pm2p5_quartile',
    'presence_of_drinking_water_violation', 'presence_of_drinking_water_violation_quartile',
    'num_households_with_severe_housing_problems', 'perc_of_households_with_severe_housing_problems', 'perc_of_households_with_severe_housing_problems_cilow',
    'perc_of_households_with_severe_housing_problems_ciup', 'perc_of_households_with_severe_housing_problems_quartile',
    'perc_drive_alone_to_work', 'perc_drive_alone_to_work_cilow', 'perc_drive_alone_to_work_ciup', 'perc_drive_alone_to_work_quartile',
    'perc_drive_alone_to_work_black', 'perc_drive_alone_to_work_hispanic', 'perc_drive_alone_to_work_white',
    'num_of_workers_who_drive_alone', 'perc_of_long_commutes_alone', 'perc_of_long_commutes_alone_cilow', 'perc_of_long_commutes_alone_ciup', 'perc_of_long_commutes_alone_quartile'
]
# Full, ordered column list for the sheet: identifiers, then each thematic group.
all_cols = base_cols + health_cols + education_cols + social_factor_cols

# Apply the column names positionally to the 'Ranked Measure Data' sheet.
ranked_sheet = hr['Ranked Measure Data']
ranked_sheet.columns = all_cols

# 'name' does not occur in all_cols, so this filter only removes 'state';
# 'county' stays in the frame (the later `base_cols + ...` selections use it).
# NOTE(review): the sibling ranking cells exclude 'county' instead -- confirm
# the difference is intentional before "fixing" it.
ranked_keep = [c for c in all_cols if c not in ('state', 'name')]
hr['Ranked Measure Data'] = ranked_sheet.loc[1:, ranked_keep]

chr_measures = counties.merge(hr['Ranked Measure Data'], how='left', on='fips')

In [5]:
def fx(x):
    """Round a numeric value to two decimal places; return anything else unchanged.

    Parameters
    ----------
    x : object
        A single cell value (number, text marker, NaN, ...).

    Returns
    -------
    object
        ``np.round(x, 2)`` when ``x`` is something NumPy can round, otherwise
        ``x`` itself.
    """
    if not isnum(x):
        return x
    try:
        return np.round(x, 2)
    except TypeError:
        # isnum() also accepts numeric-looking text (e.g. '1.5'), which
        # float() can parse but np.round() cannot handle; pass such values
        # through untouched instead of aborting the whole column.
        return x
    
# Round every measure column with fx(); identifier/label columns are skipped.
for c in chr_measures.columns:
    if c not in ('name', 'state', 'stabbr', 'fips', 'county'):
        chr_measures[c] = chr_measures[c].apply(fx)

In [6]:
# Only the education subset is active; the health and social subsets are disabled.
#chr_health = chr_measures.loc[:, base_cols + health_cols]
education_table_cols = base_cols + education_cols
chr_education = chr_measures.loc[:, education_table_cols]
#chr_social = chr_measures.loc[:, base_cols + social_factor_cols]

# chr_health = syn.setProvenance(
#     syn.store(Table(
#         Schema(
#             name='County Health Rankings (Health Measures)', 
#             columns=as_table_columns(chr_health), parent='syn16816579'), chr_health)
#     ),
#     activity=Activity(
#         name='Parse Into Synapse Table',
#         description='Extract Excel sheets from original data source into Synapse table.',
#         used=[           
#             dict(
#                 name='County Health Rankings and Roadmaps',
#                 url='http://www.countyhealthrankings.org/explore-health-rankings/rankings-data-documentation'
#             )
#         ]
#     )
# )
# Upload the education measures as a Synapse table under parent 'syn16816579'.
education_schema = Schema(
    name='County Health Rankings (Education Measures)',
    columns=as_table_columns(chr_education),
    parent='syn16816579'
)
stored_education = syn.store(Table(education_schema, chr_education))

# Attach provenance; note that `chr_education` is rebound from the DataFrame
# to the value returned by setProvenance().
chr_education = syn.setProvenance(
    stored_education,
    activity=Activity(
        name='Parse Into Synapse Table',
        description='Extract Excel sheets from original data source into Synapse table.',
        used=[
            {
                'name': 'County Health Rankings and Roadmaps',
                'url': 'http://www.countyhealthrankings.org/explore-health-rankings/rankings-data-documentation'
            }
        ]
    )
)
# chr_social = syn.setProvenance(
#     syn.store(Table(
#         Schema(
#             name='County Health Rankings (Social Measures)', 
#             columns=as_table_columns(chr_social), parent='syn16816579'), chr_social)
#     ),
#     activity=Activity(
#         name='Parse Into Synapse Table',
#         description='Extract Excel sheets from original data source into Synapse table.',
#         used=[           
#             dict(
#                 name='County Health Rankings and Roadmaps',
#                 url='http://www.countyhealthrankings.org/explore-health-rankings/rankings-data-documentation'
#             )
#         ]
#     )
# )