In [0]:
import pandas as pd
import requests
from pandas.io.json import json_normalize

In [0]:
# Grab NYC Childhood Blood Lead Testing and Elevated Incidence by Zip Code Data
url1 = 'https://health.data.ny.gov/resource/rzpj-maap.json'
url2 = 'https://data.cityofnewyork.us/resource/kku6-nxdu.json'
r1 = requests.get(url1)
r2 = requests.get(url2)
df1 = json_normalize(r1.json())
df2 = pd.DataFrame(r2.json()[1:], columns=r2.json()[0])
test = pd.read_csv('https://docs.google.com/spreadsheets/d/' + 
                   '1V-yf8UW3Ui189qzhiT4t8MYNEkNHtd9W8S_sN95N4Lo' +
                   '/export?gid=1060511533&format=csv'
                  )
# df = pd.read_csv(BytesIO(data), index_col=0)
# print(df2.jurisdiction_name.head())

In [0]:
#Let's see what our NYC Childhood Blood Lead data looks like: 
print(len(df1.index))
df1.head(3)

1000


Unnamed: 0,_10to15,_15,_5_10_mcg_dl,county,county_location.coordinates,county_location.type,fips,less_than_5_mcg_dl,percent,rate_per_1_000,tests,total_eblls,year,zip,zip_code_location.coordinates,zip_code_location.type
0,,,,Albany,"[-73.9740136, 42.5882713]",Point,1,,,,,,2000,11220,"[-74.01819492, 40.64057676]",Point
1,,,,Albany,"[-73.9740136, 42.5882713]",Point,1,,,,,,2001,11220,"[-74.01819492, 40.64057676]",Point
2,,,,Albany,"[-73.9740136, 42.5882713]",Point,1,,,,,,2005,11510,"[-73.60464708, 40.65343524]",Point


In [0]:
# Lets peep into our NYC Zipcode Data
print(df2.columns)
df2.head(3)

Unnamed: 0,count_american_indian,count_asian_non_hispanic,count_black_non_hispanic,count_citizen_status_total,count_citizen_status_unknown,count_ethnicity_total,count_ethnicity_unknown,count_female,count_gender_total,count_gender_unknown,...,percent_nreceives_public_assistance,percent_other_citizen_status,percent_other_ethnicity,percent_pacific_islander,percent_permanent_resident_alien,percent_public_assistance_total,percent_public_assistance_unknown,percent_receives_public_assistance,percent_us_citizen,percent_white_non_hispanic
0,0,28,0,35,0,35,0,19,35,0,...,0.94,0,0,0,0.06,100,0,0.06,0.94,0.17
1,0,1,0,1,0,1,0,1,1,0,...,1.0,0,0,0,0.0,100,0,0.0,1.0,0.0
2,0,0,0,0,0,0,0,0,0,0,...,0.0,0,0,0,0.0,0,0,0.0,0.0,0.0


In [0]:
test.head(3)

Unnamed: 0,State,County,Population,Number of children tested,Percentage of children with levels higher than 5 ug/dl,Percentage of children with levels higher than 10 ug/dl,Percentage of population under the poverty line
0,AL,Autauga County,55136,29,Unknown,0.0,12.8
1,AL,Baldwin County,191205,357,1.70%,0.6,13.8
2,AL,Barbour County,27119,23,Unknown,4.3,24.1


According to [Medline Plus](https://medlineplus.gov/ency/article/003360.htm), an abnormal level of blood level for children is a blood lead level of 5 µg/dL or greater. Anything greater than this requires further testing and monitoring and the source of lead must be found and removed. In the dataset, we are most interested in the levels of lead above Blood lead level of 5 µg/dL. 
Let's rename some columns and drop columns and rows that are not useful to us.

In [0]:
# Clean Childhood Blood Lead Levels Data

'''Drop rows that do not have any measure of blood lead levels'''
df1.drop(
    df1[
        (df1["less_than_5_mcg_dl"].isnull()) &
        (df1["_5_10_mcg_dl"].isnull()) &
        (df1["_10to15"].isnull()) & 
        (df1["_15"].isnull())
    ].index, 
    inplace=True
)
'''Drop columns that do not add value to what we are measuring (coordinates)'''
df1.drop(
    columns=["county_location.coordinates",
        "county_location.type","zip_code_location.coordinates",
        "zip_code_location.type"], 
    inplace=True
)
'''Rename columns for readability'''
df1 = df1.rename(
    columns={
        "_10to15": "10to15mcg_dl",
        "_5_10_mcg_dl":"5to10mcg_dl",
        "_15": "greater_than_15_mcg_dl"
    })
df1.index = range(len(df1.index))
df1

Unnamed: 0,10to15mcg_dl,greater_than_15_mcg_dl,5to10mcg_dl,county,fips,less_than_5_mcg_dl,percent,rate_per_1_000,tests,total_eblls,year,zip
0,,,9,Albany,1,38,2.08,20.8,48,,2000,12143
1,,,6,Albany,1,31,,,37,,2001,12143
2,,,6,Albany,1,16,4.35,43.5,23,,2004,12143
3,,,7,Albany,1,43,,,50,,2005,12143
4,,,6,Albany,1,35,2.38,23.8,42,,2006,12143
5,,,7,Albany,1,21,,,28,,2007,12193
6,,,9,Albany,1,33,,,42,,2002,12158
7,,,10,Albany,1,46,1.75,17.5,57,,2004,12158
8,,,10,Albany,1,51,,,61,,2005,12158
9,,,14,Albany,1,66,2.44,24.4,82,,2006,12158


In [0]:
df1['county'].value_counts()

Albany      195
Allegany     28
Name: county, dtype: int64

In [0]:
from vega_datasets import data
imUsingColab = True

if imUsingColab:
    !pip install altair

try:
    import altair as alt
    if imUsingColab:
        alt.renderers.enable('colab')
    else:
        alt.renderers.enable('notebook')    
    imUsingAltair = True
    print('Altair successfully loaded.')

except ModuleNotFoundError:
    imUsingAltair = False
    print('Altair loading failed. Will default to matplotlib.')

Altair successfully loaded.


In [0]:
def map_data(census_df, variable):
    
    '''
    This function creates an interactive cloropleth map using a census variable. 
    '''    
    
    val = pd.read_csv('https://grantmlong.com/data/census_validation.csv', names=['id'])
    census_df['id'] = val
#     cf = val.merge(
#         census_df.loc[
#             census_df[variable]>-1000000,
#             [variable, 'County']],
#         left_on='id',
#         right_on='GEOID',
#         how='left'
#     )
    cf = census_df
    print(cf.head(3))
#     print(cf.County.head(3))
    counties = alt.topo_feature(
        'https://vega.github.io/vega-datasets/data/us-10m.json', 
        'counties'
    )

    census = alt.LookupData(cf, 'id', [variable, 'County'])

    census_map = alt.Chart(
        counties
    ).mark_geoshape(
    ).encode(
        color='{variable}:Q'.format(variable=variable),
        tooltip=[
            '{variable}:Q'.format(variable=variable),
            'County Name:N',
        ]
    ).transform_lookup(
        lookup='id',
        from_= census
    ).project(
        type='albersUsa'
    ).properties(
        width=850,
        height=500
    )
    
    return census_map


In [0]:
map_data(test, 'Percentage of children with levels higher than 10 ug/dl')

  State          County  Population Number of children tested  \
0    AL  Autauga County       55136                        29   
1    AL  Baldwin County      191205                       357   
2    AL  Barbour County       27119                        23   

  Percentage of children with levels higher than 5 ug/dl  \
0                                            Unknown       
1                                              1.70%       
2                                            Unknown       

  Percentage of children with levels higher than 10 ug/dl  \
0                                                  0        
1                                                0.6        
2                                                4.3        

   Percentage of population under the poverty line    id  
0                                             12.8  1001  
1                                             13.8  1003  
2                                             24.1  1005  
