# ComScore data: Explore demographics

In [1]:
!ls ../data/comscore/2017

2017transactions.csv  codebook.pdf  demographics.csv


In [2]:
%pip install census us matplotlib pyzipcode joblib

Note: you may need to restart the kernel to use updated packages.


In [12]:
import os

import numpy as np
import pandas as pd
import us
from census import Census
# Census API client used by every ACS request in this notebook.
# The key is read from the CENSUS_API_KEY environment variable so that the
# credential is not committed with the notebook; export it before starting
# Jupyter. NOTE(review): the key that was previously hard-coded here has been
# published with the notebook and should be rotated.
c = Census(os.environ["CENSUS_API_KEY"])

%matplotlib inline

In [9]:
def get_state_fips(zipcode, err=False):
    """returns a state FIPS code for a given zip.

    if it can't find the zip code, or can't match the zip's state abbreviation
    to a state, return np.nan if err=False.

    parameters
    ----------
    zipcode: string. zip code to look up in the pyzipcode database.
    err: boolean. True if function should raise a ValueError if it can't find a zip code. False by default.

    returns
    -------
    The `fips` attribute of the matching `us` state object, or np.nan when
    nothing matched and err=False.

    raises
    ------
    ValueError: if err=True and the zip code or its state can't be found.
    """
    from pyzipcode import ZipCodeDatabase
    zcdb = ZipCodeDatabase()
    if zipcode not in zcdb:
        if err:
            raise ValueError("Zip code not found")
        return np.nan
    state = us.states.lookup(zcdb[zipcode].state)
    if state is None:
        # lookup() returns None when it does not recognise the abbreviation;
        # treat that the same way as an unknown zip instead of failing with an
        # AttributeError on `state.fips`.
        if err:
            raise ValueError("No state found for zip code")
        return np.nan
    return state.fips

In [10]:
# Load the ComScore demographics, normalise zip codes and attach a state FIPS
# code to every row; rows without a usable zip/state are dropped.
demo_df = (pd.read_csv("../data/comscore/2017/demographics.csv", dtype={'zip_code': str})
           .dropna(subset=['zip_code'])
           # the value 99 in hoh_most_education is recoded as missing
           .replace({'hoh_most_education': {99: np.nan}})
           # make sure zips are zip5: left-pad with as many zeros as are missing
           # (a single '0' prefix would leave 3-digit zips malformed)
           .assign(zip_code=lambda x: x.zip_code.str.zfill(5))
           # get state fips code for each zip; each distinct zip is looked up
           # once and the result is mapped back onto the rows
           .assign(state_fips=lambda x: x.zip_code.map(
               {z: get_state_fips(z) for z in x.zip_code.unique()}))
           # drops rows whose zip could not be matched (~600 rows in the original run)
           .dropna(subset=['state_fips'])
     )

In [11]:
# Preview the first five rows of the cleaned demographics table.
demo_df.head(n=5)

Unnamed: 0,machine_id,hoh_most_education,census_region,household_size,hoh_oldest_age,household_income,children,racial_background,connection_speed,country_of_origin,zip_code,state_fips
0,46569906,,4,5,8,16,0,1,1,0,98611,53
1,70298839,3.0,4,3,8,14,0,1,1,0,80910,8
2,76984170,4.0,1,3,11,16,1,1,1,0,7076,34
3,76991725,3.0,2,1,9,16,0,1,1,0,62554,17
5,81191519,1.0,4,3,7,12,0,1,1,1,93436,6


## Retrieving census data for zip codes

Let's test getting relevant census data. I went [here](https://api.census.gov/data/2017/acs/acs5/profile/variables.html) and looked up relevant variables and just hand-coded them below. I added a mapping to the relevant comscore coding.

TODO: 

- [ ] Add additional codes for other variables

In [13]:
# Hand-coded lookup table linking each ACS data-profile variable to the
# ComScore column and code it should be compared against.
# One row per ACS variable: (census_code, comscore_code, description, comscore_column).
census_mapping = pd.DataFrame.from_records(
    [
        # education level.
        ('DP02_0064PE', 4, 'bachelors degree', 'hoh_most_education'),
        ('DP02_0061PE', 1, 'high school graduate', 'hoh_most_education'),
        ('DP02_0065PE', 5, 'graduate degree', 'hoh_most_education'),
        ('DP02_0062PE', 2, 'some college, no degree', 'hoh_most_education'),
        ('DP02_0063PE', 3, 'associates degree', 'hoh_most_education'),
        # two rows for comscore-coded zero. Should sum for comparison.
        ('DP02_0060PE', 0, '9-12 grade no diploma', 'hoh_most_education'),
        ('DP02_0059PE', 0, 'less than 9th grade', 'hoh_most_education'),
        # race / ethnicity.
        ('DP05_0077PE', 1, 'non-hispanic white', 'racial_background'),
        ('DP05_0078PE', 2, 'non-hispanic black', 'racial_background'),
        ('DP05_0080PE', 3, 'non-hispanic asian', 'racial_background'),
    ],
    columns=['census_code', 'comscore_code', 'description', 'comscore_column'],
)

In [14]:
def get_census_data_for_zip(zipcode, var_names, year=2017):
    """retrieves census variables for a given zipcode and year from the ACS

    parameters
    ----------
    zipcode: string. zip code to search for
    var_names: list. list of variable names from the ACS.
    year: int. ACS year to query. 2017 by default.

    returns
    -------
    A dict with a key for each var in var_names where the value
    is the corresponding value for the zipcode, plus a key for `state` with
    the corresponding FIPS code and a key for `zip code tabulation area`
    that parrots the zip.

    Returns None (after printing a message for request failures) if the zip
    can't be mapped to a state, the request raises, or no rows come back.

    example
    -------
    >>> get_census_data_for_zip('02130', ['DP05_0080PE'])
    """
    state_fips = get_state_fips(zipcode)
    # get_state_fips signals "not found" with np.nan, and nan is truthy, so a
    # plain truthiness test would let it through to the API call. Check for a
    # missing value explicitly.
    if not state_fips or pd.isna(state_fips):
        return None
    try:
        results = c.acs5dp.state_zipcode(
            var_names,
            state_fips,
            zipcode,
            year=year)
    except Exception as e:
        # Best effort: one failing zip should not abort a bulk pull.
        print("Couldn't retrieve census data for zip {}: {}".format(zipcode, e))
        return None
    if not results:
        print("Couldn't retrieve census data for zip {}: {}".format(zipcode, "no rows returned"))
        return None
    return results[0]

# Smoke-test the ACS request using the zip and state of the first row.
r = demo_df.iloc[0]
# Distinct zip codes to query. Sorted so that the order (and therefore the row
# order of anything built from this list) is the same on every run; iterating
# a set of strings is not stable across Python processes.
zips = sorted(set(demo_df.zip_code))
zip_demos = c.acs5dp.state_zipcode(list(census_mapping.census_code.values), r.state_fips, r.zip_code, year=2017)

In [16]:
# ACS variable names to request, taken from the hand-coded mapping table.
census_vars = census_mapping['census_code'].tolist()

Fetching the census data one zip code at a time takes a while, so run the requests in parallel (way faster).

In [18]:
from joblib import Parallel, delayed
N_CORES = 24

%time zip_census_data = Parallel(n_jobs=N_CORES)(delayed(get_census_data_for_zip)(zip, census_vars) for zip in zips)
census_df = pd.DataFrame([z for z in zip_census_data if z is not None])

%mkdir ../output
census_df.to_csv("../output/zip_census.csv")

KeyboardInterrupt: 

NameError: name 'zip_census_data' is not defined

In [21]:
# Reload the census pull written to disk earlier.
# The identifier columns (named as in get_census_data_for_zip's docstring) are
# read as strings so that leading zeros such as zip "02130" or state "01" are
# not lost by integer parsing.
census_df = pd.read_csv(
    "../output/zip_census.csv",
    index_col=0,
    dtype={'state': str, 'zip code tabulation area': str},
)

## Create corresponding state-level aggregates from comscore demographics

In [23]:
# Show the first five rows of the demographics table again before aggregating it.
demo_df.head(n=5)

Unnamed: 0,machine_id,hoh_most_education,census_region,household_size,hoh_oldest_age,household_income,children,racial_background,connection_speed,country_of_origin,zip_code,state_fips
0,46569906,,4,5,8,16,0,1,1,0,98611,53
1,70298839,3.0,4,3,8,14,0,1,1,0,80910,8
2,76984170,4.0,1,3,11,16,1,1,1,0,7076,34
3,76991725,3.0,2,1,9,16,0,1,1,0,62554,17
5,81191519,1.0,4,3,7,12,0,1,1,1,93436,6


In [102]:
# Reshape the two demographic columns to long form (one row per
# state_fips / variable / value), then count the rows for each combination.
state_df = (
    demo_df
    .melt(
        id_vars=['state_fips'],
        value_vars=['hoh_most_education', 'racial_background'],
        var_name='var_name',
        value_name='val',
    )
    .groupby(['state_fips', 'var_name', 'val'])['val']
    .agg(['count'])
)
state_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
state_fips,var_name,val,Unnamed: 3_level_1
01,hoh_most_education,1.0,25
01,hoh_most_education,2.0,447
01,hoh_most_education,3.0,304
01,hoh_most_education,4.0,192
01,hoh_most_education,5.0,32
...,...,...,...
56,hoh_most_education,5.0,3
56,racial_background,1.0,102
56,racial_background,2.0,11
56,racial_background,3.0,3


In [111]:
# Total number of counted rows per state and variable (summing over values).
# The aggregation is named by the string 'sum': passing the numpy callable
# np.sum to agg is deprecated in recent pandas releases.
(state_df
 .groupby(['state_fips', 'var_name'])
 .agg({'count': 'sum'}))

Unnamed: 0_level_0,Unnamed: 1_level_0,count
state_fips,var_name,Unnamed: 2_level_1
01,hoh_most_education,1000
01,racial_background,1525
02,hoh_most_education,98
02,racial_background,167
04,hoh_most_education,901
...,...,...
54,racial_background,599
55,hoh_most_education,970
55,racial_background,1451
56,hoh_most_education,86


In [116]:
# For every (state, variable, value) row add:
#   total - the summed count over all values of that variable in that state
#   pct   - this value's share of that total, as a fraction (count / total)
# transform('sum') broadcasts each group total back onto the group's rows, so
# no self-merge / column renaming is needed and the row index is unchanged.
state_vars = state_df.assign(
    total=lambda x: x.groupby(['state_fips', 'var_name'])['count'].transform('sum'),
    pct=lambda x: x['count'] / x['total'],
)

In [117]:
# Write the per-state distributions (count, total, pct) to disk.
state_vars.to_csv(path_or_buf='../output/state_census_distributions.csv')