# ComScore data: Explore demographics

In [4]:
# List the files in the ComScore 2017 data directory.
!ls ../data/comscore/2017

2017transactions.csv  codebook.pdf  demographics.csv


In [32]:
# Install the third-party packages this notebook uses into the kernel's environment.
# NOTE(review): versions are unpinned; the recorded output below shows pyzipcode 3.0.1 was installed.
%pip install census us matplotlib pyzipcode

Collecting pyzipcode
  Downloading pyzipcode-3.0.1.tar.gz (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 12.3 MB/s 
Building wheels for collected packages: pyzipcode
  Building wheel for pyzipcode (setup.py) ... [?25ldone
[?25h  Created wheel for pyzipcode: filename=pyzipcode-3.0.1-py3-none-any.whl size=1932413 sha256=1cb740c1301c86dded7286abdf3cebe93a0a73108af318513545d2525ca8459a
  Stored in directory: /home/dcalacci/.cache/pip/wheels/c8/68/c3/537050586cf6f259376d99370b519925524d59c8652c5aedad
Successfully built pyzipcode
Installing collected packages: pyzipcode
Successfully installed pyzipcode-3.0.1
Note: you may need to restart the kernel to use updated packages.


In [22]:
import os

import census
import numpy as np
import pandas as pd
import us
from census import Census
# Census API client shared by the cells below.
# The key is read from the CENSUS_API_KEY environment variable when it is set.
# NOTE(review): the literal fallback is kept only so existing runs keep working;
# an API key should not live in a shared notebook -- rotate it and drop the fallback.
c = Census(os.environ.get("CENSUS_API_KEY", "14a343e581b5dd1df551f4cd87f31aede50ec908"))

%matplotlib inline

In [117]:
_ZIPCODE_DB = None


def _zipcode_db():
    """Return one shared ZipCodeDatabase, created on first use."""
    global _ZIPCODE_DB
    if _ZIPCODE_DB is None:
        from pyzipcode import ZipCodeDatabase
        _ZIPCODE_DB = ZipCodeDatabase()
    return _ZIPCODE_DB


def get_state_fips(zipcode, err=False):
    """returns a state FIPS code for a given zip.

    The zip is resolved to a state abbreviation with pyzipcode, and that
    abbreviation is resolved to a state (and its `fips` attribute) with the
    `us` package.

    if it can't find the zip code (or a state for it), return np.nan if err=False.

    parameters
    ----------
    zipcode: the zip code to look up.
    err: boolean. True if function should raise a ValueError if it can't find a zip code. False by default.

    returns
    -------
    The `fips` attribute of the matching state, or np.nan when nothing matches and err=False.

    raises
    ------
    ValueError: if nothing matches and err=True.
    """
    # Imported here (like pyzipcode) so the function does not depend on which
    # cells ran before it.
    import us

    try:
        # Use item access rather than `zipcode in db`: item access is the lookup
        # pyzipcode documents, while membership support is not documented.
        # Unknown zips are expected to raise IndexError; KeyError is accepted too.
        record = _zipcode_db()[zipcode]
    except (IndexError, KeyError):
        record = None

    # us.states.lookup returns None for abbreviations it does not know, so guard
    # before reading `.fips` instead of letting an AttributeError abort an `.apply`.
    state = us.states.lookup(record.state) if record is not None and record.state else None
    if state is None:
        if err:
            raise ValueError("Zip code not found")
        return np.nan
    return state.fips

In [87]:
# Load the ComScore demographics and attach a state FIPS code to every row.
# zip_code is read as a string so leading zeros are not lost to integer parsing.
demo_df = (pd.read_csv("../data/comscore/2017/demographics.csv", dtype={'zip_code': str})
           .dropna(subset=['zip_code'])
           # treat education code 99 as missing
           .replace({'hoh_most_education': {99: np.nan}})
           # make sure zips are zip5: left-pad with as many zeros as needed
           # (prepending a single '0' left zips shorter than 4 characters malformed)
           .assign(zip_code=lambda x: x.zip_code.str.zfill(5))
           # get state fips code for each zip
           .assign(state_fips=lambda x: x.zip_code.apply(get_state_fips))
           .dropna(subset=['state_fips']) # drops ~600 rows
     )

Dropped 0 zip rows


In [98]:
# Preview the first rows of the cleaned demographics frame.
demo_df.head()

Unnamed: 0,machine_id,hoh_most_education,census_region,household_size,hoh_oldest_age,household_income,children,racial_background,connection_speed,country_of_origin,zip_code,state_fips
0,46569906,,4,5,8,16,0,1,1,0,98611,53
1,70298839,3.0,4,3,8,14,0,1,1,0,80910,8
2,76984170,4.0,1,3,11,16,1,1,1,0,7076,34
3,76991725,3.0,2,1,9,16,0,1,1,0,62554,17
5,81191519,1.0,4,3,7,12,0,1,1,1,93436,6


Let's test fetching the relevant census data. I looked up the relevant variables in the [2017 ACS 5-year data profile variable list](https://api.census.gov/data/2017/acs/acs5/profile/variables.html) and hand-coded them below, together with a mapping from each census variable to the corresponding ComScore column and code:

In [122]:
# Hand-coded lookup between census data-profile variables and ComScore codes.
# Entries are grouped by the ComScore column they describe; each entry is
# (census variable, ComScore code, description).
census_mapping = pd.DataFrame(
    [
        [census_code, comscore_code, description, comscore_column]
        for comscore_column, entries in (
            # education level.
            ('hoh_most_education', (
                ('DP02_0064PE', 4, 'bachelors degree'),
                ('DP02_0061PE', 1, 'high school graduate'),
                ('DP02_0065PE', 5, 'graduate degree'),
                ('DP02_0062PE', 2, 'some college, no degree'),
                ('DP02_0063PE', 3, 'associates degree'),
                # two rows for comscore-coded zero. Should sum for comparison.
                ('DP02_0060PE', 0, '9-12 grade no diploma'),
                ('DP02_0059PE', 0, 'less than 9th grade'),
            )),
            ('racial_background', (
                ('DP05_0077PE', 1, 'non-hispanic white'),
                ('DP05_0078PE', 2, 'non-hispanic black'),
                ('DP05_0080PE', 3, 'non-hispanic asian'),
            )),
        )
        for census_code, comscore_code, description in entries
    ],
    columns=['census_code', 'comscore_code', 'description', 'comscore_column'],
)

def get_census_data_for_zip(zipcode, year=2017):
    """Fetch the census variables listed in `census_mapping` for one zip code.

    parameters
    ----------
    zipcode: the zip code to query.
    year: year passed through to the census client. 2017 by default.

    returns
    -------
    The first record returned by `c.acs5dp.state_zipcode` for the zip, or None
    when the zip's state can't be determined or the request fails.
    """
    state_fips = get_state_fips(zipcode)
    # get_state_fips reports an unknown zip as np.nan, and NaN is truthy, so a
    # plain `not state_fips` check never fired for it; test for NaN explicitly
    # so unknown zips are skipped without issuing a request.
    if pd.isna(state_fips) or not state_fips:
        return None
    try:
        rows = c.acs5dp.state_zipcode(list(census_mapping.census_code.values),
                                      state_fips,
                                      zipcode,
                                      year=year)
        # An empty response raises IndexError here and is reported like any
        # other failed lookup.
        return rows[0]
    except Exception:
        # Best-effort on purpose: one failing zip should not abort a bulk fetch.
        print("Couldn't retrieve census data for zip {}".format(zipcode))
        return None

# First demographics row, used for a single smoke-test request below.
r = demo_df.iloc[0]
# Unique zip codes to query. Sorted so the order (and therefore the row order of
# anything built from it) is the same on every run; a bare set has no stable order.
zips = sorted(set(demo_df.zip_code))
# Smoke test: raw client response for the first row's state and zip.
zip_demos = c.acs5dp.state_zipcode(list(census_mapping.census_code.values), r.state_fips, r.zip_code, year=2017)

In [None]:
# One request per unique zip. get_census_data_for_zip returns None for zips it
# could not retrieve; drop those before building the frame, since a None entry
# is not a record that can become a row.
census_records = [get_census_data_for_zip(z) for z in zips]
census_df = pd.DataFrame([record for record in census_records if record is not None])

Couldn't retrieve census data for zip 18946
Couldn't retrieve census data for zip 76121
Couldn't retrieve census data for zip 92112
Couldn't retrieve census data for zip 29290
Couldn't retrieve census data for zip 23090
Couldn't retrieve census data for zip 53940
Couldn't retrieve census data for zip 31209
Couldn't retrieve census data for zip 95611
Couldn't retrieve census data for zip 61656
Couldn't retrieve census data for zip 28302
Couldn't retrieve census data for zip 55555
Couldn't retrieve census data for zip 08888
Couldn't retrieve census data for zip 11969
Couldn't retrieve census data for zip 94712
Couldn't retrieve census data for zip 85001
Couldn't retrieve census data for zip 93227
Couldn't retrieve census data for zip 54903
Couldn't retrieve census data for zip 33840
Couldn't retrieve census data for zip 46778
Couldn't retrieve census data for zip 12544
Couldn't retrieve census data for zip 30320
Couldn't retrieve census data for zip 56563
Couldn't retrieve census data fo