In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

read data

In [20]:
YEAR = 2017
demographics_fpath = '../data/comscore/{year}/demographics.csv'.format(year=YEAR)

# Load only the three panel columns used in this notebook.
demo_df = pd.read_csv(
    demographics_fpath,
    usecols=['household_income', 'racial_background', 'machine_id'],
)
# Keep only the last digit of the household income code.
demo_df['household_income'] = demo_df['household_income'] % 10
# 99, -88 and 8 are turned into NaN in *every* column, then incomplete rows are dropped.
# NOTE(review): the modulo above runs before this replacement, so a raw income of
# 99 or -88 would already have become 9 or 2 by now -- confirm that is intended.
demo_df = demo_df.replace({99: np.nan, -88: np.nan, 8: np.nan}).dropna()

# Persist the cleaned panel so later cells can reload it from disk.
demo_df.to_csv("../output/comscore_panel_demo.csv")

In [21]:
# Display the cleaned panel (machine_id, household_income, racial_background).
demo_df

Unnamed: 0,machine_id,household_income,racial_background
0,46569906.0,6.0,1.0
1,70298839.0,4.0,1.0
2,76984170.0,6.0,1.0
3,76991725.0,6.0,1.0
5,81191519.0,2.0,1.0
...,...,...,...
93803,231325483.0,3.0,3.0
93804,231325684.0,4.0,1.0
93805,231325696.0,2.0,1.0
93806,231325697.0,6.0,5.0


In [22]:
# State-level census table: state_fips is read as a string and used as the index.
state_census_df = pd.read_csv(
    "../output/state_census.csv",
    dtype={'state_fips': str},
)
state_census_df = state_census_df.set_index('state_fips')

# Cleaned ComScore panel written earlier in this notebook; first CSV column is the index.
demo_df = pd.read_csv(
    "../output/comscore_panel_demo.csv",
    index_col=0,
    dtype={'state_fips': str},
)

# National census table; first CSV column is the index.
us_census_df = pd.read_csv("../output/us_census.csv", index_col=0)

state_census_df.head()


Unnamed: 0_level_0,DP02_0064E,DP02_0061E,DP02_0065E,DP02_0062E,DP02_0063E,DP02_0060E,DP02_0059E,DP05_0077E,DP05_0078E,DP05_0080E,...,DP03_0054E,DP03_0055E,DP03_0056E,DP03_0057E,DP03_0058E,DP03_0059E,DP03_0060E,DP03_0061E,population estimate,state_abbr
state_fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
53,1064440.0,1101904.0,622908.0,1174696.0,483549.0,260358.0,188200.0,5001482.0,251603.0,573457.0,...,217004.0,228787.0,335962.0,497935.0,372647.0,450796.0,194700.0,199867.0,7169967.0,WA
8,909794.0,791683.0,536679.0,798224.0,307078.0,191957.0,133062.0,3731976.0,209885.0,161733.0,...,165617.0,178164.0,261592.0,377542.0,281673.0,337068.0,146006.0,146896.0,5436519.0,CO
34,1439971.0,1722597.0,907865.0,1026060.0,400424.0,351184.0,315598.0,5023606.0,1136347.0,838872.0,...,247138.0,232026.0,322263.0,490325.0,390092.0,556938.0,296256.0,380045.0,8960161.0,NJ
17,1776326.0,2281524.0,1122258.0,1812247.0,682300.0,544799.0,446625.0,7955484.0,1806398.0,666890.0,...,446453.0,425803.0,593198.0,836760.0,613614.0,724960.0,311141.0,330930.0,12854526.0,IL
6,5291984.0,5345542.0,3161747.0,5586071.0,2021944.0,2033160.0,2510370.0,14777594.0,2161459.0,5427928.0,...,1105197.0,1063551.0,1465836.0,2095531.0,1568843.0,2025327.0,1008388.0,1255844.0,38982847.0,CA


In [23]:
# Lookup table linking each census variable code to the ComScore code and
# ComScore column it is compared against. One row per census variable; several
# census variables may share one ComScore code and should then be summed.
# Descriptions that begin with a backslash are raw strings: '\$' is not a valid
# Python escape sequence, and the raw form keeps the exact same runtime value
# (backslash included) without the invalid-escape warning.
census_mapping = pd.DataFrame([
    # Education level.
    # Note discrepancy: ACS is for individuals > 25 yrs
    # vs ComScore data for household
    ['DP02_0064E', 4, 'bachelors degree', 'hoh_most_education'],
    ['DP02_0061E', 1, 'high school graduate', 'hoh_most_education'],
    ['DP02_0065E', 5, 'graduate degree', 'hoh_most_education'],
    ['DP02_0062E', 2, 'some college, no degree', 'hoh_most_education'],
    ['DP02_0063E', 3, 'associates degree', 'hoh_most_education'],
    # two rows for comscore-coded zero. Should sum for comparison.
    ['DP02_0060E', 0, '9-12 grade no diploma', 'hoh_most_education'],
    ['DP02_0059E', 0, 'less than 9th grade', 'hoh_most_education'],
    # Racial Background
    # ComScore uses 1,2,3,5
    # TODO / Question: is non-hispanic encoding right one to use?
    # e.g. instead use DP05_0037PE
    ['DP05_0077E', 1, 'non-hispanic white', 'racial_background'],
    ['DP05_0078E', 2, 'non-hispanic black', 'racial_background'],
    ['DP05_0080E', 3, 'non-hispanic asian', 'racial_background'],
    # ComScore 5: other

    # Country of origin: hispanic / non-hispanic
    ['DP05_0071E', 1, 'hispanic', 'country_of_origin'],

    # Household income
    # ComScore: 1 Less than 15k 2 15k-24.999k 3 25k-34.999k
    # 4 35k-49.999k 5 50k-74.999k 6 75k-99.999k 7 100k+
    # two rows for ComScore-coded 1
    ['DP03_0052E', 1, 'Less than $10,000', 'household_income'],
    ['DP03_0053E', 1, r'\$10,000 to $14,999', 'household_income'],
    ['DP03_0054E', 2, r'\$15,000 to $24,999', 'household_income'],
    ['DP03_0055E', 3, r'\$25,000 to $34,999', 'household_income'],
    ['DP03_0056E', 4, r'\$35,000 to $49,999', 'household_income'],
    ['DP03_0057E', 5, r'\$50,000 to $74,999', 'household_income'],
    ['DP03_0058E', 6, r'\$75,000 to $99,999', 'household_income'],
    # three rows for ComScore-coded 7 (100k+)
    ['DP03_0059E', 7, r'\$100,000 to $149,999', 'household_income'],
    ['DP03_0060E', 7, r'\$150,000 to $199,999', 'household_income'],
    ['DP03_0061E', 7, r'\$200,000 or more', 'household_income'],

], columns=['census_code', 'comscore_code', 'description', 'comscore_column'])

Income

In [24]:
# Census income variables and their ComScore codes, indexed by census code.
census_incomes = census_mapping.loc[
    census_mapping.comscore_column == 'household_income',
    ['census_code', 'comscore_code'],
].set_index('census_code')
# Attach the national counts; assignment aligns on the census-code index.
census_incomes['count'] = us_census_df.T
census_incomes

Unnamed: 0_level_0,comscore_code,count
census_code,Unnamed: 1_level_1,Unnamed: 2_level_1
DP03_0052E,1,7942251.0
DP03_0053E,1,5768114.0
DP03_0054E,2,11637905.0
DP03_0055E,3,11330288.0
DP03_0056E,4,15412493.0
DP03_0057E,5,21000314.0
DP03_0058E,6,14636046.0
DP03_0059E,7,16701857.0
DP03_0060E,7,6931136.0
DP03_0061E,7,7465517.0


In [25]:
# Census race variables and their ComScore codes, indexed by census code.
census_race = census_mapping.loc[
    census_mapping.comscore_column == 'racial_background',
    ['census_code', 'comscore_code'],
].set_index('census_code')
# Attach the national counts; assignment aligns on the census-code index.
census_race['count'] = us_census_df.T
census_race

Unnamed: 0_level_0,comscore_code,count
census_code,Unnamed: 1_level_1,Unnamed: 2_level_1
DP05_0077E,1,197277789.0
DP05_0078E,2,39445495.0
DP05_0080E,3,16989540.0


## Stratification
First, create a new column representing combination of values we'd like to stratify on. 

Here, we want to stratify on income and race, so we combine the two codes into a single string value. The cell below prints the number of rows for each combination that appears in the ComScore dataset:

In [26]:
# Drop rows whose race code is the -88 sentinel.
demo_df = demo_df.loc[demo_df.racial_background != -88]
# Build the "income, race" key used for stratification, e.g. "1.0, 1.0".
demo_df = demo_df.assign(
    stratify=demo_df.household_income.astype(str) + ", " + demo_df.racial_background.astype(str)
)
demo_df.stratify.value_counts()

1.0, 1.0    11021
3.0, 1.0    10206
2.0, 1.0     8907
6.0, 1.0     7450
5.0, 1.0     7278
4.0, 1.0     5753
1.0, 2.0     5458
1.0, 5.0     3908
2.0, 2.0     3730
7.0, 1.0     2953
2.0, 5.0     2765
3.0, 2.0     2762
3.0, 5.0     2330
6.0, 5.0     1598
5.0, 2.0     1553
5.0, 5.0     1511
6.0, 2.0     1396
4.0, 2.0     1217
4.0, 5.0     1214
1.0, 3.0      969
6.0, 3.0      903
3.0, 3.0      835
5.0, 3.0      755
2.0, 3.0      660
4.0, 3.0      548
7.0, 5.0      507
7.0, 2.0      411
7.0, 3.0      408
Name: stratify, dtype: int64

We can create the same dataset from the census data, but we need to use new variables that represent these combinations.

It turns out, we need to pull data from the CPS: https://www.census.gov/data/tables/time-series/demo/income-poverty/cps-hinc/hinc-02.2017.html

These data are difficult to parse, so I simply downloaded the Excel files for each "race alone" category and combined them into one simple table:

To Do:
- [ ] For now, I am coding Hispanic as 'other' (5) as a shortcut. We should think more critically about how to match census data with the data from ComScore.

In [27]:
# Load the first five columns of the combined CPS table and skip its first data row.
# NOTE(review): the next cell starts with this exact same read, so this cell is redundant.
cps_df = pd.read_csv("../data/CPS-race.csv", usecols=[0,1,2,3,4])[1:]

In [28]:
# Load the first five columns of the combined CPS table and skip its first data row.
cps_df = pd.read_csv("../data/CPS-race.csv", usecols=[0,1,2,3,4])[1:]
# manually created mapping: one ComScore income code (1-7) per remaining CPS row
cps_df['comscore_mapping'] = [1,1,1,2,2,3,3,4,4,4,5,5,5,5,5,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]

# The first column is not used from here on.
cps_df = cps_df.drop(columns='Unnamed: 0')
# convert formatted numbers (thousands separators) to numbers
cps_df = cps_df.apply(lambda col: pd.to_numeric(col.astype(str).str.replace(",", "")))
# Use the ComScore race codes as column labels.
cps_df = cps_df.rename(columns={'white alone': 1, 'black alone': 2, 'asian alone': 3})

# Whatever is not white/black/asian alone becomes ComScore "other".
# Note the label is the string '5', unlike the integer labels 1, 2, 3.
cps_df['5'] = cps_df['total'] - cps_df[[1, 2, 3]].sum(axis=1)
cps_df = cps_df.drop(columns='total')



In [29]:
# Share of CPS households in each (income code, race code) cell, as a Series
# indexed by the same "income, race" key format used for the ComScore panel.
cps_stratify = (
    cps_df
    # Total households per ComScore income code, one column per race code.
    # .sum() is used instead of .agg(sum): passing the builtin is deprecated in recent pandas.
    .groupby('comscore_mapping')
    .sum()
    # Long format: one row per (income code, race code) pair.
    .melt(ignore_index=False)
    .reset_index()
    # Cast everything to float so the key renders as e.g. "1.0, 1.0"
    # (this also turns the string column label '5' into 5.0).
    .astype(float)
    .assign(stratify=lambda x: x.comscore_mapping.astype(str) + ", " + x.variable.astype(str))
    [['stratify', 'value']]
    .set_index('stratify')
    # Normalise counts so the proportions sum to 1.
    .assign(value=lambda x: x.value / x.value.sum())
    .value
)

In [30]:
# Display the CPS proportion for each "income, race" stratum.
cps_stratify

stratify
1.0, 1.0    0.070768
2.0, 1.0    0.071066
3.0, 1.0    0.069774
4.0, 1.0    0.096154
5.0, 1.0    0.132067
6.0, 1.0    0.101316
7.0, 1.0    0.242994
1.0, 2.0    0.026185
2.0, 2.0    0.017490
3.0, 2.0    0.016182
4.0, 2.0    0.018548
5.0, 2.0    0.021234
6.0, 2.0    0.012415
7.0, 2.0    0.021258
1.0, 3.0    0.004378
2.0, 3.0    0.003376
3.0, 3.0    0.003298
4.0, 3.0    0.005162
5.0, 3.0    0.007927
6.0, 3.0    0.006579
7.0, 3.0    0.022151
1.0, 5.0    0.004339
2.0, 5.0    0.003188
3.0, 5.0    0.003470
4.0, 5.0    0.003963
5.0, 5.0    0.005428
6.0, 5.0    0.003157
7.0, 5.0    0.006133
Name: value, dtype: float64

And, let's stratify our data from comscore:

In [31]:

def stratify_data_with_replacement(df_data, stratify_column_name, stratify_values, stratify_proportions, random_state=None):
    """Resample df_data with replacement so its strata follow the requested proportions.

    Adapted from: https://gist.github.com/grahamharrison68/9223f4e80bb2c553b3a31be3b7c854b5#file-stratify_4-py

    The result has the same number of rows as df_data. Every stratum except the
    last receives int(len(df_data) * proportion) rows; the last stratum receives
    whatever is left so that the total matches exactly.

    Args:
        df_data (DataFrame): source data
        stratify_column_name (str): name of the single column in df_data that holds the values used to stratify
        stratify_values (sequence of str): all values to stratify on, e.g. "Male, Graduate", "Male, Undergraduate", "Female, Graduate", "Female, Undergraduate"
        stratify_proportions (sequence of float): desired proportion for each entry of stratify_values, e.g. 0.3, 0.3, 0.2, 0.2. The values must add up to 1 and there must be one per stratify value
        random_state (int, optional): seed passed to every DataFrame.sample call. Defaults to None.

    Returns:
        DataFrame: rows drawn from df_data (original index labels and dtypes kept) in the desired proportions

    Raises:
        ValueError: raised by DataFrame.sample when a stratum that needs rows has none in df_data,
            or when the requested number of rows is negative.
    """
    n_total = len(df_data)
    n_strata = len(stratify_values)
    sampled_parts = []  # collected and concatenated once, which keeps the source dtypes
    n_drawn = 0

    for i in range(n_strata):
        if i == n_strata - 1:
            # Final stratum: take the remainder so the output has exactly n_total rows.
            ratio_len = n_total - n_drawn
        else:
            ratio_len = int(n_total * stratify_proportions[i])

        df_filtered = df_data[df_data[stratify_column_name] == stratify_values[i]]
        df_temp = df_filtered.sample(replace=True, n=ratio_len, random_state=random_state)

        sampled_parts.append(df_temp)
        n_drawn += len(df_temp)

    if not sampled_parts:
        # No strata requested: return an empty frame with the source columns.
        return pd.DataFrame(columns=df_data.columns)
    return pd.concat(sampled_parts)



def stratify_data_without_replacement(df_data, stratify_column_name, stratify_values, stratify_proportions, random_state=None):
    """Down-sample df_data without replacement so its strata follow the requested proportions.

    The stratum with the largest target proportion is kept in full and every
    other stratum is sized relative to it. If that would ask some stratum for
    more rows than it has, the sizes are instead scaled to the most constrained
    stratum (the one with the fewest rows relative to its proportion), so the
    proportions are still met with the largest sample the data can support.

    Args:
        df_data (DataFrame): source data
        stratify_column_name (str): name of the column in df_data that holds the stratum label
        stratify_values (sequence): stratum labels to keep
        stratify_proportions (sequence of float): desired proportion for each entry of stratify_values; a list or a numpy array
        random_state (int, optional): seed passed to every DataFrame.sample call. Defaults to None.

    Returns:
        DataFrame: a subset of df_data rows (each row at most once, original index labels and dtypes kept)

    Raises:
        ValueError: if stratify_values is empty or its length differs from stratify_proportions.
    """
    proportions = np.asarray(stratify_proportions, dtype=float)
    if len(stratify_values) == 0:
        raise ValueError("stratify_values must not be empty")
    if len(proportions) != len(stratify_values):
        raise ValueError("stratify_values and stratify_proportions must have the same length")

    strata = [df_data[df_data[stratify_column_name] == value] for value in stratify_values]
    available = np.array([len(stratum) for stratum in strata])

    def target_counts(anchor):
        # Size every stratum relative to the anchor stratum, which is kept in full.
        scaled = (proportions / proportions[anchor]) * available[anchor]
        return [int(n) for n in scaled]

    counts = target_counts(int(np.argmax(proportions)))
    if any(n > have for n, have in zip(counts, available)):
        # Some stratum is too small for that scale: anchor on the most constrained one.
        capacity = np.full(len(proportions), np.inf)
        positive = proportions > 0
        capacity[positive] = available[positive] / proportions[positive]
        counts = target_counts(int(np.argmin(capacity)))
        # Guard against floating-point overshoot.
        counts = [min(n, int(have)) for n, have in zip(counts, available)]

    sampled_parts = [
        stratum.sample(replace=False, n=n, random_state=random_state)
        for stratum, n in zip(strata, counts)
    ]
    return pd.concat(sampled_parts)

In [32]:
# Down-sample the panel to the CPS income x race proportions.
# A fixed seed makes the selected rows identical on every run.
demo_df_s = stratify_data_without_replacement(
    demo_df, 'stratify', cps_stratify.index, cps_stratify.values, random_state=42
)

In [33]:
# Number of sampled rows in each "income, race" stratum.
demo_df_s.stratify.value_counts()

7.0, 1.0    2953
5.0, 1.0    1604
6.0, 1.0    1231
4.0, 1.0    1168
2.0, 1.0     863
1.0, 1.0     860
3.0, 1.0     847
1.0, 2.0     318
7.0, 3.0     269
7.0, 2.0     258
5.0, 2.0     258
4.0, 2.0     225
2.0, 2.0     212
3.0, 2.0     196
6.0, 2.0     150
5.0, 3.0      96
6.0, 3.0      79
7.0, 5.0      74
5.0, 5.0      65
4.0, 3.0      62
1.0, 3.0      53
1.0, 5.0      52
4.0, 5.0      48
3.0, 5.0      42
2.0, 3.0      41
3.0, 3.0      40
2.0, 5.0      38
6.0, 5.0      38
Name: stratify, dtype: int64

In [34]:
# Compare the stratum shares in the down-sampled panel with the CPS targets.
demo_stratified = demo_df_s.stratify.value_counts()
(
    (demo_stratified / demo_stratified.sum())
    # Name both series explicitly: the name value_counts() gives its result
    # differs between pandas versions, so renaming by the old name is fragile.
    .rename("comscore_proportion")
    .to_frame()
    .merge(cps_stratify.rename("cps_proportion"), left_index=True, right_index=True)
)


Unnamed: 0,comscore_proportion,cps_proportion
"7.0, 1.0",0.243245,0.242994
"5.0, 1.0",0.132125,0.132067
"6.0, 1.0",0.1014,0.101316
"4.0, 1.0",0.096211,0.096154
"2.0, 1.0",0.071087,0.071066
"1.0, 1.0",0.07084,0.070768
"3.0, 1.0",0.069769,0.069774
"1.0, 2.0",0.026194,0.026185
"7.0, 3.0",0.022158,0.022151
"7.0, 2.0",0.021252,0.021258


In [35]:
# Display the stratified sample; rows keep their original index labels from demo_df.
demo_df_s

Unnamed: 0,machine_id,household_income,racial_background,stratify
2288,169951672.0,1.0,1.0,"1.0, 1.0"
51901,213559693.0,1.0,1.0,"1.0, 1.0"
84742,227626218.0,1.0,1.0,"1.0, 1.0"
34352,201767313.0,1.0,1.0,"1.0, 1.0"
71723,222257533.0,1.0,1.0,"1.0, 1.0"
...,...,...,...,...
83255,226959018.0,7.0,5.0,"7.0, 5.0"
82422,226609365.0,7.0,5.0,"7.0, 5.0"
24419,193350368.0,7.0,5.0,"7.0, 5.0"
54511,214986027.0,7.0,5.0,"7.0, 5.0"
