# Check sessions demographic data

The sessions data has demographic information in it.

The demographic data we use is from a separate file with just demographics data (linked to the machine_ids).

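Do the demographics recorded in the sessions data agree with the demographics file for the same machine_id?
This notebook checks, using household income and racial background.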

In [44]:
import sys
sys.path.append('..')
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from comscore.data import read_cps_df, read_comscore_demo_df

# Data year under analysis. Both input paths are derived from it so that the
# demographics file and the sessions file always refer to the same year.
YEAR = 2017
demographics_fpath = '../data/comscore/{year}/demographics.csv'.format(year=YEAR)
sessions_fpath = '../data/comscore/{year}/sessions.csv'.format(year=YEAR)

In [45]:
# Number of leading rows of the sessions file to read for this check.
CHUNKSIZE = 100 * 1000

In [46]:
# Peek at the first CHUNKSIZE rows of the raw sessions file, with every column,
# to see which demographic fields it carries.
sessions_chunk = pd.read_csv(
    sessions_fpath,
    encoding="ISO-8859-1",
    nrows=CHUNKSIZE,
)
sessions_chunk.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,machine_id,site_session_id,domain_id,ref_domain__name,pages_viewed,duration,event_date,event_time,tran_flg,prod_category_id,...,census_region,household_size,hoh_oldest_age,household_income,children,racial_background,connection_speed,country_of_origin,zip_code,domain_name
0,169007206,2296065101119308896,,,13.0,33,20171007,22:45:57,1.0,7009031000.0,...,3,5,3,12,1,2,1,1,36832.0,
1,169007206,2296065101119308896,,,13.0,33,20171007,22:45:57,1.0,999.0,...,3,5,3,12,1,2,1,1,36832.0,
2,170331223,4736010797527157206,,,4.0,5,20170808,0:11:58,1.0,1002003000.0,...,1,5,5,12,1,5,1,0,15120.0,
3,170422065,7818055568167238569,,,31.0,46,20170818,18:31:17,1.0,999.0,...,4,2,9,13,1,1,1,0,80233.0,
4,170422065,7818055568167238569,,,31.0,46,20170818,18:31:17,1.0,1002004000.0,...,4,2,9,13,1,1,1,0,80233.0,


In [49]:
# Ten most frequent domain names in the chunk (missing names are not counted).
sessions_chunk['domain_name'].value_counts().head(10)

adxyield.com                 19310
silhouettedesignstore.com     9082
sixflags.com                  3159
schoolwires.net               2353
salary.com                    2133
gta5-mods.com                 1770
helios12.com                  1701
putload.tv                    1663
ipayment.com                  1473
cps.edu                       1325
Name: domain_name, dtype: int64

In [50]:
def read_comscore_df(fpath, nrows=None):
    """Read machine_id, household_income and racial_background from a CSV and clean them.

    Processing steps, in order:
      1. Read only the three columns above (ISO-8859-1 encoded file).
      2. Replace household_income by its remainder modulo 10.
      3. Turn every cell equal to 99, -88 or 8 (in any column) into NaN.
      4. Drop every row that contains at least one NaN.

    Parameters
    ----------
    fpath : str
        Path of the CSV file to read.
    nrows : int, optional
        Read at most this many rows; ``None`` (default) reads the whole file.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows, restricted to the three columns.
    """
    columns_needed = ['household_income', 'racial_background', 'machine_id']
    # Values that are mapped to NaN so that dropna() below discards their rows.
    to_missing = {99: np.nan, -88: np.nan, 8: np.nan}

    frame = pd.read_csv(
        fpath,
        usecols=columns_needed,
        nrows=nrows,
        encoding="ISO-8859-1",
    )
    # NOTE(review): the modulo runs before the replacement, so a raw
    # household_income of 99 becomes 9 and is not caught by the 99 entry,
    # while a raw 18 becomes 8 and is dropped - confirm this is intended.
    frame = frame.assign(household_income=frame['household_income'] % 10)
    # NOTE(review): the replacement is applied to the whole frame, machine_id
    # included - presumably only the demographic columns are meant; verify.
    frame = frame.replace(to_missing)
    return frame.dropna()

In [51]:
# Re-read the same leading rows of the sessions file, this time keeping only the
# cleaned machine_id / household_income / racial_background columns.
# Note: this rebinds `sessions_chunk`, replacing the raw all-columns frame above.
sessions_chunk = read_comscore_df(fpath=sessions_fpath, nrows=CHUNKSIZE)
print('rows read in fpr sessions data: %s' % CHUNKSIZE)
print('rows returned after dropping invalid data: %s ' % sessions_chunk.shape[0])
print('%s unique machines ids' % sessions_chunk['machine_id'].nunique())
sessions_chunk.head()

rows read in fpr sessions data: 100000
rows returned after dropping invalid data: 94713 
23089 unique machines ids


Unnamed: 0,machine_id,household_income,racial_background
0,169007206.0,2.0,2.0
1,169007206.0,2.0,2.0
2,170331223.0,2.0,5.0
3,170422065.0,3.0,1.0
4,170422065.0,3.0,1.0


In [52]:
# Load the standalone demographics file in full (no row limit), cleaned the same way.
comscore_demo_df = read_comscore_df(demographics_fpath, nrows=None)
print('comscore demographics df : %s rows' % comscore_demo_df.shape[0])

comscore demographics df : 89006 rows


In [54]:
# Left-join on machine_id: keep every cleaned session row and attach the values
# from the demographics file. Columns present in both frames receive the
# suffixes _x (sessions data) and _y (demographics file).
merged_df = pd.merge(
    sessions_chunk,
    comscore_demo_df,
    how='left',
    on='machine_id',
)
merged_df.head()

Unnamed: 0,machine_id,household_income_x,racial_background_x,household_income_y,racial_background_y
0,169007206.0,2.0,2.0,2.0,2.0
1,169007206.0,2.0,2.0,2.0,2.0
2,170331223.0,2.0,5.0,2.0,5.0
3,170422065.0,3.0,1.0,3.0,1.0
4,170422065.0,3.0,1.0,3.0,1.0


In [55]:
# A row matches only when BOTH demographic fields agree between the sessions
# data (_x columns) and the demographics file (_y columns).
# The income check must compare _x against _y; comparing _x with itself is
# always true and would silently skip the income comparison.
# A session row with no counterpart in the demographics file has NaN in the _y
# columns, and NaN == value is False, so such rows count as mismatches.
merged_df['match'] = (
    (merged_df['household_income_x'] == merged_df['household_income_y'])
    & (merged_df['racial_background_x'] == merged_df['racial_background_y'])
)

In [56]:
# Show the first rows again, now including the boolean `match` column.
merged_df.head(n=5)

Unnamed: 0,machine_id,household_income_x,racial_background_x,household_income_y,racial_background_y,match
0,169007206.0,2.0,2.0,2.0,2.0,True
1,169007206.0,2.0,2.0,2.0,2.0,True
2,170331223.0,2.0,5.0,2.0,5.0,True
3,170422065.0,3.0,1.0,3.0,1.0,True
4,170422065.0,3.0,1.0,3.0,1.0,True


In [57]:
# True only if every merged row is flagged as a match
# (the number of True flags equals the number of rows).
merged_df['match'].sum() == merged_df.shape[0]

True

Outcome: At least in the large chunk of data checked (the first `CHUNKSIZE` rows of the sessions file), the demographic information (race & income) recorded for each machine_id in the sessions data matches the demographics data.