In [5]:
from datetime import datetime
import sys
sys.path.append('..')
%autoreload 2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import floc

from comscore.data import read_weeks_machines_domains

sessions_fpath = '../data/comscore/2017/sessions.csv'

In [3]:
weeks_machines_domains_fpath = '../output/weeks_machines_domains.csv'

wmd = read_weeks_machines_domains(weeks_machines_domains_fpath)

reading from ../output/weeks_machines_domains.csv...
... read 4877236 rows


In [23]:
machine_weeks_df = (wmd
                    .assign(domains = lambda x: x.domains.apply(lambda y: [d for d in y if d != '']))
                    .assign(n_domains=lambda x: x.domains.apply(len)))

In [24]:
machine_weeks_df.head()

Unnamed: 0,machine_id,week,n_domains,domains
0,46569906,1,119,"[tricoathletics.com, webassign.net, opinionshe..."
1,70298839,1,0,[]
2,76984170,1,2,"[google.com, nbcsports.com]"
3,76991725,1,5,"[signaturesalon.us, facebook.com, salonrunner...."
4,81191519,1,8,"[adobeconnect.com, myway.com, docusign.net, ms..."


---

In [52]:
import pandas as pd
from config import N_PANELS, COMSCORE_YEAR, N_CORES, INCOME_MAPPING
from comscore.data import read_cps_df, read_comscore_demo_df
from comscore.panel import generate_stratified_sample, stratify_cps
from comscore.utils.stratify import stratify_data_without_replacement
from joblib import Parallel, delayed

print("Reading CPS and comscore data...")
cps_df = read_cps_df(fpath="../data/CPS-race.csv")
comscore_demo_df = read_comscore_demo_df(fpath='../data/comscore/{year}/demographics.csv', year=COMSCORE_YEAR)

Reading CPS and comscore data...


In [16]:
# collapse income categories to 4 categories
cps_df['comscore_mapping'] = cps_df.comscore_mapping.apply(lambda x: INCOME_MAPPING[x])
comscore_demo_df['household_income'] = comscore_demo_df.household_income.apply(lambda x: INCOME_MAPPING[x])


Done. Generating panels...


In [38]:
%%time
N_DOMAINS_THRESHOLD = 7
df = (machine_weeks_df
      .query('n_domains >= 7')
      .merge(comscore_demo_df, 
                       how='left', 
                       left_on='machine_id', 
                       right_on='machine_id')
      
     )

CPU times: user 5.35 s, sys: 521 ms, total: 5.87 s
Wall time: 5.86 s


In [42]:
df.head()

Unnamed: 0,machine_id,week,n_domains,domains,household_income,racial_background
0,46569906,1,119,"[tricoathletics.com, webassign.net, opinionshe...",3.0,1.0
1,81191519,1,8,"[adobeconnect.com, myway.com, docusign.net, ms...",1.0,1.0
2,92330491,1,45,"[freeldssheetmusic.org, ashleyhallmusic.com, m...",2.0,1.0
3,93557605,1,9,"[obsev.com, yahoo.com, msn.com, rosewe.com, em...",1.0,1.0
4,99534294,1,35,"[gearbest.com, glavpost.com, yahoo.com, contex...",3.0,1.0


In [53]:
def generate_weekly_stratified_samples(cps_df, 
                                       comscore_demo_df, 
                                       machine_weeks_df, 
                                       seed_value=datetime.now()):
    import random
    random.seed(seed_value)
    cps_stratify = stratify_cps(cps_df)
    comscore_demo_df['stratify'] = (
        comscore_demo_df.household_income.astype(float).astype(str) \
        + ", " \
        + comscore_demo_df.racial_background.astype(float).astype(str)
    )
    df = (machine_weeks_df
      .query('n_domains >= 7')
      .merge(comscore_demo_df, 
                       how='left', 
                       left_on='machine_id', 
                       right_on='machine_id')
      
     )
    # psuedocode - adjust accordingly after using saved machine,week data
    ## read_weeks_machines_domains()
    weekly_panels = []
    for week in df.week.unique():
        week_df = df[df.week == week]
        demo_df_s = stratify_data_without_replacement(
            week_df, "stratify", cps_stratify.index, cps_stratify.values
        )
        weekly_panels.append(demo_df_s)
    return pd.concat(weekly_panels)

In [None]:
%%time
n_panels = 10
all_panels = []
for p in range(1, n_panels + 1):
    # 52 panels
    panel_weeks = generate_weekly_stratified_samples(cps_df, comscore_demo_df, machine_weeks_df)
    panel_weeks.panel_id = panel_weeks.week * p
    all_panels.append(panel_weeks)

since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(seed_value)


extract just machine_id, week, panel_id from this dataframe

In [None]:


print("Generating panels...")

panels = Parallel(n_jobs=N_CORES)(delayed(generate_stratified_sample)(cps_df, comscore_demo_df,
    seed_value=n) for n in range(N_PANELS))

print("Done. Assigning panel IDs and writing to disk...")

panels = [p.assign(panel_id=n) for n, p in enumerate(panels)]
all_panels = pd.concat(panels).reset_index(drop=True)
all_panels.to_csv('output/all_panels.csv')