# FLoC Cohorts from sessions panel data



In [1]:
from datetime import datetime
import sys
sys.path.append('..')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random

import floc

from comscore.data import read_weeks_machines_domains
import prefixLSH


read in pre-processed sessions data

In [4]:
# read in the pre-processed sessions data 
# this maps week,machine_id -> domains set
weeks_machines_domains_fpath = '../output/weeks_machines_domains.csv'
weeks_machines_domains_df = read_weeks_machines_domains(weeks_machines_domains_fpath)
weeks_machines_domains_df.drop(['machine_id', 'domains'], axis=1).head()

reading from ../output/weeks_machines_domains.csv...
... read 4877236 rows


Unnamed: 0,week,n_domains
0,1,104
1,1,0
2,1,20
3,1,49
4,1,64


___

read in panels that were created based on the sessions data and demographic data

attach domains data

attach the simhash computed from the domains

precompute cohorts for each panel

In [5]:
all_panels_fpath = '../output/all_panels.csv'
all_panels_df = pd.read_csv(all_panels_fpath)
print('read in all panels: %s total rows' % len(all_panels_df))
print('%s panels' % all_panels_df.panel_id.nunique())
all_panels_df.drop(['machine_id'], axis=1).head()

read in all panels: 3302090 total rows
520 panels


Unnamed: 0,week,n_domains,household_income,racial_background,stratify,panel_id
0,1,40,1.0,1.0,"1.0, 1.0",1
1,1,43,1.0,1.0,"1.0, 1.0",1
2,1,147,1.0,1.0,"1.0, 1.0",1
3,1,45,1.0,1.0,"1.0, 1.0",1
4,1,22,1.0,1.0,"1.0, 1.0",1


In [6]:
weeks_machines_domains = weeks_machines_domains_df.set_index(['machine_id','week'])['domains']
all_panels_df['domains'] = all_panels_df.set_index(['machine_id','week']).index.map(weeks_machines_domains)
all_panels_df.drop(['machine_id','domains'], axis=1).head()

Unnamed: 0,week,n_domains,household_income,racial_background,stratify,panel_id
0,1,40,1.0,1.0,"1.0, 1.0",1
1,1,43,1.0,1.0,"1.0, 1.0",1
2,1,147,1.0,1.0,"1.0, 1.0",1
3,1,45,1.0,1.0,"1.0, 1.0",1
4,1,22,1.0,1.0,"1.0, 1.0",1


In [7]:
# apply simhash
all_panels_df['simhash'] = all_panels_df.domains.apply(floc.hashes.sim_hash_string)

Save intermediary output: panels with simhash

In [8]:
all_panels_simhash_fpath = '../output/all_panels_simhash.csv'

In [10]:
print('saving to %s...' % all_panels_simhash_fpath)
all_panels_df.drop('domains', axis=1).to_csv(all_panels_simhash_fpath, index=False)

saving to ../output/all_panels_simhash.csv...


script re-entry point

In [11]:
all_panels_df = pd.read_csv(all_panels_simhash_fpath)
print('read in all panels: %s total rows' % len(all_panels_df))
print('%s panels' % all_panels_df.panel_id.nunique())
all_panels_df.head()

read in all panels: 3302090 total rows
520 panels


Unnamed: 0,machine_id,week,n_domains,household_income,racial_background,stratify,panel_id,simhash
0,225477651,1,40,1.0,1.0,"1.0, 1.0",1,1053512748232354
1,215623532,1,43,1.0,1.0,"1.0, 1.0",1,207270634237270
2,186655210,1,147,1.0,1.0,"1.0, 1.0",1,155255686110552
3,224752776,1,45,1.0,1.0,"1.0, 1.0",1,765518950719653
4,207687017,1,22,1.0,1.0,"1.0, 1.0",1,1087583118580527


##### Pre-compute cohorts for each panel

each sample's cohort is dependent on the rest of the simhashes in the panel

for this reason, cohorts must be computed per panel

In [12]:
all_panels_cohorts_df = all_panels_df.copy()

In [None]:
# TODO: repeat this analysis with another min_k for a robustness test
# e.g. all_panels_cohorts80_df = all_panels_df.copy(); min_k=80
min_k = 40 
# preset all cohorts to None
# cohort_k40 = 'cohort_k40'
all_panels_df['cohort'] = np.nan

for panel_id in all_panels_df.panel_id.unique():
    t_start = datetime.now()
    if panel_id % 1 == 0:
        print('computing cohorts for panel %s/%s' % (panel_id, all_panels_df.panel_id.nunique()))
    panel_df = all_panels_df[all_panels_cohorts_df.panel_id==panel_id]
    cohorts_dict = prefixLSH.get_cohorts_dict(panel_df.simhash.astype(int), min_k=min_k)
    assign_cohort = lambda x: cohorts_dict[x.simhash] if x.panel_id == panel_id else x['cohort']
    all_panels_df['cohort'] = all_panels_df.apply(assign_cohort, axis=1)
    if panel_id % 1 == 0:
        print('took %s' % (datetime.now() - t_start))

computing cohorts for panel 1/520
took 0:00:52.600540
computing cohorts for panel 2/520
took 0:00:51.353196
computing cohorts for panel 3/520
took 0:00:50.400729
computing cohorts for panel 4/520
took 0:00:50.253033
computing cohorts for panel 5/520
took 0:00:48.421979
computing cohorts for panel 6/520
took 0:00:48.979739
computing cohorts for panel 7/520
took 0:00:49.001481
computing cohorts for panel 8/520
took 0:00:48.793433
computing cohorts for panel 9/520
took 0:00:50.414048
computing cohorts for panel 10/520
took 0:00:54.387241
computing cohorts for panel 11/520
took 0:00:46.691973
computing cohorts for panel 12/520
took 0:00:47.792375
computing cohorts for panel 13/520
took 0:00:48.431833
computing cohorts for panel 14/520
took 0:00:48.221698
computing cohorts for panel 15/520
took 0:00:49.071599
computing cohorts for panel 16/520
took 0:00:49.839058
computing cohorts for panel 17/520


In [None]:
all_panels_df.head(3) 

In [None]:
panel_df1 = all_panels_df[all_panels_df.panel_id == 1]
print('cohort sizes:')
cohort_counts = panel_df1['cohort'].value_counts()
print(cohort_counts.describe())

fig,ax = plt.subplots(2,1,figsize=(4,3), sharex=True, dpi=100)
cohort_counts.hist(ax=ax[0])
ax[0].set_ylabel('cohorts')
ax[1].set_ylabel('cohorts')
_ = ax[1].set_xlabel('cohort size')
_ = cohort_counts.hist(ax=ax[1], bins=50)

save intermediary output

In [None]:
all_panels_cohorts_fpath = '../output/all_panels_cohorts.csv'

In [None]:
print('saving to %s...' % all_panels_cohorts_fpath)
all_panels_df.to_csv(all_panels_cohorts_fpath, index=False)

script re-entry point

In [None]:
all_panels_df = pd.read_csv(all_panels_cohorts_fpath)
print('read in all panels: %s total rows' % len(all_panels_df))
print('%s panels' % all_panels_df.panel_id.nunique())
all_panels_df.head()

In [None]:
# get just one panel for now

In [None]:
panel_id = random.choice(all_panels_df.panel_id.unique())
print('using random panel id: %s' % panel_id)

In [29]:
panel_df = all_panels_df[all_panels_df.panel_id == panel_id].copy()

Unnamed: 0,week,n_domains,household_income,racial_background,stratify,panel_id,simhash
707826,8,36,1.0,1.0,"1.0, 1.0",112,598844775602146
707827,8,20,1.0,1.0,"1.0, 1.0",112,52417318348
707828,8,25,1.0,1.0,"1.0, 1.0",112,253662113111417
707829,8,10,1.0,1.0,"1.0, 1.0",112,313649201634270
707830,8,7,1.0,1.0,"1.0, 1.0",112,230404157507221


In [None]:
print('cohort sizes:')

cohorts_counts = panel_df["cohort"].value_counts()
print(cohort_counts.describe())

fig,ax = plt.subplots(2,1,figsize=(4,3), sharex=True, dpi=100)
cohort_counts.hist(ax=ax[0])
ax[0].set_ylabel('cohorts')
ax[1].set_ylabel('cohorts')
_ = ax[1].set_xlabel('cohort size')
_ = cohort_counts.hist(ax=ax[1], bins=50)

In [None]:
def get_cohort_demo_portions_df(panel_df, demo_col):
    df = panel_df.groupby(
        ['cohort', demo_col]
    ).count()['panel_id'].rename('count').to_frame().reset_index(demo_col)
    df['total'] = panel_df.groupby('cohort').count()['panel_id']
    df['portion'] = df['count']/df['total']
    return df

In [None]:
dpi = 100
t = 0.1

def plot_cohort_demo_portions(panel_df, demo_col, demo_value_names_dict):
    cohort_demo_portions_df = get_cohort_demo_portions_df(panel_df, demo_col)
    demo_fractions = panel_df.dropna().groupby(demo_col)['panel_id'].count() / len(panel_df.dropna())
    
    for val in demo_value_names_dict:
        print('%s' % demo_value_names_dict[val])
        cohort_demo_mean = cohort_demo_portions_df[cohort_demo_portions_df[demo_col]==val].portion.mean()
        print('mean fraction %s across cohorts = %0.2f' % (demo_value_names_dict[val], cohort_demo_mean))
        # Google's sensitivity analysis actually looked at avg for sensitive categories
        # across all users rather than across cohorts
        demo_fraction = demo_fractions[val]
        print('fraction %s across all users in cohorts = %0.2f' % (demo_value_names_dict[val], demo_fraction))

        fig,ax = plt.subplots(1,1,figsize=(6,3), dpi=dpi)
        cohort_demo_portions_df[cohort_demo_portions_df[demo_col]==val].portion.sort_values(ascending=False).plot.bar(ax=ax, label="")

        ax.hlines(y=demo_fraction, xmin=0, xmax=len(ax.get_xticks()), 
                  linestyles='-', color='black', label='mean')
        if (demo_fraction - t) > 0:
            ax.hlines(y=demo_fraction-t, xmin=0, xmax=len(ax.get_xticks()), 
                      linestyles='--', color='black', label='mean - t=0.1')
        ax.hlines(y=demo_fraction+t, xmin=0, xmax=len(ax.get_xticks()), 
                  linestyles='--', color='black', label='mean + t=0.1')
        #ax.legend()
        ax.set_xticks([])
        ax.set_xlabel('cohorts')
        ax.set_ylabel('fraction "%s"' % demo_value_names_dict[val])
        plt.show()
    
    return cohort_demo_portions_df

In [None]:
race_values = {1:'white', 2:'black', 3:'asian', 5:'other'}
hi_values = {1:'0 to 25k', 2:'25k to 50k', 3:'50k to 100k', 4:'100k +'}
hi_race_values = {
    "1.0, 1.0": "0 to 25k, white",
    "2.0, 1.0": "25k to 50k, white",
    "3.0, 1.0": "50k to 100k, white",
    "4.0, 1.0": "100k +, white",
    
    "1.0, 2.0": "0 to 25k, black",
    "2.0, 2.0": "25k to 50k, black",
    "3.0, 2.0": "50k to 100k, black",
    "4.0, 2.0": "100k +, black",
    
    "1.0, 3.0": "0 to 25k, asian",
    "2.0, 3.0": "25k to 50k, asian",
    "3.0, 3.0": "50k to 100k, asian",
    "4.0, 3.0": "100k +, asian",
    
    "1.0, 5.0": "0 to 25k, other",
    "2.0, 5.0": "25k to 50k, other",
    "3.0, 5.0": "50k to 100k, other",
    "4.0, 5.0": "100k +, other",
}

Looking at race

In [None]:
race = plot_cohort_demo_portions(panel_df, 'racial_background', race_values)
race.head()

Looking at just household income

In [None]:
hi = plot_cohort_demo_portions(panel_df, 'household_income', hi_values)
hi.head()

In [None]:
hi_race = plot_cohort_demo_portions(panel_df, 'stratify', hi_race_values)
hi_race