In [None]:
import pandas as pd
import numpy as np
import sys
%autoreload 2
%matplotlib inline
sys.path.append('..')

# Transactions -> Domains

Read data and convert to domain sets for each `machine_id, week` pair

In [None]:
# Panel membership table; the first CSV column is used as the index.
panels = pd.read_csv("../output/all_panels.csv", index_col=0)

# Only three transaction columns are used downstream, so restrict parsing to
# them with `usecols` instead of loading every column and discarding the rest.
transactions_fpath = '../data/comscore/2017/transactions.csv'
transaction_columns = ['machine_id', 'event_date', 'domain_name']
transactions_df = pd.read_csv(
    transactions_fpath,
    usecols=transaction_columns,
    parse_dates=['event_date'],
)[transaction_columns]  # re-select: `usecols` keeps file order, not list order

In [None]:
# Collapse transactions into one row per (machine_id, ISO week) holding the
# distinct domains seen by that machine in that week.
# - `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
#   `dt.isocalendar().week` is the supported replacement.
# - Rows with missing values are dropped first so the week can be cast to a
#   plain int64 (isocalendar() yields a nullable UInt32 column). The week is
#   missing exactly when event_date is, so the surviving rows are unchanged.
# - Domains are sorted before joining so the pipe-delimited string does not
#   depend on set iteration order.
tdf = (transactions_df
       .dropna()
       .assign(week=lambda x: x.event_date.dt.isocalendar().week.astype('int64'))
       .groupby(['machine_id', 'week'])
       .agg({'domain_name': set})
       .reset_index()
       .assign(n_domains=lambda x: x.domain_name.map(len))
       .assign(domain_name=lambda x: x.domain_name.map(lambda y: "|".join(sorted(y))))
      )

In [None]:
# Show the ten machine-weeks with the largest number of distinct domains.
tdf.sort_values('n_domains', ascending=False).head(10)

In [None]:
# Persist the per-machine, per-week domain table for later use.
tdf.to_csv('../output/domains_by_week.csv')

## Fake FLoC

1. Assign domain list history to machine IDs.
2. For each panel, calculate the cohort assignment for a given week.
3. Calculate t-closeness across stratified variables and other variables of interest.

In [None]:
from floc import simulate

In [None]:
import itertools

# Sorted list of every distinct domain across all machine-weeks. Sorting gives
# a stable order (set iteration order of strings can differ between
# interpreter runs), which matters because this list is the sampling pool
# used when padding short domain histories.
unique_domains = sorted(set(itertools.chain.from_iterable(
    d.split("|") for d in tdf.domain_name)))

def pad_domains(l, s, min_len=7, rng=None):
    """Pad a domain list to at least ``min_len`` entries with random draws from ``s``.

    Parameters
    ----------
    l : sequence of str
        Domains observed for one machine-week.
    s : sequence of str
        Pool that padding domains are sampled from (with replacement).
    min_len : int, default 7
        Minimum length of the returned list. The default keeps the original
        hard-coded value; presumably the shortest history the FLoC simulator
        accepts -- TODO confirm.
    rng : optional
        Any object exposing ``choice(a, size)``, e.g.
        ``np.random.default_rng(seed)``. Defaults to NumPy's global
        ``np.random`` state, so ``np.random.seed`` still controls the draws.

    Returns
    -------
    list
        A new list: the entries of ``l`` followed by however many sampled
        domains are needed to reach ``min_len``. Inputs already long enough
        are returned as an unmodified copy.

    Raises
    ------
    ValueError
        Propagated from ``choice`` when padding is needed but ``s`` is empty.
    """
    if rng is None:
        rng = np.random
    n = min_len - len(l)
    if n > 0:
        # .tolist() converts NumPy string scalars back to plain Python str.
        return list(l) + rng.choice(s, n).tolist()
    return list(l)

# Pad every machine-week's domain list using random draws from the pool of
# all observed domains.
# - Seed NumPy's global RNG (the generator pad_domains samples from) and sort
#   the pool so the padding is reproducible across kernel restarts.
# - Build the pool array once: np.random.choice converts a list argument to
#   an array on every call, which is wasteful for a large pool.
# - Series.map replaces DataFrame.apply(axis=1) since only one column is read.
np.random.seed(0)
domain_pool = np.array(sorted(unique_domains), dtype=object)
tdf['padded_domain'] = tdf.domain_name.map(
    lambda x: pad_domains(x.split("|"), domain_pool))

In [None]:
# Preview the first rows, now including the padded_domain column.
tdf.head()

In [None]:
# Run the FLoC simulator once per padded domain list and store the result
# for each machine-week in the `cohort` column.
padded_domains = tdf.padded_domain.tolist()
cohorts = [simulate(domains, check_sensiveness=False) for domains in padded_domains]

tdf['cohort'] = cohorts

In [None]:
# Preview the first rows, now including the cohort column.
tdf.head()

In [None]:
# Number of (machine_id, week) rows.
len(tdf)

In [None]:
# Summary statistics of cohort sizes (number of rows per cohort value).
tdf.cohort.value_counts().describe()

This shows us that the existing clusters are far too small for our original t-closeness plan. However, these cohort IDs are generated from the *actual* cohorts used in the FLoC origin trial (OT), which resulted in ~30k cohorts.

We found it surprising that roughly 18k cohorts are represented given so few domains and people.

We decided to move on to calculating our own cohorts from the data itself due to our smaller sample size. 

The argument remains the same: we want to test if FLoC violates reasonable t-closeness restrictions on demography.

## Cohort Numbers

In [None]:
# Count the distinct machines in each panel.
panels.groupby('panel_id')[['machine_id']].nunique()

As a first pass: if cohorts are evenly distributed, how large will our cohorts be (using 1 week of data rather than treating multiple weeks as independent samples)?

(this is the upper limit)

In [None]:
# Back-of-envelope estimate for the cohort-size question above.
# NOTE(review): 23670 and 50 are hard-coded; presumably a machine count taken
# from the panel table and a cohort count or minimum size -- confirm, and
# derive them from the data instead of typing them in.
23670/50

This seems OK! For now!

## Calculating SimHash & PrefixLSH

from https://github.com/hybridtheory/floc-simhash

We first need to compute the SimHash, and then run the 'prefixLSH' routine, which successively splits each cluster based on the next 0/1 bit.

In [None]:
%pip install sklearn floc_simhash

In [None]:
# Serialise each padded domain list as a single pipe-delimited string.
tdf['padded_domain_string'] = tdf.padded_domain.map("|".join)

In [None]:
from floc_simhash import SimHash


def _split_on_pipe(text):
    """Tokenize a pipe-delimited domain string into its individual domains."""
    return text.split("|")


# Hash every padded domain string with a 50-bit SimHash and keep the results
# both as a list (`hashes`) and as a column on the frame.
hasher = SimHash(n_bits=50, tokenizer=_split_on_pipe)
hashes = list(map(hasher.hash, tdf.padded_domain_string))
tdf['simhash'] = hashes

In [None]:
# Persist the table with the padded domains, cohort and simhash columns.
tdf.to_csv("../output/transaction_domain_simhash.csv")

In [None]:
# Display the simhash column.
tdf.simhash

OK, now we have the simhash of each domain series (padded) in the transaction DF.

Now to apply prefixLSH. How is it implemented?

In [None]:
from bitarray import bitarray
# Scratch exploration for prefixLSH: load the first simhash into a bitarray.
ba = bitarray()
min_cluster_size = 50  # not used in this cell
# NOTE(review): per the bitarray docs, frombytes() extends `ba` in place and
# returns None, so `a` is None and the bits live in `ba`. This also encodes
# the characters of the hash's string form rather than the hash value
# itself -- confirm that this is what prefixLSH should split on.
a = ba.frombytes(str.encode(hashes[0]))

In [None]:
# Display the bitarray after the first hash's bytes were loaded into it.
ba

In [None]:
# Number of characters in the first hash string.
len(hashes[0])

In [None]:
# Bytes of the first hash string (str.encode with its default encoding).
str.encode(hashes[0])

---

# Old scratch work

In [None]:
import requests


def _resolve_url(domain, timeout=10):
    """Return the URL reached after requesting ``http://<domain>``, or None.

    A timeout is always set because ``requests`` waits indefinitely by
    default, and request failures (DNS errors, refused connections, timeouts)
    yield None so one dead domain does not abort the whole scan.
    """
    try:
        return requests.get("http://" + domain, timeout=timeout).url
    except requests.RequestException:
        return None


# One entry per unique domain, in the same order as `unique_domains`.
# NOTE(review): the name suggests eTLDs, but the stored value is the response
# URL -- confirm the intent.
etlds = [_resolve_url(d) for d in unique_domains]

Find all "blocked" domains from FLoC

In [None]:
# Flatten the collections held in `blocked` into one de-duplicated list.
# NOTE(review): `blocked` is not assigned in any visible cell, so this fails
# on a fresh kernel unless it is defined elsewhere.
blocked_domains = list(set(itertools.chain.from_iterable(blocked)))

In [None]:
# Probe each unique domain on its own: repeat it 7 times to form a history
# and record the domain as blocked when simulate() raises.
ds = [[d] * 7 for d in unique_domains]
blocked_domains = []
for d in ds:
    try:
        simulate(d)
    except Exception as e:
        # NOTE(review): any exception counts as "blocked" here, not only a
        # rejection by the simulator -- confirm which exception simulate()
        # raises for blocked domains and narrow the handler.
        print(e)
        blocked_domains.append(d[0])  # d holds seven copies of one domain

In [None]:
# Display the domains collected as blocked.
blocked_domains

In [None]:
import socket

In [None]:
# Look up the fully qualified name for the `www.` host of the first blocked
# domain.
socket.getfqdn('www.' + blocked_domains[0])

In [None]:
# Number of cohort results computed by the simulator loop.
len(cohorts)

In [None]:
# NOTE(review): `blocked` is not assigned in any visible cell, so this fails
# on a fresh kernel unless it is defined elsewhere.
len(blocked)

In [None]:
# Inspect the padded domain list stored under index label 3.
tdf.padded_domain[3]

In [None]:
# Empty placeholder passed to simulate() as its second positional argument.
# NOTE(review): what simulate() expects here is not visible in this file.
sorting_cluster_data = ""

In [None]:
# Recompute the cohort for every padded domain list, passing
# sorting_cluster_data as the second positional argument to simulate().
# This overwrites any existing `cohort` column on tdf.
tdf['cohort'] = [simulate(domains, sorting_cluster_data) for domains in tdf.padded_domain]