# PrefixLSH tests with domains data

In [2]:
from datetime import datetime
import sys
sys.path.append('..')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import floc

from comscore.data import read_weeks_machines_domains
import prefixLSH

read in some pre-processed sessions data

- starting domains list: take the domains of the first sampled row that has exactly 20 domains (n_domains)

- sampling domains list: take another 20 different domains to sample from.

create 20 lists of domains, where each list i in 2,...,20 duplicates list (i-1) and then modifies it by replacing item (i-1) with a new domain from the sampling domains list

domains list 1 is the starting domains list

In [45]:
# Load a 100-row sample of the pre-processed sessions data.
# Per the original note, this maps week,machine_id -> domains set.
weeks_machines_domains_fpath = '../output/weeks_machines_domains.csv'
weeks_machines_domains_df = read_weeks_machines_domains(
    weeks_machines_domains_fpath,
    nrows=100,
)
# Preview the frame without the machine_id and domains columns.
weeks_machines_domains_df.drop(columns=['machine_id', 'domains']).head()

reading from ../output/weeks_machines_domains.csv...
... read 100 rows


Unnamed: 0,week,n_domains
0,1,104
1,1,0
2,1,20
3,1,49
4,1,64


In [46]:
# Number of domains the starting list must contain; rows are matched on
# the n_domains column being exactly equal to this value.
n_domains = 20

In [47]:
# Take the domains of the first sampled row whose n_domains equals the target.
has_n_domains = weeks_machines_domains_df['n_domains'] == n_domains
matching_domains = weeks_machines_domains_df.loc[has_n_domains, 'domains']
# Convert to a list so that entries can be addressed (and replaced) by position.
# NOTE(review): if the stored value is a set, the resulting list order may
# differ between kernel sessions -- confirm whether a stable order is needed.
starting_domains_list = list(matching_domains.values[0])
starting_domains_list

['photobucket.com',
 'mozilla.org',
 'mapquest.com',
 'refinery29.com',
 'rollingstone.com',
 'tarot.com',
 'sprouts.com',
 'snagajob.com',
 'ourdailysweepstakes.com',
 'messenger.com',
 'shein.com',
 'dreamlandjewelry.com',
 'cafemom.com',
 'numerologist.com',
 'sammsoft.com',
 'safeway.com',
 'itsmycareer.com',
 'yahoo.com',
 'pch.com',
 'wikipedia.org']

In [52]:
# Build the pool of replacement domains: the 20 most frequent domains across
# the sampled rows, skipping any domain already in the starting list.
# A set gives constant-time membership checks instead of scanning the
# starting list once per candidate domain.
starting_domains_set = set(starting_domains_list)
candidate_domains = [
    d
    for row_domains in weeks_machines_domains_df.domains.values
    for d in row_domains
    if d not in starting_domains_set
]
# value_counts() orders by descending frequency; keep the labels of the top 20.
other_domains = pd.Series(candidate_domains).value_counts().head(20).index
other_domains

Index(['google.com', 'facebook.com', 'youtube.com', 'amazon.com', 'msn.com',
       'bing.com', 'live.com', '247-inc.net', 'walmart.com', 'pornhub.com',
       'pinterest.com', 'bestbuy.com', 'twitter.com', 'capitalone.com',
       'craigslist.org', 'apple.com', 'chase.com', 'go.com', 'ebay.com',
       'myway.com'],
      dtype='object')

make a dataframe with columns
```
m, domains, simhash, ot_cohort, cohort_k2, cohort_k4, cohort_k8,
```

where 
- each m differs from previous m by one domain
- ot_cohort generated via OT floc
- cohort_k* generated using our prefixLSH with given k

Here the domains gradually transition from one particular user's domains to the most popular domains.

In [55]:
# Step indices 0..19, one per domains list in the test dataframe.
m = [*range(20)]
m

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [70]:
# Start from the unmodified list, then derive each subsequent list from the
# most recently built one by swapping a single position.
domains = [starting_domains_list]
for i in range(19):
    # Work on a copy so the previously built list stays untouched.
    next_domains = list(domains[-1])
    # Position i takes the i-th replacement domain.
    next_domains[i] = other_domains[i]
    domains.append(next_domains)

In [76]:
# One row per step: the step index and its domains list.
test_df = pd.DataFrame({'m': m, 'domains': domains})
# Hash each domains list, and compute the OT cohort via floc.simulate.
test_df['simhash'] = test_df['domains'].apply(floc.hashes.sim_hash_string)
test_df['ot_cohort'] = test_df['domains'].apply(floc.simulate)
# Add one prefixLSH cohort column per minimum-k setting.
for k in (2, 4, 8):
    column_name = f'cohort_k{k}'
    simhashes = test_df['simhash'].astype(int)
    # The returned dict is used below to look up each row's value by simhash.
    cohorts_by_simhash = prefixLSH.get_cohorts_dict(simhashes, min_k=k)
    test_df[column_name] = test_df['simhash'].map(cohorts_by_simhash)
test_df

Unnamed: 0,m,domains,simhash,ot_cohort,cohort_k2,cohort_k4,cohort_k8
0,0,"[photobucket.com, mozilla.org, mapquest.com, r...",532288064550496,14043,2,1,1
1,1,"[google.com, mozilla.org, mapquest.com, refine...",461921065203298,12062,2,1,1
2,2,"[google.com, facebook.com, mapquest.com, refin...",426736190256674,10921,2,1,1
3,3,"[google.com, facebook.com, youtube.com, refine...",426736189732450,10921,2,1,1
4,4,"[google.com, facebook.com, youtube.com, amazon...",426744914122338,10921,2,1,1
5,5,"[google.com, facebook.com, youtube.com, amazon...",427286314718752,10942,2,1,1
6,6,"[google.com, facebook.com, youtube.com, amazon...",426732196808288,10921,2,1,1
7,7,"[google.com, facebook.com, youtube.com, amazon...",470716689483362,12277,2,1,1
8,8,"[google.com, facebook.com, youtube.com, amazon...",470715649262178,12277,2,1,1
9,9,"[google.com, facebook.com, youtube.com, amazon...",1034215358694082,30344,3,2,2
