# PrefixLSH tests with domains data

In [1]:
import sys
sys.path.append('..')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import floc

from utils.data import read_weeks_machines_domains
import prefixLSH

read in some pre-processed sessions data

- starting domains list: take a list of domains with at least 20 domains (n_domains)

- sampling domains list: take another 20 different domains to sample from.

create 20 lists of domains, where each list i in 1,...20 duplicates list (i-1) and then modifies the list by replacing item i with a new domain from the sampling domains list

domains list 1 is the starting domains list

In [2]:
# read in the pre-processed sessions data 
# this maps week,machine_id -> domains set
weeks_machines_domains_fpath = '../output/weeks_machines_domains.csv'
weeks_machines_domains_df = read_weeks_machines_domains(weeks_machines_domains_fpath, nrows=100)
weeks_machines_domains_df.drop(['machine_id', 'domains'], axis=1).head()

reading from ../output/weeks_machines_domains.csv...
... read 100 rows


Unnamed: 0,week,n_domains
0,1,104
1,1,0
2,1,20
3,1,49
4,1,64


In [12]:
n_domains = 20

In [29]:
starting_domains_list = weeks_machines_domains_df[
    (weeks_machines_domains_df.n_domains == n_domains)
].domains.values[2]
starting_domains_list = list(starting_domains_list)
starting_domains_list

['eastmeeteast.com',
 'imslp.org',
 'dantri.com.vn',
 'worldcat.org',
 'facebook.com',
 'mtac.org',
 'microsoft.com',
 'niceguymistakes.com',
 'pianoteachers.com',
 'msn.com',
 'yahoo.com',
 'yamahamusicacademy.com',
 'google.com',
 'googlesyndication.com',
 'libcal.com',
 'claremont.edu',
 'primamusic.com',
 'vnexpress.net',
 'bing.com',
 'live.com']

In [30]:
# use most frequent domains from sampled rows as other domain samples
other_domains = weeks_machines_domains_df.domains.values
other_domains = [d for domains in other_domains for d in domains if d not in starting_domains_list]
other_domains = pd.Series(other_domains).value_counts().head(20).index
other_domains

Index(['youtube.com', 'amazon.com', '247-inc.net', 'wikipedia.org',
       'walmart.com', 'pornhub.com', 'pinterest.com', 'bestbuy.com',
       'twitter.com', 'craigslist.org', 'capitalone.com', 'chase.com',
       'go.com', 'apple.com', 'ebay.com', 'myway.com', 'coupons.com',
       'steampowered.com', 'paypal.com', 'mplxtms.com'],
      dtype='object')

make a dataframe with columns
```
m, domains, simhash, ot_cohort, cohort_k2, cohort_k4, cohort_k8,
```

where 
- each m differs from previous m by one domain
- ot_cohort generated via OT floc
- cohort_k* generated using our prefixLSH with given k

here the domains gradually transition from some unique user's domains to the most popular

We then expect similar cohort IDs to be clustered near each other in the list.

In [31]:
m = list(range(20))
m

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [32]:
domains = [starting_domains_list]
for i in range(19):
    i1_domains = domains[i][:] # copy domains list from i
    i1_domains[i] = other_domains[i] # alter domains by one entry
    domains += [i1_domains]

In [33]:
test_df = pd.DataFrame({'m':m, 'domains':domains})
test_df['simhash'] = test_df.domains.apply(floc.hashes.sim_hash_string)
test_df['ot_cohort'] = test_df.domains.apply(floc.simulate)
for k in [2, 4, 8]:
    cohort_k = 'cohort_k%s'%k
    cohorts_dict = prefixLSH.get_cohorts_dict(test_df.simhash.astype(int), min_k=k)
    test_df[cohort_k] = test_df.simhash.map(cohorts_dict)
test_df

Unnamed: 0,m,domains,simhash,ot_cohort,cohort_k2,cohort_k4,cohort_k8
0,0,"[eastmeeteast.com, imslp.org, dantri.com.vn, w...",989590657387225,28243,4,2,2
1,1,"[youtube.com, imslp.org, dantri.com.vn, worldc...",993989768727249,28446,4,2,2
2,2,"[youtube.com, amazon.com, dantri.com.vn, world...",431039932698072,11052,2,1,1
3,3,"[youtube.com, amazon.com, 247-inc.net, worldca...",466223243889104,12147,2,1,1
4,4,"[youtube.com, amazon.com, 247-inc.net, wikiped...",187016005717464,4778,1,1,1
5,5,"[youtube.com, amazon.com, 247-inc.net, wikiped...",749931616177624,20507,3,2,2
6,6,"[youtube.com, amazon.com, 247-inc.net, wikiped...",178769543204808,4562,1,1,1
7,7,"[youtube.com, amazon.com, 247-inc.net, wikiped...",143619513425362,3359,1,1,1
8,8,"[youtube.com, amazon.com, 247-inc.net, wikiped...",178254213594832,4553,1,1,1
9,9,"[youtube.com, amazon.com, 247-inc.net, wikiped...",184851115818706,4700,1,1,1
