## Merge database tables into a flat dataframe

In [1]:
import sqlite3
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()
pd.set_option("display.max_colwidth", 60)
pd.set_option("display.max_rows", 100)

In [2]:
RELEASE_DB_NAME = "/scratch/arjunth2001/princeton/release_db.sqlite"
conn = sqlite3.connect(RELEASE_DB_NAME)

### Read in the tables

In [3]:
# we'll merge everything into this dataframe
df = pd.read_sql_query("SELECT * FROM policy_snapshots", conn)

sites_df = pd.read_sql_query("SELECT * FROM sites", conn)
policy_texts_df = pd.read_sql_query("SELECT * FROM policy_texts", conn)
alexa_ranks_df = pd.read_sql_query("SELECT * FROM alexa_ranks", conn)

In [4]:
# Left join with policy text table
df = pd.merge(df, policy_texts_df, how="left", left_on="policy_text_id", right_on="id")

# Left join with sites table
df = pd.merge(df, sites_df, how="left", left_on="site_id", right_on="id")

# Left join with alexa ranks table
df = pd.merge(df, alexa_ranks_df, how="left", on=['site_id', 'year', 'phase'])

In [5]:
len(df)

1071487

In [6]:
df.head(3)

Unnamed: 0,id_x,site_id,homepage_snapshot_url,policy_snapshot_url,policy_url,homepage_snapshot_redirected_url,year,phase,policy_text_id,policy_html_id,...,flesch_kincaid,smog,flesch_ease,length,sha1,simhash,id,domain,categories,rank
0,1,31160,https://web.archive.org/web/20031122123614id_/http%3A//d...,https://web.archive.org/web/20031207185149id_/http%3A//w...,docusign.com/eDocuSign/privacy.asp,https://web.archive.org/web/20031122123614id_/http://www...,2003,B,1,1,...,14.938384,18.351184,difficult,14137,37519a313f07f98fdbb51f13779ed7d6cb1b4468,862964595037544195,31160,docusign.com,business;informationtech,
1,2,31160,https://web.archive.org/web/20040924214154id_/http%3A//d...,https://web.archive.org/web/20040816044548id_/http%3A//w...,docusign.com/privacy.asp,https://web.archive.org/web/20041014005441id_/http://www...,2004,B,2,2,...,13.762202,18.17146,difficult,14366,837c2e1fdab142cebb3cc008880309ee94a6ecbf,863527407568723719,31160,docusign.com,business;informationtech,
2,3,31160,https://web.archive.org/web/20061026200451id_/http%3A//d...,https://web.archive.org/web/20061219215634id_/http%3A//w...,docusign.com/resources/privacy_policy.php,https://web.archive.org/web/20061026200451id_/http://www...,2006,B,3,3,...,13.640407,18.062587,difficult,14363,9296f990b576e40dec5d0e54552918247192bd8f,4321728971435908871,31160,docusign.com,business;informationtech,


In [7]:
years = list(df["year"].unique())

In [8]:
labels=['First Party Collection/Use',
          'Third Party Sharing/Collection',
          'Other',
          'International and Specific Audiences',
          'Data Security',
          'User Choice/Control',
          'User Access, Edit and Deletion',
          'Data Retention',
          'Policy Change',
          'Do Not Track']

In [9]:
count = {y: {l:0 for l in labels} for y in years[:len(years)-3]}     

In [10]:
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

In [11]:
name = "arjunth2001/priv_ftc"
model = AutoModelForSequenceClassification.from_pretrained(name)
tokenizer = AutoTokenizer.from_pretrained(name)

In [12]:
pipe = pipeline('text-classification', model=model, tokenizer=tokenizer,device=0)

In [13]:
def get_preds(y,x):
    body = [i for i in x.split("\n") if len(i)!=0 and i!="" and len(i.split())>=10]
    global count
    try:
        preds = list(set(i["label"] for i in pipe(body)))
        for p in preds:
            count[y][p]+=1
        return preds
    except:
        return []

In [14]:
for y in years[:len(years)-3]:
    year_df = df[df["year"]==y].reset_index()
    samples = year_df[year_df["phase"]=="B"]
    samples = samples.sample(1000).reset_index()
    samples["policy_text"].progress_apply(lambda x:get_preds(y,x))

  0%|          | 0/10 [00:00<?, ?it/s]

  cpuset_checked))


  0%|          | 0/10 [00:00<?, ?it/s]

  cpuset_checked))


  0%|          | 0/10 [00:00<?, ?it/s]

  cpuset_checked))


  0%|          | 0/10 [00:00<?, ?it/s]

  cpuset_checked))


  0%|          | 0/10 [00:00<?, ?it/s]

  cpuset_checked))


  0%|          | 0/10 [00:00<?, ?it/s]

  cpuset_checked))


  0%|          | 0/10 [00:00<?, ?it/s]

  cpuset_checked))


  0%|          | 0/10 [00:00<?, ?it/s]

  cpuset_checked))


  0%|          | 0/10 [00:00<?, ?it/s]

  cpuset_checked))


  0%|          | 0/10 [00:00<?, ?it/s]

  cpuset_checked))


  0%|          | 0/10 [00:00<?, ?it/s]

  cpuset_checked))


  0%|          | 0/10 [00:00<?, ?it/s]

  cpuset_checked))


  0%|          | 0/10 [00:00<?, ?it/s]

  cpuset_checked))


  0%|          | 0/10 [00:00<?, ?it/s]

  cpuset_checked))


  0%|          | 0/10 [00:00<?, ?it/s]

  cpuset_checked))


  0%|          | 0/10 [00:00<?, ?it/s]

  cpuset_checked))
Token indices sequence length is longer than the specified maximum sequence length for this model (965 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/10 [00:00<?, ?it/s]

  cpuset_checked))


  0%|          | 0/10 [00:00<?, ?it/s]

  cpuset_checked))


  0%|          | 0/10 [00:00<?, ?it/s]

  cpuset_checked))


  0%|          | 0/10 [00:00<?, ?it/s]

  cpuset_checked))


In [32]:
perc= {y:{l: (count[y][l]/1000)*100 for l in labels} for y in years[:len(years)-3]}

In [33]:
print(perc[2002]["International and Specific Audiences"])

20.0

In [36]:
data = {str(k):v for k,v in perc.items()}

In [37]:
with open("data_analysis.json","w") as f:
    import json
    json.dump(data,f, indent=4)