In [98]:
import json
import os
import plotly.graph_objects as go
import numpy as np
import matplotlib.pyplot as plt
import tldextract
from adblockparser import AdblockRules
import pprint

# Har to Json (Part 1)

In [99]:
dir_LTE = './10site_LTE/'
dir_WiFi = './10site_WiFi/'

In [100]:
def create_json_from_har(dir):
  entries = os.listdir(dir)
  connection_type = 'LTE' if 'LTE' in dir else 'wifi'
  for entry in entries:
    if '.har' in entry:
      json_file_name = str(entry).replace('.har','.json')
      print (json_file_name)
#       dir_name = './10site_LTE/' if 'LTE' in dir else './10site_WiFi/'
      dir_name = dir
      os.system(('mkdir {}json').format(dir_name))
      os.system(("pagexray --pretty --includeAssets {}{} > {}json/{}").format(dir_name,entry,dir_name,json_file_name))

        
def remove_brackets_from_json(dir):
  entries = os.listdir(dir)
  # print entries
  for entry in entries:
    with open(dir+'/'+entry) as file:
      file.seek(0,os.SEEK_END)
      print (file.readline())
      file.close()

In [101]:
create_json_from_har(dir_LTE)

m.youtube.com.json
www.amazon.com.json
m.twitch.tv.json
www.walmart.com.json
www.espn.com.json
www.microsoft.com.json
www.chase.com.json
www.netflix.com.json
www.cnn.com.json
www.ebay.com.json


In [102]:
create_json_from_har(dir_WiFi)

m.youtube.com.json
www.amazon.com.json
m.twitch.tv.json
www.walmart.com.json
www.espn.com.json
www.microsoft.com.json
www.chase.com.json
www.netflix.com.json
www.cnn.com.json
www.ebay.com.json


# Tracker Detection (Part 2)

In [103]:
pp = pprint.PrettyPrinter(indent=2)

def mime_type_to_easylist_type(mime_type):
  switcher = {
   "html": "document",
   "application/x-javascript": "script",
   "javascript": "script",
   "css": "stylesheet",
   "image/svg+xml": "image",
   "image/webp": "image",
   "image": "image",
   "image/gif": "image",
   "image/x-icon": "image",
   "image/jpeg": "image",
   "application/font-woff2": "font",
   "font-woff2": "font",
   "video/mp4": "media",
   "favicon":" image",
   "other": "other",
   "plain": "other",
   "json": "other"
  }
  return switcher.get(mime_type, "other")


def load_rules (filename):
  lineList = []
  with open (filename) as f:
    lineList = f.readlines()
  return lineList


def load_scripts_from_json(filename):
  requested_files = []
  with open (filename) as f:
    parsed_json = json.load(f)
#     print ("printing now")
#     print (filename)
#     return
    parsed_json = parsed_json[0]
    f.close()
  for i in parsed_json["assets"]:
    requested_files.append(i)
  return requested_files


def find_tracker_urls(dir): #finds third party tracker urls
  entries = os.listdir(dir)
  wifi_dict = {}
  count = 0
  for entry in entries:
    requested_files = load_scripts_from_json(dir+entry) #actually these are the requested objects
    entry = entry.replace(".json","")
    print (entry)
    wifi_dict[entry] = {}
    wifi_dict[entry]["first_party"] = []
    wifi_dict[entry]["third_party"] = []
    webpage_domain = tldextract.extract(entry).domain
    for i in requested_files:
      url_request_domain = tldextract.extract(i["url"]).domain
      requested_file_type = mime_type_to_easylist_type(i["type"])
      if webpage_domain in url_request_domain:
        options = {'third_party':'False',requested_file_type:'True'}
        if rules.should_block(i["url"],options) == True:
          wifi_dict[entry]["first_party"].append(i["url"])
      else:
        options = {'third_party':'True',requested_file_type:'True'}
        if rules.should_block(i["url"],options) == True:
          wifi_dict[entry]["third_party"].append(i["url"])
  return wifi_dict

def get_numbers(url_dict):
  num_dict = {}
  for url in url_dict:
    num_dict[url] = {}
    num_dict[url]["first_party"] = len(url_dict[url]["first_party"])
    num_dict[url]["third_party"] = len(url_dict[url]["third_party"])
  return num_dict



In [104]:
filename = "easyprivacy.txt"
raw_rules = load_rules(filename)
rules = AdblockRules(raw_rules)

In [105]:
dir_LTE_json = './10site_LTE/json/'
dir_WiFi_json = './10site_WiFi/json/'

In [106]:
total_trackers = find_tracker_urls(dir_LTE_json) #stores complete information about all detected trackers
tracker_numbers = get_numbers(total_trackers) #stores the number of trackers detected per website

output_total_file_name = "total_trackers_LTE.json"
output_number_file_name = "number_trackers_LTE.json"


with open(output_total_file_name, 'w') as fp:
  json.dump(total_trackers , fp,indent = 4)
  fp.close()
with open(output_number_file_name, 'w') as fp:
  json.dump(tracker_numbers, fp,indent = 4)
  fp.close()

m.twitch.tv
www.espn.com
www.chase.com
www.ebay.com
m.youtube.com
www.cnn.com
www.amazon.com
www.walmart.com
www.microsoft.com
www.netflix.com


In [107]:
total_trackers = find_tracker_urls(dir_WiFi_json) #stores complete information about all detected trackers
tracker_numbers = get_numbers(total_trackers) #stores the number of trackers detected per website

output_total_file_name = "total_trackers_WIFI.json"
output_number_file_name = "number_trackers_WIFI.json"


with open(output_total_file_name, 'w') as fp:
  json.dump(total_trackers , fp,indent = 4)
  fp.close()
with open(output_number_file_name, 'w') as fp:
  json.dump(tracker_numbers, fp,indent = 4)
  fp.close()

m.twitch.tv
www.espn.com
www.chase.com
www.ebay.com
m.youtube.com
www.cnn.com
www.amazon.com
www.walmart.com
www.microsoft.com
www.netflix.com


# Plots (Part 3)

In [108]:
websites = []
with open("USA.txt") as file:
    for line in file:
        websites.append(line.replace('\n',''))
print (websites)

['m.twitch.tv', 'm.youtube.com', 'www.amazon.com', 'www.chase.com', 'www.cnn.com', 'www.ebay.com', 'www.espn.com', 'www.microsoft.com', 'www.netflix.com', 'www.walmart.com']


In [109]:
# get lte data
lte_websites = []
data_lte = {}
with open("number_trackers_LTE.json") as file:
    data_lte = json.load(file)
    file.close()
for url in data_lte:
    lte_websites.append(url)
print (data_lte)

{'m.twitch.tv': {'first_party': 0, 'third_party': 2}, 'www.espn.com': {'first_party': 4, 'third_party': 32}, 'www.chase.com': {'first_party': 10, 'third_party': 3}, 'www.ebay.com': {'first_party': 12, 'third_party': 2}, 'm.youtube.com': {'first_party': 1, 'third_party': 0}, 'www.cnn.com': {'first_party': 4, 'third_party': 52}, 'www.amazon.com': {'first_party': 21, 'third_party': 0}, 'www.walmart.com': {'first_party': 17, 'third_party': 17}, 'www.microsoft.com': {'first_party': 11, 'third_party': 12}, 'www.netflix.com': {'first_party': 0, 'third_party': 0}}


In [110]:
# get wifi data
wifi_websites = []
data_wifi = {}
with open("number_trackers_WIFI.json") as file:
    data_wifi = json.load(file)
    file.close()
for url in data_wifi:
    wifi_websites.append(url)
print (data_wifi)


{'m.twitch.tv': {'first_party': 0, 'third_party': 2}, 'www.espn.com': {'first_party': 5, 'third_party': 33}, 'www.chase.com': {'first_party': 10, 'third_party': 3}, 'www.ebay.com': {'first_party': 11, 'third_party': 2}, 'm.youtube.com': {'first_party': 1, 'third_party': 0}, 'www.cnn.com': {'first_party': 3, 'third_party': 38}, 'www.amazon.com': {'first_party': 18, 'third_party': 0}, 'www.walmart.com': {'first_party': 17, 'third_party': 16}, 'www.microsoft.com': {'first_party': 11, 'third_party': 12}, 'www.netflix.com': {'first_party': 0, 'third_party': 0}}


In [111]:
#get common websites
common_websites = list(set(lte_websites).intersection(wifi_websites))


In [112]:
set1 = (1,2,3,4)
print (type(set1))

<class 'tuple'>


In [113]:



N = 5
wifi_trackers = []
lte_trackers = []
for url in common_websites:
    wifi_trackers.append(data_wifi[url]["first_party"])
    lte_trackers.append(data_lte[url]["first_party"])
# print(wifi_trackers, '\n',lte_trackers)


fig = go.Figure(data=[
    go.Bar(name='Wifi', x=common_websites, y=list(wifi_trackers)),
    go.Bar(name='Lte', x=common_websites, y=list(lte_trackers))
], layout=go.Layout(
        title=go.layout.Title(text="First party tracker comparison")
    ))
# Change the bar mode
fig.update_layout(barmode='group')
fig.show()
# fig.write_image("First party tracker comparison.png")

In [114]:

N = 5
wifi_trackers = []
lte_trackers = []
for url in common_websites:
    wifi_trackers.append(data_wifi[url]["third_party"])
    lte_trackers.append(data_lte[url]["third_party"])
# print(wifi_trackers, '\n',lte_trackers)


fig = go.Figure(data=[
    go.Bar(name='Wifi', x=common_websites, y=list(wifi_trackers)),
    go.Bar(name='Lte', x=common_websites, y=list(lte_trackers))
], layout=go.Layout(
        title=go.layout.Title(text="Third party tracker comparison")
    ))
# Change the bar mode
fig.update_layout(barmode='group')
fig.show()

In [115]:
#finding the common trackers and creating a plot
data_trackers_lte = {}
data_trackers_wifi = {}
data_trackers_common = {}
with open("total_trackers_LTE.json") as file:
    data_trackers_lte = json.load(file)
    file.close()
with open("total_trackers_WIFI.json") as file:
    data_trackers_wifi = json.load(file)
    file.close()
common_trackers_length = {}
for url in common_websites:
    data_trackers_common[url] = set(data_trackers_lte[url]["third_party"]).intersection(data_trackers_wifi[url]["third_party"])
    common_trackers_length[url] = len(data_trackers_common[url])

arr_comm = []
for url in common_trackers_length:
    arr_comm.append(common_trackers_length[url])

fig = go.Figure(
    data=[go.Bar(x=common_websites, y=arr_comm)],
    layout=go.Layout(
        title=go.layout.Title(text="Number of common third party trackers")
    )
)
fig.show()

In [116]:
len(common_websites)

10

In [117]:
lte_only_trackers = {}
wifi_only_trackers = {}
for url in common_websites:
  lte_only_trackers[url] = (set(data_trackers_lte[url]["third_party"]).difference(data_trackers_wifi[url]["third_party"]))
  wifi_only_trackers[url] = set(data_trackers_wifi[url]["third_party"]).difference(data_trackers_lte[url]["third_party"])
# print(len(wifi_only_trackers["www.salesforce.com"]))

In [118]:
wifi_trackers = []
lte_trackers = []
for url in common_websites:
    wifi_trackers.append(len(wifi_only_trackers[url]))
    lte_trackers.append(len(lte_only_trackers[url]))
# print(wifi_trackers, '\n',lte_trackers)


fig = go.Figure(data=[
    go.Bar(name='Wifi only', x=common_websites, y=list(wifi_trackers)),
    go.Bar(name='Lte only', x=common_websites, y=list(lte_trackers)),
    go.Bar(name= 'Common',x=common_websites, y=arr_comm)
], layout=go.Layout(
        title=go.layout.Title(text="Third party tracker comparison")
    ))
# Change the bar mode
fig.update_layout(barmode='group')
fig.show()