# Domain matches BN

## Load data from Gaffer

In [1]:
import requests
import json
import sys
import gaffer
import threatgraph
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy import stats
import csv

In [2]:
g = threatgraph.Gaffer()
g.use_cert()

### Matches

In [3]:
op = gaffer.GetAllElements(entities=None, edges=["matches"])
r = g.execute_chunked(op)

matches = {}

for v in r:

    try:
        obj = json.loads(v)
    except:
        # Ignore lines which don't parse as JSON
        continue

    src = obj["source"]
    dest = obj["destination"]
    
    if not src in matches:
        matches[src] = []
    matches[src].append({
        "blacklist": dest,
        "latest": obj["properties"]["time"]["uk.gov.gchq.gaffer.time.RBMBackedTimestampSet"]["latest"]
    })

r.close()

matches

{'64.233.181.94': [{'blacklist': 'vt.0029b60007302b92917a32341c63c084256854546cca0c4ac7fc394839d3a1c5',
   'latest': 1528614000.0},
  {'blacklist': 'vt.04846ceeab4303097395e1be24b72f8d26d8784b6065fff98b5d54bf3d023542',
   'latest': 1528578000.0},
  {'blacklist': 'vt.0eca041f195b643c07da8e56dbf4b7b5ff80188f29f619ec062db1b24e27009e',
   'latest': 1527984000.0},
  {'blacklist': 'vt.1382c3cd4ca1fb978233cf8178c98235ef2920ae87adb7ad8f19ab7db28bee25',
   'latest': 1528484400.0},
  {'blacklist': 'vt.1781d5ed9e93239e48765d8966bb5dcaf324a24f7b106e6c66718d15ba5c8dc3',
   'latest': 1528567200.0},
  {'blacklist': 'vt.1b141a4e88ebb88f1f7648a6dcf0574fae206c00e20536c0280d5f46c31fbbc8',
   'latest': 1528570800.0},
  {'blacklist': 'vt.1d3cbfca370f8bbc2b33f6317c03fab0d559698ed1af90955219048774db62cb',
   'latest': 1528473600.0},
  {'blacklist': 'vt.1d920e934f34e1ee8ef1102a859a125514fe3707bf4a1516236587772a465df4',
   'latest': 1528578000.0},
  {'blacklist': 'vt.1de851b56e40b8b731aeaeaf3b4251ce3145f379d3b

### Domains

In [4]:
op = gaffer.OperationChain([
    gaffer.GetAllElements(entities=[
        gaffer.ViewGroup("device", exclude=["count", "time"])
    ], edges=None),
    gaffer.GetWalks(
        operations=[
            gaffer.OperationChain(operations=[
                gaffer.GetElements(edges=[
                    gaffer.ViewGroup("hasip", exclude=["count", "time"])
                ], entities=None, include="OUTGOING")
            ]), 
            gaffer.OperationChain(operations=[
                gaffer.GetElements(edges=[
                    gaffer.ViewGroup("dnsquery")
                ], entities=None, include="OUTGOING")
            ]), 
             gaffer.OperationChain(operations=[
                gaffer.GetElements(edges=[
                    gaffer.ViewGroup("indomain", exclude=["count", "time"])
                ], entities=None, include="OUTGOING")
            ])
        ],
        limit=10000000)
])

query = op.encode()
res = g.execute_chunked(op)

things = {}
timestamps = {}
counts = {}

all_things = set()
all_devices = set()
all_timestamps = set()

for v in res:

    try:
        ent = json.loads(v)

    except Exception:
        continue

    dev = list(ent["entities"][0].keys())[0]
    thing = list(ent["entities"][3].keys())[0]
    
    k = (dev, thing)

    if not k in counts:
        counts[k] = 0
        timestamps[k] = set()
        
    if not dev in things:
        things[dev] = set()

    ts = ent["edges"][1][0]["properties"]["time"]["uk.gov.gchq.gaffer.time.RBMBackedTimestampSet"]["timestamps"]
    cnt = ent["edges"][1][0]["properties"]["count"]

    counts[k] += cnt
    timestamps[k].update(ts)
    things[dev].add(thing)
    all_things.add(thing)
    all_devices.add(dev)
    all_timestamps.update(ts)
    
res.close()

In [5]:
print(len(all_timestamps))

908


In [6]:
all_devices = list(all_devices)
all_things = list(all_things)

## Top domains

In [7]:
def get_counts(thing, dev):
    if thing in things[dev]: return counts[(dev, thing)]
    return 0

datas = [ [ get_counts(thing, dev) for dev in all_devices ] for thing in all_things]
df = pd.DataFrame(datas, index=all_things, columns=all_devices)

# Add a score column
df["score"] = df.sum(axis=1)
df

Unnamed: 0,theatergoing-mac,appropriate-android,dgmac,tirasse-mac,calcannea-mac,simon-macpro,abroad-android,raymond-mac,venkata,rarb-mac,...,nervings-cb,essential-chrome,languid-mac,minesweepers-mac,ludicrous-mac,mark-mac,MalcomWare-PC,turing-minimacbook,LAPTOP-1I501C4U,score
eff.org,0,0,0,0,0,0,0,0,0,2,...,0,0,0,14,0,0,0,0,0,75
leadmon.net,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
desk.com,0,0,0,0,0,0,0,1,0,0,...,0,0,8,0,0,0,0,0,0,9
digits.com,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
gettally.com,1,0,0,0,0,0,0,0,0,0,...,0,0,5,1,1,0,0,3,0,83
ipfind.co,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
vidiom.net,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,5
fox5vegas.com,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
penton.com,0,0,0,0,0,0,0,0,0,0,...,0,2,0,2,0,0,0,0,0,6
kaspersky.com,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [8]:
num_domains = 1000
top_domains = df.sort_values(by="score", ascending=False).index.values[:num_domains]
top_domains

array(['google.com', 'gstatic.com', 'googleapis.com', 'trustnetworks.com',
       'apple.com', 'in-addr.arpa', 'clubhouse.io', 'doubleclick.net',
       'akadns.net', 'netgear.com', 'slack.com', 'amazonaws.com',
       'googleusercontent.com', 'github.com', 'facebook.com',
       'akamaiedge.net', 'microsoft.com', 'google.co.uk', 'icloud.com',
       'apple-dns.net', 'amazon.com', 'fedoraproject.org',
       'virustotal.com', 'cloudapp.net', 'cloudfront.net', 'bbc.co.uk',
       'google-analytics.com', 'utun1.viscosity', 'codeanywhere.com',
       'rbxcdn.com', 'adnxs.com', 'adsafeprotected.com',
       'utun2.viscosity', 'ea.com', 'skype.com', 'slack-edge.com',
       'intercom.io', 'githubusercontent.com', 'googlesyndication.com',
       'hubspot.com', 'linkedin.com', 'fastly.net', 'bbci.co.uk',
       'slack-msgs.com', 'imrworldwide.com', 'cymru.com', 'ddns.net',
       'twitter.com', 'pubmatic.com', 'trafficmanager.net',
       'gravatar.com', 'doubleverify.com', 'quantserve.com',


## Selected domains

In [9]:
selected=top_domains[:1000]
selected

array(['google.com', 'gstatic.com', 'googleapis.com', 'trustnetworks.com',
       'apple.com', 'in-addr.arpa', 'clubhouse.io', 'doubleclick.net',
       'akadns.net', 'netgear.com', 'slack.com', 'amazonaws.com',
       'googleusercontent.com', 'github.com', 'facebook.com',
       'akamaiedge.net', 'microsoft.com', 'google.co.uk', 'icloud.com',
       'apple-dns.net', 'amazon.com', 'fedoraproject.org',
       'virustotal.com', 'cloudapp.net', 'cloudfront.net', 'bbc.co.uk',
       'google-analytics.com', 'utun1.viscosity', 'codeanywhere.com',
       'rbxcdn.com', 'adnxs.com', 'adsafeprotected.com',
       'utun2.viscosity', 'ea.com', 'skype.com', 'slack-edge.com',
       'intercom.io', 'githubusercontent.com', 'googlesyndication.com',
       'hubspot.com', 'linkedin.com', 'fastly.net', 'bbci.co.uk',
       'slack-msgs.com', 'imrworldwide.com', 'cymru.com', 'ddns.net',
       'twitter.com', 'pubmatic.com', 'trafficmanager.net',
       'gravatar.com', 'doubleverify.com', 'quantserve.com',


## Rework as matrix

In [10]:
datas = []

for ts in all_timestamps:
    for dev in all_devices:
        row = [dev, ts]
        for thing in selected:
            val = False
            k = (dev, thing)
            if k in timestamps:
                if ts in timestamps[k]:
                    val = True
            row.append(val)
        datas.append(row)

In [11]:
cols = ["device", "time"]
cols.extend(selected)
cols

['device',
 'time',
 'google.com',
 'gstatic.com',
 'googleapis.com',
 'trustnetworks.com',
 'apple.com',
 'in-addr.arpa',
 'clubhouse.io',
 'doubleclick.net',
 'akadns.net',
 'netgear.com',
 'slack.com',
 'amazonaws.com',
 'googleusercontent.com',
 'github.com',
 'facebook.com',
 'akamaiedge.net',
 'microsoft.com',
 'google.co.uk',
 'icloud.com',
 'apple-dns.net',
 'amazon.com',
 'fedoraproject.org',
 'virustotal.com',
 'cloudapp.net',
 'cloudfront.net',
 'bbc.co.uk',
 'google-analytics.com',
 'utun1.viscosity',
 'codeanywhere.com',
 'rbxcdn.com',
 'adnxs.com',
 'adsafeprotected.com',
 'utun2.viscosity',
 'ea.com',
 'skype.com',
 'slack-edge.com',
 'intercom.io',
 'githubusercontent.com',
 'googlesyndication.com',
 'hubspot.com',
 'linkedin.com',
 'fastly.net',
 'bbci.co.uk',
 'slack-msgs.com',
 'imrworldwide.com',
 'cymru.com',
 'ddns.net',
 'twitter.com',
 'pubmatic.com',
 'trafficmanager.net',
 'gravatar.com',
 'doubleverify.com',
 'quantserve.com',
 'azureedge.net',
 'origin.com',

In [12]:
df = pd.DataFrame(datas, columns = cols)
df = df.set_index(["device", "time"])

In [13]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,google.com,gstatic.com,googleapis.com,trustnetworks.com,apple.com,in-addr.arpa,clubhouse.io,doubleclick.net,akadns.net,netgear.com,...,appnexusgslb.net,waitrosecellar.com,ripe.net,alerta.io,robtex.com,staticflickr.com,docusign.net,yotpo.com,yandex.ru,ably.io
device,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
theatergoing-mac,1.531238e+09,True,True,True,True,True,True,True,True,True,False,...,False,False,False,False,False,False,False,False,False,False
appropriate-android,1.531238e+09,True,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
dgmac,1.531238e+09,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
tirasse-mac,1.531238e+09,True,True,True,True,True,True,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
calcannea-mac,1.531238e+09,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
simon-macpro,1.531238e+09,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
abroad-android,1.531238e+09,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
raymond-mac,1.531238e+09,True,True,True,False,True,True,True,True,False,False,...,False,False,False,False,False,False,False,False,False,False
venkata,1.531238e+09,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
rarb-mac,1.531238e+09,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [14]:
csvcols = list(df.index.names) + list(df.columns)
csvmap = {csvcols[v]: v for v in range(len(csvcols))}

with open('domains.csv', 'w') as csvfile:
    csvwriter = csv.DictWriter(csvfile, fieldnames=csvcols)
    csvwriter.writeheader()
    for ix, row in df.iterrows():
        row = [1 if v else 0 for v in list(row)]
        r = {}
        for i in range(len(list(df.index.names))):
            r[df.index.names[i]] = ix[i]
        for i in range(len(list(row))):
            r[df.columns[i]] = row[i]
        csvwriter.writerow(r)
            

In [15]:
csvmap

{'device': 0,
 'time': 1,
 'google.com': 2,
 'gstatic.com': 3,
 'googleapis.com': 4,
 'trustnetworks.com': 5,
 'apple.com': 6,
 'in-addr.arpa': 7,
 'clubhouse.io': 8,
 'doubleclick.net': 9,
 'akadns.net': 10,
 'netgear.com': 11,
 'slack.com': 12,
 'amazonaws.com': 13,
 'googleusercontent.com': 14,
 'github.com': 15,
 'facebook.com': 16,
 'akamaiedge.net': 17,
 'microsoft.com': 18,
 'google.co.uk': 19,
 'icloud.com': 20,
 'apple-dns.net': 21,
 'amazon.com': 22,
 'fedoraproject.org': 23,
 'virustotal.com': 24,
 'cloudapp.net': 25,
 'cloudfront.net': 26,
 'bbc.co.uk': 27,
 'google-analytics.com': 28,
 'utun1.viscosity': 29,
 'codeanywhere.com': 30,
 'rbxcdn.com': 31,
 'adnxs.com': 32,
 'adsafeprotected.com': 33,
 'utun2.viscosity': 34,
 'ea.com': 35,
 'skype.com': 36,
 'slack-edge.com': 37,
 'intercom.io': 38,
 'githubusercontent.com': 39,
 'googlesyndication.com': 40,
 'hubspot.com': 41,
 'linkedin.com': 42,
 'fastly.net': 43,
 'bbci.co.uk': 44,
 'slack-msgs.com': 45,
 'imrworldwide.com'