# Domain rank correlation

## Load data from Gaffer

In [2]:
import requests
import json
import sys
import gaffer
import threatgraph
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [3]:
# Create the Gaffer client through the threatgraph wrapper.
g = threatgraph.Gaffer()
# NOTE(review): presumably switches the client to certificate-based
# authentication -- confirm against the threatgraph module.
g.use_cert()

In [4]:
# Query: start from every "device" entity, then walk three outgoing hops
# (hasip -> dnsquery -> indomain), with the walk limit set to 10000000.
# Only the dnsquery view keeps its "count" and "time" properties; the other
# views exclude them.
op = gaffer.OperationChain([
    # Seed step: all "device" entities, no edges.
    gaffer.GetAllElements(entities=[
        gaffer.ViewGroup("device", exclude=["count", "time"])
    ], edges=None),
    gaffer.GetWalks(
        operations=[
            # Hop 1: outgoing "hasip" edges.
            gaffer.OperationChain(operations=[
                gaffer.GetElements(edges=[
                    gaffer.ViewGroup("hasip", exclude=["count", "time"])
                ], entities=None, include="OUTGOING")
            ]), 
            # Hop 2: outgoing "dnsquery" edges, properties retained.
            gaffer.OperationChain(operations=[
                gaffer.GetElements(edges=[
                    gaffer.ViewGroup("dnsquery")
                ], entities=None, include="OUTGOING")
            ]), 
            # Hop 3: outgoing "indomain" edges.
             gaffer.OperationChain(operations=[
                gaffer.GetElements(edges=[
                    gaffer.ViewGroup("indomain", exclude=["count", "time"])
                ], entities=None, include="OUTGOING")
            ])
        ],
        limit=10000000)
])

# Encoded form of the chain; it is not used again in this cell.
query = op.encode()
# Run the chain; the result is iterated item by item by the loop that follows.
res = g.execute_chunked(op)

# Accumulators filled from the walk results:
#   counts[(device, domain)]     -> summed "count" of the dnsquery edges
#   timestamps[(device, domain)] -> union of the dnsquery edge timestamps
#   things[device]               -> set of domains reached from that device
counts = {}
timestamps = {}
things = {}

all_devices = set()
all_things = set()

for v in res:

    # Anything that does not parse as JSON is skipped.
    try:
        ent = json.loads(v)
    except Exception:
        continue

    # The first entity of the walk is keyed by the device, the fourth by the
    # domain at the end of the indomain hop.
    dev = list(ent["entities"][0])[0]
    thing = list(ent["entities"][3])[0]
    k = (dev, thing)

    # First sighting of this pair / device: create empty accumulators.
    counts.setdefault(k, 0)
    timestamps.setdefault(k, set())
    things.setdefault(dev, set())

    # The second edge of the walk (the dnsquery hop) carries the properties.
    props = ent["edges"][1][0]["properties"]
    ts = props["time"]["uk.gov.gchq.gaffer.time.RBMBackedTimestampSet"]["timestamps"]
    cnt = props["count"]

    counts[k] += cnt
    timestamps[k].update(ts)
    things[dev].add(thing)
    all_things.add(thing)
    all_devices.add(dev)

In [5]:
# Turn the sets into lists so they can serve as DataFrame column and row
# labels; the resulting order is whatever set iteration yields.
all_devices = list(all_devices)
all_things = list(all_things)

## Construct heatmap matrix

In [95]:
def get_counts(thing, dev):
    """Return the accumulated query count for *thing* on device *dev*.

    Gives 0 when *thing* is not in ``things[dev]``.  A *dev* that is missing
    from ``things`` raises ``KeyError``.
    """
    seen_on_device = things[dev]
    if thing not in seen_on_device:
        return 0
    return counts[(dev, thing)]

# Count matrix: one row per domain, one column per device.
datas = [ [ get_counts(thing, dev) for dev in all_devices ] for thing in all_things]
df = pd.DataFrame(datas, index=all_things, columns=all_devices)
# Row total across all device columns.
df["score"] = df.sum(axis=1)
df

Unnamed: 0,minesweepers-mac,misskitty-mac,calcannea-mac,turing-macbook,simon-macpro,turing-minimacbook,MalcomWare-PC,LAPTOP-1I501C4U,ludicrous-mac,elysium-mac,...,daniel-chromebook,rarb-mac,dgmac,theatergoing-mac,DESKTOP-PIMD8C0,castle3b7c9f,nervings-cb,turing-chromebook,serotinal-mac,score
satis.fi,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
presscdn.com,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
wikimedia.org,8,86,0,0,6,6,0,3,0,9,...,0,3,0,1,3,0,0,4,9,287
doi.org,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,18
creditcards.org,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
ayads.co,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,45
phiphiohara.com,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
cloudendure.com,0,10,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,10
smallbusiness.co.uk,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
official-coupons.com,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [96]:
# Number of leading domains taken per device in the cells that follow.
n = 100
# Maximum length of the global ranking.
tops = 10000
# Domain names ordered by total score, highest first, cut off at `tops`.
top_domains = df.sort_values(by="score", ascending=False).index.values[:tops]
top_domains

array(['google.com', 'gstatic.com', 'trustnetworks.com', ...,
       'eternallifestyle.com', 'amazonbusiness.com', 'newsinc.com'],
      dtype=object)

In [97]:
# Spot check for a single device: the index labels of the first n rows after
# sorting by that device's own count, highest first.
df.sort_values(by="mark-mac", ascending=False)[["mark-mac", "score"]][:n]["mark-mac"].index.values

array(['facebook.com', 'virustotal.com', 'google.com', 'twisted.ru',
       'bbc.co.uk', 'trustnetworks.com', 'gstatic.com',
       'fedoraproject.org', 'apple.com', 'clubhouse.io', 'in-addr.arpa',
       'bbci.co.uk', 'googleapis.com', 'apility.net', 'doubleclick.net',
       'slack.com', 'google.co.uk', 'googleusercontent.com', 'icloud.com',
       'github.com', 'intercom.io', 'connatix.com', 'voipfone.net',
       'githubusercontent.com', 'google-analytics.com', 'googlevideo.com',
       'doubleverify.com', 'googlesyndication.com', 'slack-edge.com',
       'cloudfront.net', 'apple-dns.net', 'scorecardresearch.com',
       'dynatrace.com', 'twitter.com', 'adnxs.com', 'youtube.com',
       'amazonaws.com', 'ytimg.com', 'chartbeat.net', 'amazon.com',
       'adsafeprotected.com', 'contextweb.com', 'nationalrail.co.uk',
       'bbc.com', 'rubiconproject.com', 'slashdot.org', 'advertising.com',
       'consensu.org', 'fbcdn.net', 'slack-msgs.com', 'tunnelblick.net',
       'twimg.com', '

In [98]:
# One [device, domains] pair per device, where domains holds the index labels
# of the first n rows after sorting by that device's column, highest first.
fav_domains = [
    [device, df.sort_values(by=device, ascending=False).index.values[:n]]
    for device in all_devices
]
fav_domains

[['minesweepers-mac',
  array(['google.com', 'github.com', 'in-addr.arpa', 'apple.com',
         'clubhouse.io', 'gstatic.com', 'googleapis.com', 'akadns.net',
         'slack.com', 'doubleclick.net', 'google.co.uk',
         'googleusercontent.com', 'icloud.com', 'amazon.com',
         'utun1.viscosity', 'minesweepers.local', 'githubusercontent.com',
         'amazonaws.com', 'slack-edge.com', 'akadns6.net', 'gravatar.com',
         'trafficmanager.net', 'akamaiedge.net', 'microsoft.com',
         'hubspot.com', 'skype.com', 'trustnetworks.com', 'fastly.net',
         'bbc.co.uk', 'facebook.com', 'apple-dns.net',
         'google-analytics.com', 'vsassets.io', 'slack-imgs.com',
         'segment.io', 'visualstudio.com', 'cloudapp.net',
         'azurewebsites.net', 'pndsn.com', 'slack-msgs.com',
         'googlesyndication.com', 'npmjs.org', 'twitter.com',
         'cloudfront.net', 'bootstrapcdn.com', 'dialpad.com', 'clamxav.com',
         '1password.com', 'msecnd.net', 'adsafeprotec

In [99]:
# One row per device, indexed by device name, holding its array of favourite
# domains.
df2 = pd.DataFrame(fav_domains, columns=["device", "domains"]).set_index("device")
df2

Unnamed: 0_level_0,domains
device,Unnamed: 1_level_1
minesweepers-mac,"[google.com, github.com, in-addr.arpa, apple.c..."
misskitty-mac,"[google.com, akadns.net, googleapis.com, gstat..."
calcannea-mac,"[google.com, googleapis.com, slack.com, crashl..."
turing-macbook,"[google.com, apple.com, in-addr.arpa, akadns.n..."
simon-macpro,"[google.com, gstatic.com, akadns.net, apple.co..."
turing-minimacbook,"[google.com, apple.com, akadns.net, in-addr.ar..."
MalcomWare-PC,"[ddns.net, trustnetworks.com, microsoft.com, g..."
LAPTOP-1I501C4U,"[microsoft.com, trustnetworks.com, mcafee.com,..."
ludicrous-mac,"[in-addr.arpa, google.com, apple.com, utun1.vi..."
elysium-mac,"[google.com, gstatic.com, googleapis.com, goog..."


In [100]:
# Map each globally ranked domain to its zero-based position in top_domains.
top_map = {domain: position for position, domain in enumerate(top_domains)}
top_map

{'google.com': 0,
 'gstatic.com': 1,
 'trustnetworks.com': 2,
 'googleapis.com': 3,
 'apple.com': 4,
 'in-addr.arpa': 5,
 'clubhouse.io': 6,
 'akadns.net': 7,
 'doubleclick.net': 8,
 'netgear.com': 9,
 'slack.com': 10,
 'amazonaws.com': 11,
 'googleusercontent.com': 12,
 'github.com': 13,
 'facebook.com': 14,
 'akamaiedge.net': 15,
 'google.co.uk': 16,
 'icloud.com': 17,
 'microsoft.com': 18,
 'apple-dns.net': 19,
 'virustotal.com': 20,
 'amazon.com': 21,
 'rbxcdn.com': 22,
 'cloudapp.net': 23,
 'utun1.viscosity': 24,
 'codeanywhere.com': 25,
 'utun2.viscosity': 26,
 'cloudfront.net': 27,
 'google-analytics.com': 28,
 'adnxs.com': 29,
 'bbc.co.uk': 30,
 'ea.com': 31,
 'adsafeprotected.com': 32,
 'slack-edge.com': 33,
 'skype.com': 34,
 'githubusercontent.com': 35,
 'intercom.io': 36,
 'linkedin.com': 37,
 'googlesyndication.com': 38,
 'hubspot.com': 39,
 'slack-msgs.com': 40,
 'fastly.net': 41,
 'ddns.net': 42,
 'bbci.co.uk': 43,
 'trafficmanager.net': 44,
 'pubmatic.com': 45,
 'double

In [101]:
def get_rank(d):
    """Return the position of *d* in ``top_map``, or 10000 when it is absent."""
    return top_map.get(d, 10000)

In [102]:
# Sanity check of get_rank: two well-known domains plus a made-up name that
# should fall through to the 10000 fallback.
[ get_rank(v) for v in ["google.com", "apple.com", "not.foiund"] ]

[0, 4, 10000]

In [103]:
# Translate each device's list of favourite domains into their global ranks.
df2["rank"] = df2["domains"].apply(lambda x: [ get_rank(v) for  v in x ])

In [105]:
# Display the per-device table, now including the rank column.
df2

Unnamed: 0_level_0,domains,rank
device,Unnamed: 1_level_1,Unnamed: 2_level_1
minesweepers-mac,"[google.com, github.com, in-addr.arpa, apple.c...","[0, 13, 5, 4, 6, 1, 3, 7, 10, 8, 16, 12, 17, 2..."
misskitty-mac,"[google.com, akadns.net, googleapis.com, gstat...","[0, 7, 3, 1, 4, 13, 23, 8, 11, 15, 5, 34, 16, ..."
calcannea-mac,"[google.com, googleapis.com, slack.com, crashl...","[0, 3, 10, 142, 50, 30, 43, 485, 40, 33, 532, ..."
turing-macbook,"[google.com, apple.com, in-addr.arpa, akadns.n...","[0, 4, 5, 7, 390, 121, 3, 1, 15, 65, 30, 12, 1..."
simon-macpro,"[google.com, gstatic.com, akadns.net, apple.co...","[0, 1, 7, 4, 3, 19, 12, 5, 10, 37, 17, 15, 277..."
turing-minimacbook,"[google.com, apple.com, akadns.net, in-addr.ar...","[0, 4, 7, 5, 19, 1, 3, 12, 15, 17, 24, 14, 65,..."
MalcomWare-PC,"[ddns.net, trustnetworks.com, microsoft.com, g...","[42, 2, 18, 0, 3, 321, 1, 84, 562, 8, 1181, 57..."
LAPTOP-1I501C4U,"[microsoft.com, trustnetworks.com, mcafee.com,...","[18, 2, 86, 126, 0, 215, 1, 8, 3, 5, 290, 116,..."
ludicrous-mac,"[in-addr.arpa, google.com, apple.com, utun1.vi...","[5, 0, 4, 24, 3, 1, 66, 17, 10, 7, 84, 12, 33,..."
elysium-mac,"[google.com, gstatic.com, googleapis.com, goog...","[0, 1, 3, 12, 50, 56, 4, 5, 7, 10, 37, 93, 87,..."


In [106]:
def coefficient(x):
    """Return a Spearman-style rank coefficient for the rank list *x*.

    Position ``i`` in *x* is compared with the value ``x[i]``, giving
    ``1 - 6 * sum(d ** 2) / (m * (m ** 2 - 1))`` with ``d = x[i] - i`` and
    ``m = len(x)``.

    The length is taken from *x* itself rather than from a notebook-level
    variable, so the function works for rank lists of any length and no
    longer raises ``IndexError`` on short input or ignores trailing entries.

    Args:
        x: Sequence of numeric ranks; ``x[i]`` is the rank paired with
            position ``i``.

    Returns:
        The coefficient as a float, or NaN when *x* has fewer than two
        entries (the denominator would be zero).

    NOTE(review): the result only stays within [-1, 1] when *x* is a
    permutation of ``0 .. len(x) - 1``.  Larger entries, such as a fallback
    rank for unranked items, push it below -1 -- confirm this is intended.
    """
    size = len(x)
    if size < 2:
        # The formula divides by size * (size**2 - 1), which is 0 here.
        return float("nan")
    squared_diffs = sum((rank - pos) ** 2 for pos, rank in enumerate(x))
    return 1.0 - (6 * squared_diffs) / (size * (size * size - 1))

# Score every device's rank list and round the result to three decimals.
df2["coefficient"] = df2["rank"].apply(coefficient).round(3)

In [108]:
# Show devices ordered from highest to lowest coefficient.
df2.sort_values(by="coefficient", ascending=False)[["domains", "coefficient"]]

Unnamed: 0_level_0,domains,coefficient
device,Unnamed: 1_level_1,Unnamed: 2_level_1
minesweepers-mac,"[google.com, github.com, in-addr.arpa, apple.c...",-1.297
misskitty-mac,"[google.com, akadns.net, googleapis.com, gstat...",-1.399
terrestrial-mac,"[google.com, gstatic.com, apple.com, googleapi...",-2.356
steamverne-mac,"[google.com, doubleclick.net, gstatic.com, goo...",-3.062
boundless-mac,"[google.com, in-addr.arpa, apple.com, clubhous...",-7.679
dramatic-mac,"[google.com, apple.com, rbxcdn.com, in-addr.ar...",-13.029
languid-mac,"[google.com, in-addr.arpa, apple.com, gstatic....",-16.286
elysium-mac,"[google.com, gstatic.com, googleapis.com, goog...",-16.684
nervings-cb,"[google.com, clubhouse.io, googleapis.com, gst...",-22.406
serotinal-mac,"[google.com, googleapis.com, in-addr.arpa, gst...",-31.695
