# Correlation between different threat exchange scores

Correlation between VirusTotal and Facebook scores.

## Setup and load data

In [1]:
import pickle
import time
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
import h2o

In [2]:
df = pickle.load(open("vector.DOMAINS", "rb"))

## Aggregation across device

In [3]:
# Construct lists of column names
apility_cols = [v for v in df.columns if v.startswith("apility.")]
facebook_cols = [v for v in df.columns if v.startswith("facebook.")]
vt_cols = [v for v in df.columns if v.startswith("vt.")]

In [4]:
# Probability aggregation function
prob_agg = lambda a: 1.0 - np.prod([1.0 - v for v in a])

In [5]:
dfs = df.copy()
dfs["apility"] = dfs[apility_cols].apply(prob_agg, axis=1)
dfs["facebook"] = dfs[facebook_cols].apply(prob_agg, axis=1)
dfs["vt"] = dfs[vt_cols].apply(prob_agg, axis=1)

In [6]:
grouped = dfs[["apility", "facebook", "vt"]].groupby("device")
agg = grouped.agg(
    {
        "apility": prob_agg, "facebook": prob_agg, "vt": prob_agg
    }
)
agg["score"] = agg["apility"] + agg["facebook"] + agg["vt"]
agg

Unnamed: 0_level_0,apility,facebook,vt,score
device,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DESKTOP-PIMD8C0,0.044395,0.215506,0.179662,0.439563
LAPTOP-1I501C4U,0.044395,0.215506,0.179662,0.439563
MalcomWare-PC,0.008183,0.11053,0.063514,0.182227
abroad-android,0.132108,0.30148,0.075954,0.509542
boundless-mac,0.031767,0.630564,0.403379,1.065711
calcannea-cb,0.015724,0.550683,0.267896,0.834303
calcannea-mac,0.01976,0.089339,0.228773,0.337871
castle3b7c9f,0.0,0.0,0.0,0.0
daniel-chromebook,0.028562,0.022497,0.052564,0.103623
dgmac,0.0041,0.083511,0.191341,0.278952


## Cubic regression

In [7]:
model = np.polyfit(x=agg["facebook"], y=agg["vt"], deg=3)
model

array([ 2.01185257, -2.80127537,  1.81011418, -0.0202086 ])

In [8]:
predict = lambda x: model[0] * x ** 3 + model[1] * x ** 2 + model[2] * x + model[3]
agg["vt prediction"] = predict(agg["vt"])

In [9]:
agg["error"] = abs(agg["vt"] - agg["vt prediction"])
agg

Unnamed: 0_level_0,apility,facebook,vt,score,vt prediction,error
device,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DESKTOP-PIMD8C0,0.044395,0.215506,0.179662,0.439563,0.226247,0.046585
LAPTOP-1I501C4U,0.044395,0.215506,0.179662,0.439563,0.226247,0.046585
MalcomWare-PC,0.008183,0.11053,0.063514,0.182227,0.083974,0.02046
abroad-android,0.132108,0.30148,0.075954,0.509542,0.101998,0.026044
boundless-mac,0.031767,0.630564,0.403379,1.065711,0.386195,0.017185
calcannea-cb,0.015724,0.550683,0.267896,0.834303,0.302352,0.034456
calcannea-mac,0.01976,0.089339,0.228773,0.337871,0.271374,0.042602
castle3b7c9f,0.0,0.0,0.0,0.0,-0.020209,0.020209
daniel-chromebook,0.028562,0.022497,0.052564,0.103623,0.067491,0.014927
dgmac,0.0041,0.083511,0.191341,0.278952,0.237675,0.046334


In [10]:
math.sqrt(agg["error"].apply(lambda x: x**2).mean())

0.06361987662767067

## GBM

### Train model

In [11]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,16 hours 56 mins
H2O cluster timezone:,Europe/London
H2O data parsing timezone:,UTC
H2O cluster version:,3.18.0.8
H2O cluster version age:,2 months and 15 days
H2O cluster name:,H2O_from_python_mark_1z6qrw
H2O cluster total nodes:,1
H2O cluster free memory:,1.671 Gb
H2O cluster total cores:,3
H2O cluster allowed cores:,3


In [12]:
hf = h2o.H2OFrame(dfs.reset_index())

  data = _handle_python_lists(python_obj.as_matrix().tolist(), -1)[1]


Parse progress: |█████████████████████████████████████████████████████████| 100%


In [13]:
y_label = "vt"
x_label = ["facebook"]

In [14]:
model = h2o.estimators.gbm.H2OGradientBoostingEstimator(
    model_id="gbm"
)

In [15]:
train = hf

In [16]:
train, test, valid = hf.split_frame([0.7, 0.15])

In [17]:
model.train(x=x_label, y=y_label, training_frame=train, validation_frame=valid)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [18]:
model

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  gbm


ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 0.00025806909879748407
RMSE: 0.016064529211822054
MAE: 0.0017217603689836814
RMSLE: 0.013225872683454471
Mean Residual Deviance: 0.00025806909879748407

ModelMetricsRegression: gbm
** Reported on validation data. **

MSE: 0.0001724584348508872
RMSE: 0.013132343083048326
MAE: 0.0018074714008825103
RMSLE: 0.011347990499572295
Mean Residual Deviance: 0.0001724584348508872
Scoring History: 


0,1,2,3,4,5,6,7,8,9
,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance,validation_rmse,validation_mae,validation_deviance
,2018-07-05 10:59:06,0.001 sec,0.0,0.0166136,0.0018024,0.0002760,0.0125829,0.0018664,0.0001583
,2018-07-05 10:59:06,0.074 sec,1.0,0.0165106,0.0017924,0.0002726,0.0125872,0.0018594,0.0001584
,2018-07-05 10:59:06,0.082 sec,2.0,0.0164267,0.0017837,0.0002698,0.0126013,0.0018530,0.0001588
,2018-07-05 10:59:06,0.088 sec,3.0,0.0163585,0.0017759,0.0002676,0.0126223,0.0018473,0.0001593
,2018-07-05 10:59:06,0.094 sec,4.0,0.0163030,0.0017692,0.0002658,0.0126478,0.0018423,0.0001600
---,---,---,---,---,---,---,---,---,---
,2018-07-05 10:59:06,0.294 sec,46.0,0.0160645,0.0017218,0.0002581,0.0131323,0.0018075,0.0001725
,2018-07-05 10:59:06,0.298 sec,47.0,0.0160645,0.0017218,0.0002581,0.0131323,0.0018075,0.0001725
,2018-07-05 10:59:06,0.301 sec,48.0,0.0160645,0.0017218,0.0002581,0.0131323,0.0018075,0.0001725



See the whole table with table.as_data_frame()
Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
facebook,1.0612648,1.0,1.0




### Prediction error on test data

In [19]:
results_hf = model.predict(test)

gbm prediction progress: |████████████████████████████████████████████████| 100%


In [20]:
results = results_hf.as_data_frame(use_pandas=True)

In [21]:
test = test.as_data_frame(use_pandas=True)

In [22]:
test["vt prediction"] = results

In [23]:
test["error"] = abs(test["vt"] - test["vt prediction"])

In [24]:
test["error"].max(), test["error"].mean()

(0.44099026020154, 0.0014877228358988845)

This table lists samples where the VirusTotal score prediction from Facebook score is above 0.05 error.

In [25]:
test[test["error"] > 0.05][["device", "domain", "count", "apility", "facebook", "vt", "vt prediction", "error"]]

Unnamed: 0,device,domain,count,apility,facebook,vt,vt prediction,error
269,misskitty-mac,github.com,4771,0.0,0.0,0.061239,0.000679,0.060561
366,misskitty-mac,bitbucket.org,54,0.0,0.0,0.078819,0.000679,0.078141
378,misskitty-mac,adadvisor.net,8,0.0,0.03616,0.0,0.077182,0.077182
562,nervings-cb,github.com,301,0.0,0.0,0.061239,0.000679,0.060561
586,nervings-cb,t.co,2,0.0,0.0,0.105826,0.000679,0.105147
787,serotinal-mac,github.com,2,0.0,0.0,0.061239,0.000679,0.060561
1042,calcannea-mac,t.co,2,0.0,0.0,0.105825,0.000679,0.105147
1076,dgmac,github.com,62,0.0,0.0,0.061239,0.000679,0.06056
1188,boundless-mac,amzn.to,1,0.0,0.00123,0.092697,0.001071,0.091626
1476,elysium-mac,glancecdn.net,2,0.0,0.03616,0.0,0.077182,0.077182
