# Correlation between different threat exchange scores

Correlation between VirusTotal and Facebook scores.

## Setup and load data

In [1]:
import pickle
import time
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
import h2o

In [2]:
df = pickle.load(open("vector.DOMAINS", "rb"))

## Aggregation across device

In [3]:
# Construct lists of column names
apility_cols = [v for v in df.columns if v.startswith("apility.")]
facebook_cols = [v for v in df.columns if v.startswith("facebook.")]
vt_cols = [v for v in df.columns if v.startswith("vt.")]

In [4]:
# Probability aggregation function
prob_agg = lambda a: 1.0 - np.prod([1.0 - v for v in a])

In [5]:
dfs = df.copy()
dfs["apility"] = dfs[apility_cols].apply(prob_agg, axis=1)
dfs["facebook"] = dfs[facebook_cols].apply(prob_agg, axis=1)
dfs["vt"] = dfs[vt_cols].apply(prob_agg, axis=1)

In [6]:
grouped = dfs[["apility", "facebook", "vt"]].groupby("device")
agg = grouped.agg(
    {
        "apility": prob_agg, "facebook": prob_agg, "vt": prob_agg
    }
)
agg["score"] = agg["apility"] + agg["facebook"] + agg["vt"]
agg

Unnamed: 0_level_0,apility,facebook,vt,score
device,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DESKTOP-PIMD8C0,0.044395,0.215506,0.179662,0.439563
LAPTOP-1I501C4U,0.044395,0.215506,0.179662,0.439563
MalcomWare-PC,0.008183,0.11053,0.063514,0.182227
abroad-android,0.132108,0.30148,0.075954,0.509542
boundless-mac,0.031767,0.630564,0.403379,1.065711
calcannea-cb,0.015724,0.550683,0.267896,0.834303
calcannea-mac,0.01976,0.089339,0.228773,0.337871
castle3b7c9f,0.0,0.0,0.0,0.0
daniel-chromebook,0.028562,0.022497,0.052564,0.103623
dgmac,0.0041,0.083511,0.191341,0.278952


## Cubic regression

In [7]:
model = np.polyfit(x=agg["facebook"], y=agg["vt"], deg=3)
model

array([ 2.01185257, -2.80127537,  1.81011418, -0.0202086 ])

In [8]:
predict = lambda x: model[0] * x ** 3 + model[1] * x ** 2 + model[2] * x + model[3]
agg["vt prediction"] = predict(agg["facebook"])

In [9]:
agg["error"] = abs(agg["vt"] - agg["vt prediction"])
agg

Unnamed: 0_level_0,apility,facebook,vt,score,vt prediction,error
device,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DESKTOP-PIMD8C0,0.044395,0.215506,0.179662,0.439563,0.259919,0.080256
LAPTOP-1I501C4U,0.044395,0.215506,0.179662,0.439563,0.259919,0.080256
MalcomWare-PC,0.008183,0.11053,0.063514,0.182227,0.148357,0.084843
abroad-android,0.132108,0.30148,0.075954,0.509542,0.326024,0.25007
boundless-mac,0.031767,0.630564,0.403379,1.065711,0.511777,0.108397
calcannea-cb,0.015724,0.550683,0.267896,0.834303,0.463069,0.195173
calcannea-mac,0.01976,0.089339,0.228773,0.337871,0.120581,0.108192
castle3b7c9f,0.0,0.0,0.0,0.0,-0.020209,0.020209
daniel-chromebook,0.028562,0.022497,0.052564,0.103623,0.019119,0.033445
dgmac,0.0041,0.083511,0.191341,0.278952,0.112592,0.078749


In [10]:
math.sqrt(agg["error"].apply(lambda x: x**2).mean())

0.17291327995096267

## GBM

### Train model

In [11]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,17 hours 6 mins
H2O cluster timezone:,Europe/London
H2O data parsing timezone:,UTC
H2O cluster version:,3.18.0.8
H2O cluster version age:,2 months and 15 days
H2O cluster name:,H2O_from_python_mark_1z6qrw
H2O cluster total nodes:,1
H2O cluster free memory:,1.600 Gb
H2O cluster total cores:,3
H2O cluster allowed cores:,3


In [12]:
hf = h2o.H2OFrame(dfs.reset_index())

  data = _handle_python_lists(python_obj.as_matrix().tolist(), -1)[1]


Parse progress: |█████████████████████████████████████████████████████████| 100%


In [13]:
y_label = "vt"
x_label = ["facebook"]

In [14]:
model = h2o.estimators.gbm.H2OGradientBoostingEstimator(
    model_id="gbm"
)

In [15]:
train = hf

In [16]:
train, test, valid = hf.split_frame([0.7, 0.15])

In [17]:
model.train(x=x_label, y=y_label, training_frame=train, validation_frame=valid)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [18]:
model

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  gbm


ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 0.00024070678871071694
RMSE: 0.015514728122358991
MAE: 0.0017342332605720454
RMSLE: 0.01277129815779636
Mean Residual Deviance: 0.00024070678871071694

ModelMetricsRegression: gbm
** Reported on validation data. **

MSE: 0.0001141399829394304
RMSE: 0.010683631542665181
MAE: 0.0013937330581153494
RMSLE: 0.008849762598241944
Mean Residual Deviance: 0.0001141399829394304
Scoring History: 


0,1,2,3,4,5,6,7,8,9
,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance,validation_rmse,validation_mae,validation_deviance
,2018-07-05 11:08:34,0.002 sec,0.0,0.0159154,0.0018005,0.0002533,0.0115055,0.0014724,0.0001324
,2018-07-05 11:08:34,0.137 sec,1.0,0.0158400,0.0017920,0.0002509,0.0113772,0.0014627,0.0001294
,2018-07-05 11:08:34,0.141 sec,2.0,0.0157786,0.0017843,0.0002490,0.0112698,0.0014540,0.0001270
,2018-07-05 11:08:34,0.146 sec,3.0,0.0157288,0.0017774,0.0002474,0.0111798,0.0014462,0.0001250
,2018-07-05 11:08:34,0.151 sec,4.0,0.0156883,0.0017722,0.0002461,0.0111044,0.0014405,0.0001233
---,---,---,---,---,---,---,---,---,---
,2018-07-05 11:08:34,0.388 sec,46.0,0.0155147,0.0017342,0.0002407,0.0106836,0.0013937,0.0001141
,2018-07-05 11:08:34,0.390 sec,47.0,0.0155147,0.0017342,0.0002407,0.0106836,0.0013937,0.0001141
,2018-07-05 11:08:34,0.393 sec,48.0,0.0155147,0.0017342,0.0002407,0.0106836,0.0013937,0.0001141



See the whole table with table.as_data_frame()
Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
facebook,0.7494571,1.0,1.0




### Prediction error on test data

In [19]:
results_hf = model.predict(test)

gbm prediction progress: |████████████████████████████████████████████████| 100%


In [20]:
results = results_hf.as_data_frame(use_pandas=True)

In [21]:
test = test.as_data_frame(use_pandas=True)

In [22]:
test["vt prediction"] = results

In [23]:
test["error"] = abs(test["vt"] - test["vt prediction"])

In [24]:
test["error"].max(), test["error"].mean()

(0.45061954037180935, 0.0019976855738612844)

This table lists samples where the VirusTotal score prediction from Facebook score is above 0.05 error.

In [25]:
test[test["error"] > 0.05][["device", "domain", "count", "apility", "facebook", "vt", "vt prediction", "error"]]

Unnamed: 0,device,domain,count,apility,facebook,vt,vt prediction,error
40,mark-mac,t.co,1,0.0,0.0,0.105826,0.000723,0.105103
141,minesweepers-mac,t.co,16,0.0,0.0,0.105826,0.000723,0.105103
223,minesweepers-mac,bitbucket.org,8,0.0,0.0,0.078819,0.000723,0.078096
385,misskitty-mac,amzn.to,2,0.0,0.00123,0.092698,0.001043,0.091655
505,misskitty-mac,goo.gl,14,0.0,0.03616,0.518174,0.067554,0.45062
576,nervings-cb,bitbucket.org,1,0.0,0.0,0.078819,0.000723,0.078096
785,serotinal-mac,github.com,2,0.0,0.0,0.061239,0.000723,0.060516
939,MalcomWare-PC,ddns.net,2626,0.0,0.03616,0.002908,0.067554,0.064647
1055,dgmac,github.com,62,0.0,0.0,0.061239,0.000723,0.060516
1169,boundless-mac,bitbucket.org,1,0.0,0.0,0.078819,0.000723,0.078096
