# Correlation between different threat exchange scores - IP vector

Correlation between Apility and Facebook scores: how well the Facebook score can be predicted from the Apility score.

## Setup and load data

In [1]:
import pickle
import time
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
import h2o

In [2]:
# Load the pickled frame from vector.IPS. The context manager closes the file
# handle as soon as loading finishes instead of leaving it open.
# NOTE: pickle can execute arbitrary code while loading; only open vector.IPS
# if it comes from a trusted source.
with open("vector.IPS", "rb") as vector_file:
    df = pickle.load(vector_file)

In [3]:
# Preview the loaded frame.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,first,last,gap,count,apility.ALIENVAULT-REPUTATION,apility.BBCAN177-MS1,apility.BBCAN177-MS3,apility.BITNODES-IO-1D,apility.BITNODES-IO-30D,apility.BITNODES-IO-7D,...,vt.f6,vt.f7,vt.f8,vt.f9,vt.fa,vt.fb,vt.fc,vt.fd,vt.fe,vt.ff
device,ip,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
languid-mac,69.172.216.58,1.529334e+09,1.530122e+09,2095200.0,61,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
languid-mac,69.172.216.56,1.528222e+09,1.530126e+09,1112400.0,156,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
languid-mac,69.172.216.55,1.529334e+09,1.530126e+09,2095200.0,325,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
languid-mac,104.16.15.243,1.530029e+09,1.530032e+09,2790000.0,12,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
languid-mac,173.241.242.143,1.528218e+09,1.530198e+09,1126800.0,318,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
languid-mac,35.153.236.75,1.530238e+09,1.530238e+09,2998800.0,19,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
languid-mac,108.59.4.172,1.530029e+09,1.530029e+09,2790000.0,17,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
languid-mac,52.44.203.69,1.530126e+09,1.530126e+09,2887200.0,21,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
languid-mac,52.4.98.78,1.529334e+09,1.529334e+09,2095200.0,13,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
languid-mac,34.200.4.68,1.530029e+09,1.530029e+09,2790000.0,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0


## Aggregation across device

In [4]:
# Split the column names by provider prefix, one list per provider.
apility_cols, facebook_cols, vt_cols = (
    [column for column in df.columns if column.startswith(prefix)]
    for prefix in ("apility.", "facebook.", "vt.")
)

In [5]:
# Probability aggregation function
def prob_agg(a):
    """Combine a collection of scores into a single score.

    Computes ``1 - prod(1 - v)`` over the values in ``a``.

    Parameters
    ----------
    a : iterable of float
        Scores to combine (list, NumPy array, pandas Series, ...).

    Returns
    -------
    float
        The combined score; 0.0 for an empty input, because the empty
        product is 1.
    """
    # Vectorised complement instead of a Python-level loop over the values.
    return 1.0 - np.prod(1.0 - np.asarray(a, dtype=float))

In [6]:
# Build a copy of the frame with one combined score column per provider,
# each obtained by applying prob_agg across that provider's columns row-wise.
dfs = df.assign(
    apility=df[apility_cols].apply(prob_agg, axis=1),
    facebook=df[facebook_cols].apply(prob_agg, axis=1),
    vt=df[vt_cols].apply(prob_agg, axis=1),
)

In [7]:
# Group the three provider scores by "ip" and combine each group's values
# with prob_agg, then add a "score" column holding their plain sum.
grouped = dfs.loc[:, ["apility", "facebook", "vt"]].groupby("ip")
agg = grouped.agg(dict.fromkeys(["apility", "facebook", "vt"], prob_agg))
agg["score"] = agg["apility"] + agg["facebook"] + agg["vt"]
agg

Unnamed: 0_level_0,apility,facebook,vt,score
ip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10.0.0.1,0.000000,0.00000,0.000000,0.000000
10.0.1.1,0.000000,0.00000,0.000000,0.000000
10.0.1.11,0.000000,0.00000,0.000000,0.000000
10.0.1.15,0.000000,0.00000,0.000000,0.000000
10.0.1.9,0.000000,0.00000,0.000000,0.000000
10.0.255.255,0.000000,0.00000,0.000000,0.000000
10.100.4.118,0.000000,0.00000,0.000000,0.000000
10.120.136.116,0.000000,0.00000,0.000000,0.000000
10.150.0.2,0.000000,0.00000,0.000000,0.000000
10.255.255.255,0.000000,0.00000,0.000000,0.000000


## Cubic regression

In [8]:
# Least-squares degree-3 polynomial fit of the facebook score (y) against the
# apility score (x); the result is the coefficient array, highest power first.
model = np.polyfit(agg["apility"], agg["facebook"], 3)
model

array([-0.18285369,  0.15756183,  0.00759377,  0.00040144])

In [9]:
# Wrap the fitted coefficients in a callable polynomial. np.poly1d captures
# the coefficient values now, so `predict` keeps working after the name
# `model` is rebound to the GBM estimator later in the notebook (a lambda
# that looks `model` up at call time would break at that point). It also
# replaces the hand-written cubic, so the degree is no longer hard-coded.
# Values may differ from the explicit power form in the last floating-point
# digits because the polynomial is evaluated with Horner's scheme.
predict = np.poly1d(model)
agg["facebook prediction"] = predict(agg["apility"])

In [10]:
# Absolute difference between the actual and the fitted facebook score.
agg["error"] = (agg["facebook"] - agg["facebook prediction"]).abs()
agg

Unnamed: 0_level_0,apility,facebook,vt,score,facebook prediction,error
ip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10.0.0.1,0.000000,0.00000,0.000000,0.000000,0.000401,0.000401
10.0.1.1,0.000000,0.00000,0.000000,0.000000,0.000401,0.000401
10.0.1.11,0.000000,0.00000,0.000000,0.000000,0.000401,0.000401
10.0.1.15,0.000000,0.00000,0.000000,0.000000,0.000401,0.000401
10.0.1.9,0.000000,0.00000,0.000000,0.000000,0.000401,0.000401
10.0.255.255,0.000000,0.00000,0.000000,0.000000,0.000401,0.000401
10.100.4.118,0.000000,0.00000,0.000000,0.000000,0.000401,0.000401
10.120.136.116,0.000000,0.00000,0.000000,0.000000,0.000401,0.000401
10.150.0.2,0.000000,0.00000,0.000000,0.000000,0.000401,0.000401
10.255.255.255,0.000000,0.00000,0.000000,0.000000,0.000401,0.000401


In [11]:
# Root-mean-square of the per-IP absolute errors.
math.sqrt((agg["error"] ** 2).mean())

0.006700386703148847

## GBM

### Train model

In [12]:
# Attach to the H2O cluster used for the GBM below.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,17 hours 3 mins
H2O cluster timezone:,Europe/London
H2O data parsing timezone:,UTC
H2O cluster version:,3.18.0.8
H2O cluster version age:,2 months and 15 days
H2O cluster name:,H2O_from_python_mark_1z6qrw
H2O cluster total nodes:,1
H2O cluster free memory:,1.604 Gb
H2O cluster total cores:,3
H2O cluster allowed cores:,3


In [13]:
# reset_index() turns the index levels into ordinary columns before the
# frame is uploaded to H2O.
hf = h2o.H2OFrame(dfs.reset_index())

  data = _handle_python_lists(python_obj.as_matrix().tolist(), -1)[1]


Parse progress: |█████████████████████████████████████████████████████████| 100%


In [14]:
# Response column and the list of predictor columns for the GBM.
y_label, x_label = "facebook", ["apility"]

In [15]:
# Gradient boosting estimator, registered under the model id "gbm".
# NOTE(review): this rebinds `model`, which until now held the np.polyfit
# coefficient array from the cubic-regression section.
model = h2o.estimators.gbm.H2OGradientBoostingEstimator(
    model_id="gbm"
)

In [16]:
# NOTE(review): this binding is overwritten by the split_frame() call in the
# next cell, so it has no effect on the training below.
train = hf

In [17]:
# Random 70% / 15% / remainder split into train, test and validation frames.
# The fixed seed makes the split, and therefore every metric reported below,
# reproducible across runs; without it each run draws a different split.
train, test, valid = hf.split_frame(ratios=[0.7, 0.15], seed=42)

In [18]:
# Fit on the training split and track metrics on the validation split.
model.train(
    x=x_label,
    y=y_label,
    training_frame=train,
    validation_frame=valid,
)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [19]:
# Display the trained model's summary (metrics, scoring history, variable
# importances).
model

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  gbm


ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 1.1926089196784694e-05
RMSE: 0.0034534170319821924
MAE: 0.0003161512659614243
RMSLE: 0.003244064306230941
Mean Residual Deviance: 1.1926089196784694e-05

ModelMetricsRegression: gbm
** Reported on validation data. **

MSE: 2.7305243713015006e-05
RMSE: 0.005225441963414674
MAE: 0.0003786834501182767
RMSLE: 0.004701235941233018
Mean Residual Deviance: 2.7305243713015006e-05
Scoring History: 


0,1,2,3,4,5,6,7,8,9
,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance,validation_rmse,validation_mae,validation_deviance
,2018-07-05 11:06:17,0.002 sec,0.0,0.0034560,0.0003189,0.0000119,0.0052264,0.0003812,0.0000273
,2018-07-05 11:06:17,0.207 sec,1.0,0.0034556,0.0003186,0.0000119,0.0052257,0.0003808,0.0000273
,2018-07-05 11:06:17,0.222 sec,2.0,0.0034553,0.0003184,0.0000119,0.0052256,0.0003806,0.0000273
,2018-07-05 11:06:17,0.256 sec,3.0,0.0034549,0.0003181,0.0000119,0.0052256,0.0003804,0.0000273
,2018-07-05 11:06:17,0.277 sec,4.0,0.0034547,0.0003179,0.0000119,0.0052256,0.0003802,0.0000273
---,---,---,---,---,---,---,---,---,---
,2018-07-05 11:06:18,0.957 sec,46.0,0.0034534,0.0003162,0.0000119,0.0052254,0.0003787,0.0000273
,2018-07-05 11:06:18,1.000 sec,47.0,0.0034534,0.0003162,0.0000119,0.0052254,0.0003787,0.0000273
,2018-07-05 11:06:18,1.011 sec,48.0,0.0034534,0.0003162,0.0000119,0.0052254,0.0003787,0.0000273



See the whole table with table.as_data_frame()
Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
apility,0.0035156,1.0,1.0




### Prediction error on test data

In [20]:
# Score the held-out test split with the trained model.
results_hf = model.predict(test)

gbm prediction progress: |████████████████████████████████████████████████| 100%


In [21]:
# Pull the predictions out of H2O into a pandas object.
results = results_hf.as_data_frame(use_pandas=True)

In [22]:
# `test` starts out as an H2OFrame and is rebound to a pandas DataFrame here.
# The guard keeps this cell re-runnable: a second run would otherwise call
# as_data_frame() on a pandas DataFrame, which has no such method.
if not isinstance(test, pd.DataFrame):
    test = test.as_data_frame(use_pandas=True)

In [23]:
# Attach the model's predictions to the test rows.
# NOTE(review): this assigns a whole DataFrame to a single column, which
# presumably relies on `results` having exactly one column — confirm.
test["facebook prediction"] = results

In [24]:
# Absolute difference between the actual and the predicted facebook score.
test["error"] = (test["facebook"] - test["facebook prediction"]).abs()

In [25]:
# Worst-case and average absolute error on the test split, as a (max, mean) pair.
test["error"].max(), test["error"].mean()

(0.1606096725084005, 0.0003644035049832933)

This table lists test samples where the Facebook score predicted from the Apility score differs from the actual Facebook score by more than 0.02.

In [26]:
# Rows whose absolute prediction error exceeds 0.02, restricted to the
# columns needed to inspect them.
test.loc[
    test["error"] > 0.02,
    ["device", "ip", "count", "apility", "facebook", "vt", "facebook prediction", "error"],
]

Unnamed: 0,device,ip,count,apility,facebook,vt,facebook prediction,error
137,languid-mac,54.231.97.235,5,0.0,0.098786,0.0,0.000158,0.098628
387,languid-mac,192.185.139.14,32,0.0,0.05945,0.0,0.000158,0.059292
1475,boundless-mac,104.28.2.42,35,0.0,0.04899,0.011549,0.000158,0.048832
1671,boundless-mac,54.231.49.104,36,0.0,0.098786,0.0,0.000158,0.098628
1980,dramatic-mac,184.168.47.225,24,0.0,0.050009,0.0,0.000158,0.049851
2140,dramatic-mac,66.231.91.47,3,0.0,0.049414,0.0,0.000158,0.049256
2189,dramatic-mac,104.16.12.231,62,0.0,0.049414,0.0,0.000158,0.049256
2484,dramatic-mac,104.16.10.231,141,0.0,0.050009,0.0,0.000158,0.049851
2511,dramatic-mac,104.28.10.72,10,0.0,0.094517,0.0,0.000158,0.094359
2589,dramatic-mac,107.180.50.186,110,0.0,0.05945,0.0,0.000158,0.059292
