In [2]:
%load_ext autoreload
%autoreload 2

# Imports
import json
import tensorflow as tf
import recommender
from matrix import ReportTechniqueMatrix
from matrix_builder import ReportTechniqueMatrixBuilder
import random
import math
import importlib
import pandas as pd
import numpy as np
from utils import get_mitre_technique_ids_to_names

tf.config.run_functions_eagerly(True)

assert tf.executing_eagerly()

importlib.reload(recommender)

<module 'recommender' from '/Users/mjturner/code/technique-inference-engine/models/recommender/__init__.py'>

In [3]:
def train_test_split(indices: list, values: list, test_ratio: float=0.1) -> tuple:
    n = len(indices)
    assert len(values) == n

    indices_for_test_set = frozenset(random.sample(range(n), k=math.floor(test_ratio * n)))

    train_indices = []
    test_indices = []
    train_values = []
    test_values = []

    for i in range(n):
        if i in indices_for_test_set:
            test_indices.append(indices[i])
            test_values.append(values[i])
        else:
            train_indices.append(indices[i])
            train_values.append(values[i])

    return train_indices, train_values, test_indices, test_values


In [4]:
def view_prediction_performance_table_for_report(
        train_data: ReportTechniqueMatrix,
        test_data: ReportTechniqueMatrix,
        predictions: pd.DataFrame,
        report_id: int,
    ) -> pd.DataFrame:
    """Gets a dataframe to visualize the training data, test data, and predictions for a report."""
    # 1. training_data
    training_dataframe = train_data.to_pandas()
    report_train_techniques = training_dataframe.loc[report_id]
    report_train_techniques.name = "training_data"

    # 2. predictions
    predicted_techniques = predictions.loc[report_id]
    predicted_techniques.name = "predictions"

    # now test data
    test_dataframe = test_data.to_pandas()
    report_test_techniques = test_dataframe.loc[report_id]
    report_test_techniques.name = "test_data"

    report_data = pd.concat((predicted_techniques, report_train_techniques, report_test_techniques), axis=1)

    # add name for convenience
    all_mitre_technique_ids_to_names = get_mitre_technique_ids_to_names("../enterprise-attack.json")
    report_data.loc[:, "technique_name"] = report_data.apply(lambda row: all_mitre_technique_ids_to_names.get(row.name), axis=1)

    return report_data


In [24]:
test_ratio = 0.1
embedding_dimension = 10

data_builder = ReportTechniqueMatrixBuilder(
    combined_dataset_filepath="../data/combined_dataset_full_frequency.json",
    enterprise_attack_filepath="../enterprise-attack.json",
)
data = data_builder.build()

train_indices = frozenset(random.sample(data.indices, k=math.floor((1-test_ratio) * len(data.indices))))
test_indices = frozenset(data.indices).difference(train_indices)

training_data = data.mask(train_indices)
test_data = data.mask(test_indices)

# train
learning_rate = 10.
regularization_coefficient = 0.1
gravity_coefficient = 0.0
model = recommender.FactorizationRecommender(m=data.m, n=data.n, k=embedding_dimension)
model.fit(training_data.to_sparse_tensor(), num_iterations=1000, learning_rate=learning_rate, regularization_coefficient=regularization_coefficient, gravity_coefficient=gravity_coefficient)

evaluation = model.evaluate(test_data.to_sparse_tensor())
print("MSE Error", evaluation)

predictions = model.predict()

predictions_dataframe = pd.DataFrame(predictions, columns=data.technique_ids)

MSE Error 0.06764549


In [8]:
# get best and worst test performance
test_ndarray = test_data.to_numpy()
predictions_ndarray = predictions_dataframe.to_numpy()
# where test data, use predictions, else, fill with Nan
test_performance = np.mean(np.square(predictions_ndarray - test_ndarray), axis=1, where=test_ndarray > 0.5)

best_test_perf = np.nanargmin(test_performance, )
worst_test_perf = np.nanargmax(test_performance)

best_performance_results = view_prediction_performance_table_for_report(
    train_data=training_data,
    test_data=test_data,
    predictions=predictions_dataframe,
    report_id=best_test_perf
)

worst_performance_results = view_prediction_performance_table_for_report(
    train_data=training_data,
    test_data=test_data,
    predictions=predictions_dataframe,
    report_id=worst_test_perf
)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = um.true_divide(


In [10]:
print(best_performance_results.sort_values("training_data", ascending=False).head(15).index)


Index(['T1021.004', 'T1572', 'T1083', 'T1570', 'T1571', 'T1105', 'T1003.001',
       'T1005', 'T1056.001', 'T1140', 'T1555.003', 'T1569.002', 'T1113',
       'T1018', 'T1112'],
      dtype='object')


In [12]:
print(worst_performance_results.sort_values("test_data", ascending=False).head(15))

           predictions  training_data  test_data  \
T1106        -0.030872            0.0        1.0   
T1552.001    -0.026832            0.0        0.0   
T1561        -0.000862            0.0        0.0   
T1573.001    -0.027231            0.0        0.0   
T1485        -0.000736            0.0        0.0   
T1190        -0.031759            0.0        0.0   
T1132.001     0.010440            0.0        0.0   
T1078.004    -0.028884            0.0        0.0   
T1056.001    -0.029308            0.0        0.0   
T1001.002     0.004404            0.0        0.0   
T1095        -0.023185            0.0        0.0   
T1110.003    -0.021503            0.0        0.0   
T1078.002    -0.027470            0.0        0.0   
T1218.011    -0.031216            0.0        0.0   
T1553.004     0.005378            0.0        0.0   

                              technique_name  
T1106                             Native API  
T1552.001               Credentials In Files  
T1561                     

In [39]:
def predict_for_new_report(techniques: frozenset[str]) -> np.array:
    """Predicts for a new, yet-unseen report.

    Args:
        techniques: an iterable of MITRE technique identifiers involved
            in the new report.

    Returns:
        A length n array of predicted values for each technique based on
        the projected embedding for this new report.
    """
    # need to turn into the embeddings in the original matrix
    technique_ids = data.technique_ids
    technique_ids_to_indices = {technique_ids[i]: i for i in range(len(technique_ids))}

    technique_indices = list(set(technique_ids_to_indices[technique] for technique in techniques))
    technique_indices.sort()
    technique_indices = np.expand_dims(np.array(technique_indices), axis=1)

    # 1 for each index
    values = np.ones((len(technique_indices),))
    n = data.n

    technique_tensor = tf.SparseTensor(
        indices=technique_indices, values=values, dense_shape=(n,)
    )

    # TODO referencing global vars bad
    ret = model.predict_new_entity(
        technique_tensor,
        num_iterations=20,
        learning_rate=0.1,
        regularization_coefficient=regularization_coefficient,
        gravity_coefficient=gravity_coefficient
    )

    return ret


In [40]:
techniques = set(['T1021.004', 'T1572', 'T1083', 'T1570', 'T1571', 'T1105', 'T1003.001',
    'T1005', 'T1056.001', 'T1140', 'T1555.003', 'T1569.002', 'T1113',
    'T1018', 'T1112'
])
x = predict_for_new_report(techniques)

print(x)

loss tf.Tensor(0.033160407, shape=(), dtype=float32)
tf.Tensor(
[[ 4.8950601e-01]
 [ 6.8958026e-01]
 [ 6.2146771e-01]
 [ 1.0590014e+00]
 [ 1.0454527e+00]
 [ 3.7314528e-01]
 [ 6.5028858e-01]
 [ 1.0647360e+00]
 [ 4.9841127e-01]
 [ 8.9749163e-01]
 [-1.8051956e-04]
 [ 6.7003624e-04]
 [ 5.3411239e-01]
 [ 7.1925819e-01]
 [ 3.7383619e-01]
 [ 4.9433786e-01]
 [ 5.7834327e-01]
 [ 3.3562416e-01]
 [ 7.2748697e-01]
 [ 1.0483046e+00]
 [ 7.9620552e-01]
 [ 6.1776447e-01]
 [ 5.4772866e-01]
 [ 4.8653650e-01]
 [ 8.9041799e-01]
 [ 1.0948020e+00]
 [ 6.8859237e-01]
 [ 1.0783241e+00]
 [ 7.6140428e-01]
 [ 6.3541889e-01]
 [ 1.0661325e+00]
 [ 7.1036959e-01]
 [ 3.3123136e-01]
 [ 6.9111091e-01]
 [ 7.7904648e-01]
 [ 7.1335447e-01]
 [ 3.9734828e-01]
 [ 5.0828284e-01]
 [ 2.3043612e-01]
 [ 2.6067456e-03]
 [ 1.0478625e+00]
 [ 7.0972306e-01]
 [ 7.5936973e-01]
 [ 1.0854068e+00]
 [ 6.5185308e-01]
 [ 1.0363854e+00]
 [ 5.4504496e-01]
 [ 4.3857956e-01]
 [ 6.0241115e-01]
 [ 6.1821246e-01]
 [ 9.8998171e-01]
 [ 5.2852488e-01]
