In [1]:
%load_ext autoreload
%autoreload 2

# Imports
import json
import copy
import tensorflow as tf
import recommender
from recommender import FactorizationRecommender
from matrix import ReportTechniqueMatrix
from matrix_builder import ReportTechniqueMatrixBuilder
import random
import math
import importlib
import pandas as pd
import numpy as np
import sklearn.manifold
import matplotlib.pyplot as plt
from utils import get_mitre_technique_ids_to_names
import copy

# Ask TensorFlow to run tf.function-decorated code eagerly instead of as a
# traced graph (presumably to make the model easier to step through here).
tf.config.run_functions_eagerly(True)

# Sanity check: fail fast if TensorFlow is not executing eagerly.
assert tf.executing_eagerly()

# Re-execute the recommender module so that on-disk edits are picked up.
# Because this is the last expression in the cell, the returned module object
# is echoed as the cell output.
# NOTE(review): `%autoreload 2` is already enabled at the top of this cell, so
# this explicit reload is presumably redundant -- confirm before removing.
importlib.reload(recommender)

<module 'recommender' from '/Users/mjturner/code/technique-inference-engine/models/recommender/__init__.py'>

In [2]:
class TechniqueInferenceEngine:
    """A technique inference engine.

    The technique inference engine predicts, given a bag of MITRE
    ATT&CK techniques, the next most likely techniques that would be part
    of that report, given the report dataset provided.
    """

    # Abstraction function:
    #   AF(training_data, test_data, model, enterprise_attack_filepath) =
    #       a technique inference engine to be trained using model on
    #       training_data and evaluated on test_data
    #       according to the MITRE ATT&CK framework specified in enterprise_attack_filepath.
    # Rep invariant:
    # - training_data.shape == test_data.shape
    # - model is not None
    # - len(enterprise_attack_filepath) >= 0
    # Safety from rep exposure:
    # - all attributes are private
    # - training_data and test_data are immutable
    # - model is deep copied and never returned

    def __init__(
        self,
        training_data: ReportTechniqueMatrix,
        test_data: ReportTechniqueMatrix,
        model: FactorizationRecommender,
        enterprise_attack_filepath: str,
    ):
        """Initializes a TechniqueInferenceEngine object.

        Args:
            training_data: the data on which to train the model.
            test_data: the data on which to evaluate the model's performance.
            model: the model to train.  A deep copy is stored, so the caller's
                object is not modified by this engine.
            enterprise_attack_filepath: filepath for the MITRE enterprise ATT&CK json information.
        """
        self._enterprise_attack_filepath = enterprise_attack_filepath

        self._training_data = training_data
        self._test_data = test_data
        self._model = copy.deepcopy(model)

        self._checkrep()

    def _checkrep(self):
        """Asserts the rep invariant."""
        # - training_data.shape == test_data.shape
        assert self._training_data.shape == self._test_data.shape
        # - model is not None
        assert self._model is not None
        # - len(enterprise_attack_filepath) >= 0
        # (always true for any sized value; the call itself only fails for
        # objects without a length, e.g. None)
        assert len(self._enterprise_attack_filepath) >= 0

    def _add_technique_name_to_dataframe(self, data: pd.DataFrame) -> None:
        """Adds a technique name column to the dataframe.

        Technique ids with no entry in the ATT&CK id-to-name mapping get None.

        Args:
            data: data indexed by technique id.

        Mutates:
            data to add a column titled "technique_name"
        """
        all_mitre_technique_ids_to_names = get_mitre_technique_ids_to_names(
            self._enterprise_attack_filepath
        )
        # Only the index is needed for the lookup, so iterate it directly
        # rather than applying a function to every row; this also works for
        # a frame with zero rows.
        data["technique_name"] = [
            all_mitre_technique_ids_to_names.get(technique_id)
            for technique_id in data.index
        ]

    def fit(
        self,
        learning_rate: float = 10.0,
        num_iterations: int = 1000,
        regularization_coefficient: float = 0.1,
        gravity_coefficient: float = 0.0,
    ) -> float:
        """Fit the model to the data.

        Args:
            learning_rate: learning rate for the optimizer.
            num_iterations: number of iterations for the optimizer.
            regularization_coefficient: coefficient for the regularization term, which
                is the sum of the average squared magnitude of each of both the
                technique and report embeddings.
            gravity_coefficient: coefficient for the gravity term, which is the average
                of the squared entries of the prediction matrix, or alternatively,
                the squared Frobenius norm of the prediction matrix P divided by the number
                of entries in P.  Note that this is proportional to penalizing the sum
                of the squares of the singular values of P.

        Returns:
            The MSE of the prediction matrix, as determined by the test set.
        """
        # train
        self._model.fit(
            self._training_data.to_sparse_tensor(),
            num_iterations=num_iterations,
            learning_rate=learning_rate,
            regularization_coefficient=regularization_coefficient,
            gravity_coefficient=gravity_coefficient,
        )

        # evaluate on the held-out entries
        mean_squared_error = self._model.evaluate(self._test_data.to_sparse_tensor())

        self._checkrep()
        return mean_squared_error

    def predict(self) -> pd.DataFrame:
        """Obtains model predictions.

        For each report, predicts a value for every technique based on the likelihood
        that technique should be featured in the report.  A higher predicted value for
        technique a than technique b represents an inference that technique a is more
        likely in the report than technique b.

        Returns:
            A matrix with the same shape, index, and columns as training_data and test_data
                containing the predictions values for each report and technique combination.
        """
        predictions = self._model.predict()

        # label rows by report id and columns by technique id
        predictions_dataframe = pd.DataFrame(
            predictions,
            index=self._training_data.report_ids,
            columns=self._training_data.technique_ids,
        )

        self._checkrep()
        return predictions_dataframe

    def view_prediction_performance_table_for_report(self, report_id: int) -> pd.DataFrame:
        """Gets the training data, test data, and predictions for a particular report.

        Args:
            report_id: identifier for the report.  Must be in the training_data and
                test_data.

        Returns:
            A dataframe with one row per technique, indexed by technique id,
            containing the following columns:
                - predictions: the predicted value for that technique
                - training_data: the report's training data entry for that technique
                - test_data: the report's test data entry for that technique
                - technique_name: the technique name for the identifying technique in the index
        """
        report_data = pd.DataFrame(
            {
                "predictions": self.predict().loc[report_id],
                "training_data": self._training_data.to_pandas().loc[report_id],
                "test_data": self._test_data.to_pandas().loc[report_id],
            }
        )

        # add name for convenience
        self._add_technique_name_to_dataframe(report_data)

        self._checkrep()
        return report_data

    def predict_for_new_report(
        self,
        techniques: frozenset[str],
        learning_rate: float = 1.0,
        num_iterations: int = 10,
        regularization_coefficient: float = 0.1,
        gravity_coefficient: float = 0.0,
    ) -> pd.DataFrame:
        """Predicts for a new, yet-unseen report.

        Args:
            techniques: an iterable of MITRE technique identifiers involved
                in the new report.  Every identifier must be one of the
                training data's technique ids.
            learning_rate: forwarded to the model's predict_new_entity.
            num_iterations: forwarded to the model's predict_new_entity.
            regularization_coefficient: forwarded to the model's
                predict_new_entity.
            gravity_coefficient: forwarded to the model's predict_new_entity.

        Returns:
            A length n dataframe indexed by technique id containing the following columns:
                - predictions, the predicted value for that technique
                - training_data: 1 if technique was present in the input, 0 otherwise
                - test_data: all 0's since no test data for cold start predictions
                - technique_name: the technique name for the identifying technique in the index

        Raises:
            KeyError: if any of techniques is not a technique id of the
                training data.
        """
        # need to turn into the embeddings in the original matrix
        all_technique_ids = self._training_data.technique_ids
        technique_ids_to_indices = {
            technique_id: index for index, technique_id in enumerate(all_technique_ids)
        }

        # materialize once: deduplicates and allows validation before lookup
        requested_techniques = frozenset(techniques)
        unknown_techniques = sorted(
            requested_techniques.difference(technique_ids_to_indices), key=str
        )
        if unknown_techniques:
            raise KeyError(
                f"techniques not present in the training data: {unknown_techniques}"
            )

        # tf.SparseTensor expects int64 indices.  NumPy's default integer type
        # is platform dependent and an empty list would become float64, so the
        # dtype is pinned explicitly.
        technique_indices = np.array(
            sorted(technique_ids_to_indices[technique] for technique in requested_techniques),
            dtype=np.int64,
        )
        technique_indices_2d = technique_indices.reshape(-1, 1)

        # 1 for each index
        values = np.ones((len(technique_indices),))
        n = self._training_data.n

        technique_tensor = tf.SparseTensor(
            indices=technique_indices_2d, values=values, dense_shape=(n,)
        )

        predictions = self._model.predict_new_entity(
            technique_tensor,
            num_iterations=num_iterations,
            learning_rate=learning_rate,
            regularization_coefficient=regularization_coefficient,
            gravity_coefficient=gravity_coefficient,
        )

        # dense 0/1 indicator of which techniques were given as input
        training_indices_dense = np.zeros(len(predictions))
        training_indices_dense[technique_indices] = 1
        result_dataframe = pd.DataFrame(
            {
                "predictions": predictions,
                "training_data": training_indices_dense,
                "test_data": np.zeros(len(predictions)),
            },
            index=all_technique_ids,
        )

        self._add_technique_name_to_dataframe(result_dataframe)

        self._checkrep()
        return result_dataframe


In [3]:
# data locations
dataset_filepath = "../data/combined_dataset_full_frequency.json"
enterprise_attack_filepath = "../enterprise-attack.json"

# hyperparameters
test_ratio = 0.1
regularization_coefficient = 0.1
gravity_coefficient = 0.0
embedding_dimension = 10
# seed for the train/test split so that a fresh kernel reproduces the same split
random_seed = 42

# NOTE(review): assumes FactorizationRecommender draws its initial embeddings
# from TensorFlow's global RNG -- confirm; otherwise seed the model separately.
tf.random.set_seed(random_seed)

# make data
data_builder = ReportTechniqueMatrixBuilder(
    combined_dataset_filepath=dataset_filepath,
    enterprise_attack_filepath=enterprise_attack_filepath,
)
data = data_builder.build()

# split into training and test data
# random.sample needs a sequence (sets are rejected as of Python 3.11), so the
# indices are materialized into a list first
all_indices = list(data.indices)
num_training_indices = math.floor((1 - test_ratio) * len(all_indices))
# a dedicated Random instance keeps the split independent of the global RNG state
train_indices = frozenset(
    random.Random(random_seed).sample(all_indices, k=num_training_indices)
)
test_indices = frozenset(all_indices).difference(train_indices)
training_data = data.mask(train_indices)
test_data = data.mask(test_indices)

model = FactorizationRecommender(m=training_data.m, n=training_data.n, k=embedding_dimension)

tie = TechniqueInferenceEngine(
    training_data=training_data,
    test_data=test_data,
    model=model,
    enterprise_attack_filepath=enterprise_attack_filepath,
)
# pass the hyperparameters declared above; previously they were defined but
# never handed to fit(), so editing them had no effect
mse = tie.fit(
    regularization_coefficient=regularization_coefficient,
    gravity_coefficient=gravity_coefficient,
)
print("Mean Squared Error", mse)

Mean Squared Error 0.09391276


In [4]:
# predicted value for every report (rows) and technique (columns)
predictions_dataframe = tie.predict()

# last expression of the cell: rendered with the notebook's rich display
predictions_dataframe.head()

   T1542.005  T1110.002     T1569  T1059.001     T1047     T1104  T1584.006  \
0  -0.011291   0.753188  0.789474   1.066873  1.018568  0.920684   0.187430   
1   0.053826   0.448769  0.730581   1.002950  0.934759  0.837245   0.386502   
2   0.124792   0.391861  0.780131   0.944612  0.856811  0.935902   0.448374   
3   0.108158   0.549947  0.875634   0.916077  0.962375  0.986291   0.091467   
4   0.182644   0.502834  0.737028   0.828575  0.825199  0.876431   0.329390   

   T1021.006  T1552.008     T1567  ...  T1555.001  T1583.006  T1546.008  \
0   0.501572   0.179937  0.842914  ...   0.188886   0.830106   0.409474   
1  -0.058391   0.155847  0.860217  ...   0.353806   0.590756   0.623205   
2   0.154832   0.056919  0.880881  ...   0.217941   0.636388   0.622410   
3   0.225477   0.041074  0.926031  ...   0.064310   0.761495   0.541357   
4   0.248087   0.081102  0.745328  ...   0.168540   0.685987   0.466555   

   T1591.004     T1072  T1003.007     T1564  T1498.002  T1036.004  T1056.0

In [5]:
# identifier of the report to inspect; must be present in the training and test data
report_id = 3
existing_prediction = tie.view_prediction_performance_table_for_report(report_id)

# last expression of the cell: rendered with the notebook's rich display
existing_prediction.head()

           predictions  training_data  test_data  \
T1542.005     0.108158            0.0        0.0   
T1110.002     0.549947            0.0        0.0   
T1569         0.875634            0.0        0.0   
T1059.001     0.916077            0.0        0.0   
T1047         0.962375            0.0        0.0   

                               technique_name  
T1542.005                           TFTP Boot  
T1110.002                   Password Cracking  
T1569                         System Services  
T1059.001                          PowerShell  
T1047      Windows Management Instrumentation  


In [6]:
# techniques observed in the new, unseen report
techniques = {
    'T1021.004', 'T1572', 'T1083', 'T1570', 'T1571', 'T1105', 'T1003.001',
    'T1005', 'T1056.001', 'T1140', 'T1555.003', 'T1569.002', 'T1113',
    'T1018', 'T1112',
}
new_report_predictions = tie.predict_for_new_report(
    techniques,
    learning_rate=0.5,
    num_iterations=10,
    regularization_coefficient=0.1,
    gravity_coefficient=0.0,
)

# highest predicted values first; last expression of the cell so that it is
# rendered with the notebook's rich display
new_report_predictions.sort_values(by="predictions", ascending=False).head()

           predictions  training_data  test_data           technique_name
T1557         1.273326            0.0        0.0  Adversary-in-the-Middle
T1583.001     1.167332            0.0        0.0                  Domains
T1221         1.154997            0.0        0.0       Template Injection
T1125         1.138935            0.0        0.0            Video Capture
T1560.001     1.137195            0.0        0.0      Archive via Utility
