In [1]:
%load_ext autoreload
%autoreload 2

# Imports
import json
import copy
import tensorflow as tf
import recommender
from recommender import FactorizationRecommender
from matrix import ReportTechniqueMatrix
from matrix_builder import ReportTechniqueMatrixBuilder
import random
import math
import importlib
import pandas as pd
import numpy as np
import sklearn.manifold
import matplotlib.pyplot as plt
from utils import get_mitre_technique_ids_to_names
import copy

# Ask TensorFlow to run `tf.function`-decorated code eagerly (op by op) rather
# than as traced graphs.
tf.config.run_functions_eagerly(True)

# Fail fast if eager execution is not active in this kernel.
assert tf.executing_eagerly()

# Re-execute the local `recommender` package so edits to it are picked up.
# NOTE(review): `importlib.reload` does not rebind names that were imported
# with `from recommender import FactorizationRecommender` above, and
# `%autoreload 2` is already enabled at the top of this cell -- confirm this
# explicit reload is still needed.  As the last expression of the cell, its
# return value (the module repr, including its filesystem path) is displayed.
importlib.reload(recommender)

<module 'recommender' from '/Users/mjturner/code/technique-inference-engine/models/recommender/__init__.py'>

In [2]:
class TechniqueInferenceEngine:
    """A technique inference engine.

    The technique inference engine predicts, given a bag of MITRE
    ATT&CK techniques, the next most likely techniques that would be part
    of that report, given the report dataset provided.
    """

    # Abstraction function:
    #	AF(training_data, test_data, model, enterprise_attack_filepath) =
    #       a technique inference engine to be trained using model on
    #       training_data and evaluated on test_data
    #       according to the MITRE ATT&CK framework specified in enterprise_attack_filepath.
    # Rep invariant:
    # - training_data.shape == test_data.shape
    # - model is not None
    # - len(enterprise_attack_filepath) >= 0
    # Safety from rep exposure:
    # - all attributes are private
    # - training_data and test_data are immutable
    # - model is deep copied and never returned

    def __init__(self, training_data: ReportTechniqueMatrix, test_data: ReportTechniqueMatrix, model: FactorizationRecommender, enterprise_attack_filepath: str):
        """Initializes a TechniqueInferenceEngine object.

        Args:
            training_data: the data on which to train the model.
            test_data: the data on which to evaluate the model's performance.
                Must have the same shape as training_data.
            model: the model to train.  A deep copy is stored, so the caller's
                object is not modified by this engine.
            enterprise_attack_filepath: filepath for the MITRE enterprise ATT&CK json information.
        """
        self._enterprise_attack_filepath = enterprise_attack_filepath

        self._training_data = training_data
        self._test_data = test_data
        self._model = copy.deepcopy(model)

        self._checkrep()

    def _checkrep(self):
        """Asserts the rep invariant."""
        # - training_data.shape == test_data.shape
        assert self._training_data.shape == self._test_data.shape
        # - model is not None
        assert self._model is not None
        # - len(enterprise_attack_filepath) >= 0
        # NOTE(review): a length is never negative, so this only verifies that
        # the filepath supports len(); kept as-is to match the documented
        # invariant.
        assert len(self._enterprise_attack_filepath) >= 0

    def _add_technique_name_to_dataframe(self, data: pd.DataFrame):
        """Adds a technique name column to the dataframe.

        Args:
            data: data indexed by technique id.

        Mutates:
            data to add a column titled "technique_name".  Technique ids with
            no known name get None.
        """
        all_mitre_technique_ids_to_names = get_mitre_technique_ids_to_names(self._enterprise_attack_filepath)
        # Only the index labels are needed, so look them up directly instead of
        # materializing every row with DataFrame.apply(axis=1); this also
        # behaves sensibly for an empty frame.
        data["technique_name"] = [
            all_mitre_technique_ids_to_names.get(technique_id)
            for technique_id in data.index
        ]

    def fit(self, learning_rate: float=10., num_iterations: int=1000, regularization_coefficient: float=0.1, gravity_coefficient: float=0.0) -> float:
        """Fit the model to the data.

        Args:
            learning_rate: learning rate for the optimizer.
            num_iterations: number of iterations for the optimizer.
            regularization_coefficient: coefficient for the regularization term, which
                is the sum of the average squared magnitude of each of both the
                technique and report embeddings.
            gravity_coefficient: coefficient for the gravity term, which is the average
                of the squared entries of the prediction matrix, or alternatively,
                the squared Frobenius norm of the prediction matrix P divided by the number
                of entries in P.  Note that this is proportional to penalizing the sum
                of the squares of the singular values of P.

        Returns:
            The MSE of the prediction matrix, as determined by the test set.
        """
        # train
        self._model.fit(
            self._training_data.to_sparse_tensor(),
            num_iterations=num_iterations,
            learning_rate=learning_rate,
            regularization_coefficient=regularization_coefficient,
            gravity_coefficient=gravity_coefficient,
        )

        # evaluate on the held-out entries
        mean_squared_error = self._model.evaluate(self._test_data.to_sparse_tensor())

        self._checkrep()
        return mean_squared_error

    def predict(self) -> pd.DataFrame:
        """Obtains model predictions.

        For each report, predicts a value for every technique based on the likelihood
        that technique should be featured in the report.  A higher predicted value for
        technique a than technique b represents an inference that technique a is more
        likely in the report than technique b.

        Returns:
            A matrix with the same shape, index, and columns as training_data and test_data
                containing the predictions values for each report and technique combination.
        """
        predictions = self._model.predict()

        predictions_dataframe = pd.DataFrame(
            predictions,
            index=self._training_data.report_ids,
            columns=self._training_data.technique_ids,
        )

        self._checkrep()
        return predictions_dataframe

    def view_prediction_performance_table_for_report(self, report_id: int) -> pd.DataFrame:
        """Gets the training data, test data, and predictions for a particular report.

        Args:
            report_id: identifier for the report.  Must be in the training_data and
                test_data.

        Returns:
            A dataframe indexed by technique id, with one row per technique,
            containing the following columns:
                - predictions: the predicted value for that technique in this report
                - training_data: the training_data entry for that technique in this report
                - test_data: the test_data entry for that technique in this report
                - technique_name: the technique name for the identifying technique in the index
        """
        report_data = pd.DataFrame(
            {
                "predictions": self.predict().loc[report_id],
                "training_data": self._training_data.to_pandas().loc[report_id],
                "test_data": self._test_data.to_pandas().loc[report_id],
            }
        )

        # add name for convenience
        self._add_technique_name_to_dataframe(report_data)

        self._checkrep()
        return report_data

    def predict_for_new_report(self, techniques: frozenset[str], learning_rate: float=1., num_iterations: int=10, regularization_coefficient: float=0.1, gravity_coefficient: float=0.0) -> pd.DataFrame:
        """Predicts for a new, yet-unseen report.

        Args:
            techniques: an iterable of MITRE technique identifiers involved
                in the new report.  Every identifier must be one of the
                technique ids of the training data.
            learning_rate: forwarded to the model's predict_new_entity.
            num_iterations: forwarded to the model's predict_new_entity.
            regularization_coefficient: forwarded to the model's predict_new_entity.
            gravity_coefficient: forwarded to the model's predict_new_entity.

        Returns:
            A length n dataframe indexed by technique id containing the following columns:
                - predictions, the predicted value for that technique
                - training_data: 1 if technique was present in the input, 0 otherwise
                - test_data: all 0's since no test data for cold start predictions
                - technique_name: the technique name for the identifying technique in the index

        Raises:
            KeyError: if any of the given techniques is not a technique id of
                the training data.
        """
        # Materialize once so any iterable (including a one-shot generator)
        # can safely be traversed more than once below.
        techniques = frozenset(techniques)

        # need to turn into the embeddings in the original matrix
        all_technique_ids = self._training_data.technique_ids
        technique_ids_to_indices = {
            technique_id: index for index, technique_id in enumerate(all_technique_ids)
        }

        # Report every unknown identifier at once rather than failing on the
        # first with a bare KeyError.
        unknown_techniques = sorted(
            str(technique) for technique in techniques
            if technique not in technique_ids_to_indices
        )
        if unknown_techniques:
            raise KeyError(
                f"techniques not present in the training data: {unknown_techniques}"
            )

        technique_indices = sorted(
            {technique_ids_to_indices[technique] for technique in techniques}
        )
        # Explicit int64: an empty list would otherwise become a float64 array,
        # and the platform default integer is not int64 everywhere, while
        # tf.SparseTensor expects int64 indices.
        technique_index_array = np.array(technique_indices, dtype=np.int64)
        technique_indices_2d = technique_index_array.reshape(-1, 1)

        # 1 for each index
        values = np.ones((len(technique_indices),))
        n = self._training_data.n

        technique_tensor = tf.SparseTensor(
            indices=technique_indices_2d, values=values, dense_shape=(n,)
        )

        predictions = self._model.predict_new_entity(
            technique_tensor,
            num_iterations=num_iterations,
            learning_rate=learning_rate,
            regularization_coefficient=regularization_coefficient,
            gravity_coefficient=gravity_coefficient
        )

        # dense 0/1 indicator of which techniques were supplied as input
        training_indices_dense = np.zeros(len(predictions))
        training_indices_dense[technique_index_array] = 1
        result_dataframe = pd.DataFrame(
            {
                "predictions": predictions,
                "training_data": training_indices_dense,
                "test_data": np.zeros(len(predictions)),
            },
            index=all_technique_ids
        )

        self._add_technique_name_to_dataframe(result_dataframe)

        self._checkrep()
        return result_dataframe


In [3]:
# data locations
dataset_filepath = "../data/combined_dataset_full_frequency.json"
enterprise_attack_filepath = "../enterprise-attack.json"

# reproducibility: seed every random number generator this cell may touch so
# that Restart & Run All yields the same split and the same reported MSE.
# NOTE(review): which generator FactorizationRecommender uses to initialize
# its embeddings is not visible here, so both NumPy and TensorFlow are seeded.
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)
tf.random.set_seed(random_seed)

# hyperparameters
test_ratio = 0.1
learning_rate = 20
num_iterations = 1000
regularization_coefficient = 0.1
gravity_coefficient = 0.0
embedding_dimension = 10

# make data
data_builder = ReportTechniqueMatrixBuilder(
    combined_dataset_filepath=dataset_filepath,
    enterprise_attack_filepath=enterprise_attack_filepath,
)
data = data_builder.build()

# split into training and test data
# random.sample requires a sequence (sets are rejected on Python >= 3.11), so
# materialize the indices once and reuse that list for both halves.
all_indices = list(data.indices)
num_train = math.floor((1 - test_ratio) * len(all_indices))
train_indices = frozenset(random.sample(all_indices, k=num_train))
test_indices = frozenset(all_indices).difference(train_indices)
training_data = data.mask(train_indices)
test_data = data.mask(test_indices)

model = FactorizationRecommender(m=training_data.m, n=training_data.n, k=embedding_dimension)

tie = TechniqueInferenceEngine(
    training_data=training_data,
    test_data=test_data,
    model=model,
    enterprise_attack_filepath=enterprise_attack_filepath,
)
# train on the training split; the returned value is the MSE on the test split
mse = tie.fit(
    learning_rate=learning_rate,
    num_iterations=num_iterations,
    regularization_coefficient=regularization_coefficient,
    gravity_coefficient=gravity_coefficient,
)
print("Mean Squared Error", mse)

Mean Squared Error 0.08515729


In [4]:
# Full prediction matrix from the trained engine: one row per report id and
# one column per technique id.
predictions_dataframe = tie.predict()

# Print only the first rows (DataFrame.head() defaults to 5).
print(predictions_dataframe.head())

   T1552.008  T1218.010  T1003.001  T1505.002     T1014  T1053.006     T1590  \
0   0.123779   0.981658   0.974634   0.306022  0.863459   0.110259  0.458235   
1   0.124634   0.619562   0.894080   0.185659  0.885770   0.060220  0.662852   
2   0.046796   0.804721   0.894986   0.207727  0.869207   0.127837  0.588968   
3   0.108201   0.761405   0.975009   0.217126  0.953664   0.099089  0.801844   
4   0.074938   0.804142   0.857130   0.203291  0.845070   0.109497  0.592299   

      T1092  T1565.002  T1098.005  ...  T1542.005  T1564.011  T1606.002  \
0  0.490053   0.352858   0.594495  ...   0.165993   0.046939   0.092341   
1  0.368948   0.183464   0.202766  ...   0.276138   0.065492   0.315473   
2  0.450499   0.240886   0.357333  ...   0.090423   0.099261   0.197262   
3  0.460376   0.226560   0.348786  ...   0.080831   0.145213   0.420383   
4  0.451148   0.246823   0.396207  ...   0.070812   0.106672   0.348524   

      T1528  T1071.002  T1558.002     T1597  T1218.012  T1055.004  T

In [5]:
# Per-technique predictions, training entries, and test entries for the report
# whose id is 3; that id must exist in both the training and test data.
existing_prediction = tie.view_prediction_performance_table_for_report(3)

# Print only the first rows (DataFrame.head() defaults to 5).
print(existing_prediction.head())

           predictions  training_data  test_data   technique_name
T1552.008     0.108201            0.0        0.0    Chat Messages
T1218.010     0.761405            0.0        0.0         Regsvr32
T1003.001     0.975009            0.0        0.0     LSASS Memory
T1505.002     0.217126            0.0        0.0  Transport Agent
T1014         0.953664            0.0        0.0          Rootkit


In [6]:
# COLD START PREDICTOR
# Predict techniques for a report the model has never seen.  Pass any one of
# the technique sets below to predict_for_new_report, or supply your own.

# training_techniques = set(['T1021.004', 'T1572', 'T1083', 'T1570', 'T1571', 'T1105', 'T1003.001',
#     'T1005', 'T1056.001', 'T1140', 'T1555.003', 'T1569.002', 'T1113',
#     'T1018', 'T1112'
# ])

# Techniques used as the input for the prediction in this cell.
oilrig_techniques = {
    "T1047",
    "T1059.005",
    "T1124",
    "T1082",
    "T1497.001",
    "T1053.005",
    "T1027",
    "T1105",
    "T1070.004",
    "T1059.003",
    "T1071.001",
}

# Alternative input set; not used by the call below.
oilrig_adversary_emulation_plan_techniques = {
    "T1082", "T1033", "T1016", "T1087.002",
    "T1069.002", "T1021", "T1087.001", "T1069.001",
    "T1049", "T1057", "T1007", "T1012",
    "T1018", "T1555.004", "T1041", "T1105",
    "T1505.003", "T1003.001", "T1070.004", "T1572",
    "T1021.001", "T1569.002", "T1083", "T1048.003",
}

# solarigate_techniques = set(["T1087", "T1087.002", "T1098.001", "T1098.002", "T1098.003", "T1098.005",
#     "T1583.001", "T1071.001", "T1560.001", "T1059.001", "T1059.003", "T1059.005",
#     "T1584.001", "T1555", "T1555.003", "T1213", "T1213.003", "T1005", "T1074.002",
#     "T1140", "T1587.001", "T1484.002", "T1482", "T1568", "T1114.002", "T1546.003",
#     "T1048.002", "T1190", "T1133", "T1083", "T1606.001", "T1606.002", "T1589.001",
#     "T1562.001", "T1562.002", "T1562.004", "T1070", "T1070.004", "T1070.006", "T1070.008",
#     "T1105", "T1036", "T1036.004", "T1036.005", "T1003.006", "T1069", "T1069.002",
#     "T1057", "T1090.001", "T1021.001", "T1021.002", "T1021.006", "T1018", "T1053.005",
#     "T1558.003", "T1539", "T1559.002", "T1195.002", "T1218.011", "T1082", "T1016.001",
#     "T1199", "T1552.004", "T1550", "T1550.001", "T1550.004", "T1078",
#     "T1078.002", "T1078.003", "T1078.004", "T1047",
# ])

# Run the cold-start prediction with the hyperparameters chosen for this demo.
new_report_predictions = tie.predict_for_new_report(
    oilrig_techniques,
    learning_rate=0.5,
    num_iterations=10,
    regularization_coefficient=0.1,
    gravity_coefficient=1,
)

# Show the ten techniques with the highest predicted values.
print(
    new_report_predictions
    .sort_values(by="predictions", ascending=False)
    .head(10)
)

           predictions  training_data  test_data  \
T1547.006     1.413013            0.0        0.0   
T1221         1.275502            0.0        0.0   
T1587.001     1.204465            0.0        0.0   
T1562.002     1.167807            0.0        0.0   
T1199         1.157537            0.0        0.0   
T1543.003     1.142744            0.0        0.0   
T1021.002     1.135721            0.0        0.0   
T1568         1.128376            0.0        0.0   
T1132.002     1.127620            0.0        0.0   
T1110.003     1.120784            0.0        0.0   

                          technique_name  
T1547.006  Kernel Modules and Extensions  
T1221                 Template Injection  
T1587.001                        Malware  
T1562.002  Disable Windows Event Logging  
T1199               Trusted Relationship  
T1543.003                Windows Service  
T1021.002       SMB/Windows Admin Shares  
T1568                 Dynamic Resolution  
T1132.002          Non-Standard Encoding