In [194]:
%load_ext autoreload
%autoreload 2

# Imports
import json
from mitreattack.stix20 import MitreAttackData
import tensorflow as tf
import recommender
import random
import math
import importlib
import pandas as pd

tf.config.run_functions_eagerly(True)

assert tf.executing_eagerly()

importlib.reload(recommender)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<module 'recommender' from '/Users/mjturner/code/technique-inference-engine/models/recommender/__init__.py'>

In [195]:
def get_mitre_technique_ids(stix_filepath: str) -> frozenset[str]:
    """Gets all MITRE technique ids."""
    mitre_attack_data = MitreAttackData(stix_filepath)
    techniques = mitre_attack_data.get_techniques(remove_revoked_deprecated=True)

    all_technique_ids = set()

    for technique in techniques:
        external_references = technique.get("external_references")
        mitre_references = tuple(filter(lambda external_reference: external_reference.get("source_name") == "mitre-attack", external_references))
        assert len(mitre_references) == 1
        mitre_technique_id = mitre_references[0]["external_id"]
        all_technique_ids.add(mitre_technique_id)

    return frozenset(all_technique_ids)

def get_campaign_techniques(filepath: str) -> tuple[frozenset[str]]:
    """Gets a set of MITRE technique ids present in each campaign."""

    with open(filepath) as f:
        data = json.load(f)

    campaigns = data["bags_of_techniques"]

    ret = []

    for campaign in campaigns:

        techniques = campaign["mitre_techniques"]
        ret.append(frozenset(techniques.keys()))

    return ret

def train_test_split(indices: list, values: list, test_ratio: float=0.1) -> tuple:
    n = len(indices)
    assert len(values) == n

    indices_for_test_set = frozenset(random.sample(range(n), k=math.floor(test_ratio * n)))

    train_indices = []
    test_indices = []
    train_values = []
    test_values = []

    for i in range(n):
        if i in indices_for_test_set:
            test_indices.append(indices[i])
            test_values.append(values[i])
        else:
            train_indices.append(indices[i])
            train_values.append(values[i])

    return train_indices, train_values, test_indices, test_values


In [196]:
class ReportTechniqueMatrix:
    # Abstraction function:
    #	AF(indices, values, report_ids, technique_ids) = a sparse matrix A where
    #       A_{ij} = values[k] where k is the index for (i, j) in indices, if present.
    #       and A_{ij} corresponds to the report report_ids[i] and
    #       technique technique_ids[j]
    # Rep invariant:
    # - len(indices) > 0
    # - len(values) > 0
    # - # TODO every row contains value
    # - # TODO every column contains value
    # Safety from rep exposure:
    # - all fields in rep are private and immutable

    def __init__(self, indices: tuple[tuple[int]], values: tuple[int], report_ids: tuple[int], technique_ids: tuple[str]):
        """Initializes a ReportTechniqueMatrix object.

        Args:
            indices: iterable of indices of the format (row, column) of matrix entries
            values: iterable of matrix entry values such that values[i] contains the
                entry for indices[i] for all i.
            report_ids: unique identifiers for reports such that report_ids[i] is the
                identifier for row i of the sparse matrix.
            technique_ids: unique identifiers for techniques such that technique_ids[i]
                is the unique identifier for column j of the sparse matrix.
        """

        self._indices = tuple(indices)
        self._values = tuple(values)
        self._report_ids = tuple(report_ids)
        self._technique_ids = tuple(technique_ids)

        self._checkrep()

    def _checkrep(self):
        """Asserts the rep invariant."""
        # - len(indices) > 0
        assert len(self._indices) > 0
        # - len(values) > 0
        assert len(self._values) > 0

    @property
    def m(self):
        """The number of rows of the matrix."""
        self._checkrep()
        return len(self._report_ids)

    @property
    def n(self):
        """The number of columns of the matrix."""
        self._checkrep()
        return len(self._technique_ids)

    @property
    def shape(self) -> tuple[int]:
        """Gets the shape of the matrix."""
        return (self.m, self.n)

    @property
    def indices(self) -> tuple[tuple[int]]:
        """Gets the nonempty indices of the matrix."""
        # ok since immutable
        self._checkrep()
        return self._indices

    @property
    def technique_ids(self) -> tuple[str]:
        """Gets the technique ids that make up the column index of the matrix."""
        return self._technique_ids

    def to_sparse_tensor(self) -> tf.SparseTensor:
        """Converts the matrix to a sparse tensor."""
        self._checkrep()
        return tf.SparseTensor(
            indices=self._indices,
            values=self._values,
            dense_shape=(self.m, self.n)
        )

    def mask(self, indices: frozenset[tuple[int]]): # -> ReportTechniqueMatrix:
        """Generates a new ReportTechniqueMatrix object with only a subset of the indices.

        Args:
            indices: indices to include in the new object.

        Returns:
            A new ReportTechniqueMatrix object.
        """
        new_indices = []
        new_values = []

        for i in range(len(self._indices)):

            old_index = self._indices[i]

            if old_index in indices:

                old_value = self._values[i]
                new_indices.append(old_index)
                new_values.append(old_value)

        assert len(new_indices) == len(indices)
        assert len(new_values) == len(indices)

        self._checkrep()

        return ReportTechniqueMatrix(
            indices=new_indices,
            values=new_values,
            report_ids=self._report_ids,
            technique_ids=self._technique_ids
        )


In [197]:
def main():

    test_ratio = 0.1
    embedding_dimension = 10

    # want matrix of campaigns on horizontal, techniques on vertical
    all_mitre_technique_ids = frozenset(get_mitre_technique_ids("../enterprise-attack.json"))

    reports = get_campaign_techniques("../data/combined_dataset_full_frequency.json")

    # only techniques in reports:
    all_report_technique_ids = set()
    for report in reports:
        all_report_technique_ids.update(report)
    # some reports contain invalid techniques from ATT&CK v1
    technique_ids = tuple(all_mitre_technique_ids.intersection(all_report_technique_ids))

    # all techniques:
    # technique_ids = tuple(all_mitre_technique_ids)

    techniques_to_index = {technique_ids[i]: i for i in range(len(technique_ids))}

    indices = []
    values = []
    report_ids = tuple(range(len(reports)))

    # for each campaign, make a vector, filling in each present technique with a 1
    total_techniques = 0
    for i in range(len(reports)):

        report = reports[i]
        total_techniques += len(report)

        for mitre_technique_id in report:

            if mitre_technique_id in techniques_to_index:
                # campaign id, technique id
                index = (i, techniques_to_index[mitre_technique_id])

                indices.append(index)
                values.append(1)

    data = ReportTechniqueMatrix(
        indices=indices,
        values=values,
        report_ids=report_ids,
        technique_ids=technique_ids
    )

    train_indices = frozenset(random.sample(data.indices, k=math.floor((1-test_ratio) * len(data.indices))))
    test_indices = frozenset(data.indices).difference(train_indices)

    training_data = data.mask(train_indices)
    test_data = data.mask(test_indices)

    # train
    model = recommender.FactorizationRecommender(m=data.m, n=data.n, k=embedding_dimension)
    model.fit(training_data.to_sparse_tensor(), num_iterations=1000, learning_rate=10.)

    evaluation = model.evaluate(test_data.to_sparse_tensor())
    print("MSE", evaluation)

    predictions = model.predict()

    predictions_dataframe = pd.DataFrame(predictions, columns=data.technique_ids)

    print(predictions_dataframe)

In [198]:
main()

MSE 0.3504026
        T1003     T1105  T1568.002     T1586  T1590.004  T1583.008  T1021.004  \
0    1.000626  0.999537  -0.082496 -0.831297   0.308251  -0.923980   0.431186   
1    0.419154  0.992721  -0.188076 -1.553465  -0.971239  -0.408307   0.683225   
2    0.376486  0.999800  -0.398596 -0.192381   0.528564  -0.213793  -0.677462   
3   -0.198188  0.997095   0.398596 -1.360123  -1.970683  -0.460999   0.026622   
4    0.860726  0.999553  -0.149492 -0.838964  -0.028922  -0.434994   0.478017   
..        ...       ...        ...       ...        ...        ...        ...   
186 -0.126386  0.999941   0.294739  0.991404   1.006646   0.014913  -0.688125   
187  0.761755  0.996974  -0.881964  0.415451   1.815791  -0.963222   0.351476   
188  0.389008  0.998234   1.492830  0.954458   0.336659  -0.451666  -0.308993   
189  0.572022  0.998875   0.810578 -0.721470  -0.702849  -0.792149   0.990076   
190  1.111477  1.000288  -0.120146 -1.233313  -0.861061  -0.614102  -0.238031   

     T1027.00

In [199]:
# TODO

# remove technicques that are not present in any examples
# visualize what data is predicted vs original
# how?
#   - get a bunch of techniques in the dataset
#   - need to process to pandas dataframe with indices
#   - convert to data object which has to sparse tensor, to pandas, and to numpy functions
# regularization