In [220]:
%load_ext autoreload
%autoreload 2

# Imports
import json
from mitreattack.stix20 import MitreAttackData
import tensorflow as tf
import recommender
from matrix import ReportTechniqueMatrix
import random
import math
import importlib
import pandas as pd
import numpy as np

tf.config.run_functions_eagerly(True)

assert tf.executing_eagerly()

importlib.reload(recommender)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<module 'recommender' from '/Users/mjturner/code/technique-inference-engine/models/recommender/__init__.py'>

In [221]:
def get_mitre_technique_ids_to_names(stix_filepath: str) -> dict[str, str]:
    """Gets all MITRE technique ids mapped to their description."""
    mitre_attack_data = MitreAttackData(stix_filepath)
    techniques = mitre_attack_data.get_techniques(remove_revoked_deprecated=True)

    all_technique_ids = {}

    for technique in techniques:
        external_references = technique.get("external_references")
        mitre_references = tuple(filter(lambda external_reference: external_reference.get("source_name") == "mitre-attack", external_references))
        assert len(mitre_references) == 1
        mitre_technique_id = mitre_references[0]["external_id"]
        all_technique_ids[mitre_technique_id] = technique.get("name")

    return all_technique_ids

def get_campaign_techniques(filepath: str) -> tuple[frozenset[str]]:
    """Gets a set of MITRE technique ids present in each campaign."""

    with open(filepath) as f:
        data = json.load(f)

    campaigns = data["bags_of_techniques"]

    ret = []

    for campaign in campaigns:

        techniques = campaign["mitre_techniques"]
        ret.append(frozenset(techniques.keys()))

    return ret

def train_test_split(indices: list, values: list, test_ratio: float=0.1) -> tuple:
    n = len(indices)
    assert len(values) == n

    indices_for_test_set = frozenset(random.sample(range(n), k=math.floor(test_ratio * n)))

    train_indices = []
    test_indices = []
    train_values = []
    test_values = []

    for i in range(n):
        if i in indices_for_test_set:
            test_indices.append(indices[i])
            test_values.append(values[i])
        else:
            train_indices.append(indices[i])
            train_values.append(values[i])

    return train_indices, train_values, test_indices, test_values


In [222]:
def view_prediction_performance_table_for_report(
        train_data: ReportTechniqueMatrix,
        test_data: ReportTechniqueMatrix,
        predictions: pd.DataFrame,
        report_id: int,
    ) -> pd.DataFrame:
    """Gets a dataframe to visualize the training data, test data, and predictions for a report."""
    # 1. training_data
    training_dataframe = train_data.to_pandas()
    report_train_techniques = training_dataframe.loc[report_id]
    report_train_techniques.name = "training_data"

    # 2. predictions
    predicted_techniques = predictions.loc[report_id]
    predicted_techniques.name = "predictions"

    # now test data
    test_dataframe = test_data.to_pandas()
    report_test_techniques = test_dataframe.loc[report_id]
    report_test_techniques.name = "test_data"

    report_data = pd.concat((predicted_techniques, report_train_techniques, report_test_techniques), axis=1)

    # add name for convenience
    all_mitre_technique_ids_to_names = get_mitre_technique_ids_to_names("../enterprise-attack.json")
    report_data.loc[:, "technique_name"] = report_data.apply(lambda row: all_mitre_technique_ids_to_names.get(row.name), axis=1)

    return report_data


In [223]:
test_ratio = 0.1
embedding_dimension = 10

# want matrix of campaigns on horizontal, techniques on vertical
all_mitre_technique_ids_to_names = get_mitre_technique_ids_to_names("../enterprise-attack.json")

reports = get_campaign_techniques("../data/combined_dataset_full_frequency.json")

# only techniques in reports:
all_report_technique_ids = set()
for report in reports:
    all_report_technique_ids.update(report)
# some reports contain invalid techniques from ATT&CK v1
technique_ids = tuple(set(all_mitre_technique_ids_to_names.keys()).intersection(all_report_technique_ids))

techniques_to_index = {technique_ids[i]: i for i in range(len(technique_ids))}

indices = []
values = []
report_ids = tuple(range(len(reports)))

# for each campaign, make a vector, filling in each present technique with a 1
total_techniques = 0
for i in range(len(reports)):

    report = reports[i]
    total_techniques += len(report)

    for mitre_technique_id in report:

        if mitre_technique_id in techniques_to_index:
            # campaign id, technique id
            index = (i, techniques_to_index[mitre_technique_id])

            indices.append(index)
            values.append(1)

data = ReportTechniqueMatrix(
    indices=indices,
    values=values,
    report_ids=report_ids,
    technique_ids=technique_ids
)

train_indices = frozenset(random.sample(data.indices, k=math.floor((1-test_ratio) * len(data.indices))))
test_indices = frozenset(data.indices).difference(train_indices)

training_data = data.mask(train_indices)
test_data = data.mask(test_indices)

# train
model = recommender.FactorizationRecommender(m=data.m, n=data.n, k=embedding_dimension)
model.fit(training_data.to_sparse_tensor(), num_iterations=1000, learning_rate=10.)

evaluation = model.evaluate(test_data.to_sparse_tensor())

predictions = model.predict()

predictions_dataframe = pd.DataFrame(predictions, columns=data.technique_ids)

In [224]:
# get best and worst test performance
test_ndarray = test_data.to_numpy()
predictions_ndarray = predictions_dataframe.to_numpy()
# where test data, use predictions, else, fill with Nan
test_performance = np.mean(np.square(predictions_ndarray - test_ndarray), axis=1, where=test_ndarray > 0.5)

best_test_perf = np.nanargmin(test_performance, )
worst_test_perf = np.nanargmax(test_performance)

best_performance_results = view_prediction_performance_table_for_report(
    train_data=training_data,
    test_data=test_data,
    predictions=predictions_dataframe,
    report_id=best_test_perf
)

worst_performance_results = view_prediction_performance_table_for_report(
    train_data=training_data,
    test_data=test_data,
    predictions=predictions_dataframe,
    report_id=worst_test_perf
)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = um.true_divide(


In [225]:
print(best_performance_results.sort_values("test_data", ascending=False).head(16))

           predictions  training_data  test_data  \
T1566.001     1.000168            0.0        1.0   
T1111         0.578138            0.0        0.0   
T1569.002     0.952383            0.0        0.0   
T1574.001    -0.118375            0.0        0.0   
T1490         0.677453            0.0        0.0   
T1087.001     1.176702            0.0        0.0   
T1071.001     1.000349            0.0        0.0   
T1543.002     1.866597            0.0        0.0   
T1195.002     1.390177            0.0        0.0   
T1561.002    -0.404996            0.0        0.0   
T1583.004     0.705703            0.0        0.0   
T1037.005     0.536079            0.0        0.0   
T1021         0.226719            0.0        0.0   
T1592.004     0.024238            0.0        0.0   
T1082         0.998659            0.0        0.0   
T1105         1.002009            1.0        0.0   

                                     technique_name  
T1566.001                  Spearphishing Attachment  
T1111  

In [226]:
print(worst_performance_results.sort_values("test_data", ascending=False).head(15))

           predictions  training_data  test_data  \
T1072        -0.702861            0.0        1.0   
T1111         0.915116            0.0        0.0   
T1569.002     1.001750            0.0        0.0   
T1574.001     0.996267            0.0        0.0   
T1490        -0.052513            0.0        0.0   
T1087.001     0.469129            0.0        0.0   
T1071.001     0.998453            0.0        0.0   
T1543.002     0.976140            0.0        0.0   
T1195.002     0.332150            0.0        0.0   
T1561.002     0.675625            0.0        0.0   
T1583.004     0.342131            0.0        0.0   
T1037.005     0.765916            0.0        0.0   
T1021         0.665344            0.0        0.0   
T1592.004    -0.035172            0.0        0.0   
T1082         0.999681            1.0        0.0   

                                     technique_name  
T1072                     Software Deployment Tools  
T1111      Multi-Factor Authentication Interception  
T1569

In [227]:
# TODO
# regularization