In [2]:
import pandas as pd

new_data = pd.read_json('../data/tram2-data/single_label.json')
new_data

Unnamed: 0,text,label,doc_title
0,This file extracts credentials from LSASS simi...,T1003.001,NotPetya Technical Analysis A Triple Threat F...
1,It calls OpenProcess on lsass.exe with access ...,T1003.001,NotPetya Technical Analysis A Triple Threat F...
2,It spreads to Microsoft Windows machines using...,T1210,NotPetya Technical Analysis A Triple Threat F...
3,SMB exploitation via EternalBlue,T1210,NotPetya Technical Analysis A Triple Threat F...
4,SMBv1 Exploitation via EternalBlue,T1210,NotPetya Technical Analysis A Triple Threat F...
...,...,...,...
5084,collects local files and information from the ...,T1005,AA21076A TrickBot Malware
5085,uses HTTPS to communicate with its C2 servers,T1071.001,AA21076A TrickBot Malware
5086,samples have used HTTP over ports 447 and 8082...,T1071.001,AA21076A TrickBot Malware
5087,downloads several additional files and saves t...,T1105,AA21076A TrickBot Malware


In [3]:
import json

with open('../data/training/bootstrap-training-data.json') as f:
    old_data_json = json.loads(f.read())

old_data = pd.DataFrame(
    [
        {'text': row['text'], 'label': row['mappings'][0]['attack_id']}
        for row in old_data_json['sentences']
        if len(row['mappings']) > 0
    ]
)

ALL_CLASSES = set(new_data['label'].unique())

old_data['label'] = old_data['label'].apply(lambda s: s if s in ALL_CLASSES else s[:-4] if s[:-4] in ALL_CLASSES else None)

old_data

Unnamed: 0,text,label
0,"From these reports, we know that the group use...",
1,"We believe this access was abused, for example...",
2,What does the resulting watering hole look lik...,
3,This targeting of third party organizations to...,
4,Online news outlets and general websites were ...,
...,...,...
1520,emond,
1521,CPL files,
1522,.cpl,
1523,application shim,


In [14]:
all_data = (
    pd.concat(
        (new_data.drop(columns='doc_title'), old_data)
    )
    .sample(frac=1)
    .reset_index(drop=True)
)

all_data

Unnamed: 0,text,label
0,The Result Retriever module can AES encrypt C...,T1573.001
1,A dropper used by installs itself into the AS...,T1547.001
2,accessible through API functions.,T1106
3,The adversary uses the command and control cha...,T1041
4,has used encryption and base64 to obfuscate it...,T1027
...,...,...
16214,The second scheduled task created attempts to ...,T1053.005
16215,loads and executes the decrypted,T1140
16216,"used a compiled remote desktop malware,",T1021.001
16217,uses Rundll32 to load a malicious DLL.,T1218.011


In [15]:
import transformers
import torch

mode: 'bert or gpt' = 'bert'
cuda = torch.device('cuda')

if mode == 'bert':
    model = transformers.BertForSequenceClassification.from_pretrained(
        "allenai/scibert_scivocab_uncased",
        num_labels=all_data['label'].nunique(),
        output_attentions=False,
        output_hidden_states=False,
    )
    tokenizer = transformers.BertTokenizer.from_pretrained("allenai/scibert_scivocab_uncased", max_length=512)
elif mode == 'gpt':
    model = transformers.GPT2ForSequenceClassification.from_pretrained(
        "gpt2",
        num_labels=all_data['label'].nunique(),
        output_attentions=False,
        output_hidden_states=False,
    )
    tokenizer = transformers.GPT2Tokenizer.from_pretrained("gpt2", max_length=512)
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id
else:
    raise ValueError(f"mode must be one of bert or gpt, but is {mode = !r}")

model.train().to(cuda)



Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31090, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [17]:
from sklearn.preprocessing import OneHotEncoder as OHE

encoder = OHE(sparse_output=False)
encoder.fit(all_data[['label']])

def tokenize(samples: 'list[str]'):
    return tokenizer(samples, return_tensors='pt', padding='max_length', truncation=True, max_length=512).input_ids

def load_data(x, y, batch_size=10):
    x_len, y_len = x.shape[0], y.shape[0]
    assert x_len == y_len
    for i in range(0, x_len, batch_size):
        slc = slice(i, i + batch_size)
        yield x[slc].to(cuda), y[slc].to(cuda)

def apply_attention_mask(x):
    return x.ne(tokenizer.pad_token_id).to(int)


In [18]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(all_data, test_size=.2, stratify=all_data['label'])

x_train = tokenize(train['text'].tolist())
x_train

tensor([[  102,   434,   501,  ...,     0,     0,     0],
        [  102,  3017,   111,  ...,     0,     0,     0],
        [  102,  7042, 12888,  ...,     0,     0,     0],
        ...,
        [  102,  7208,   220,  ...,     0,     0,     0],
        [  102, 26255, 30113,  ...,     0,     0,     0],
        [  102,   300,  9921,  ...,     0,     0,     0]])

In [20]:
y_train = torch.Tensor(encoder.transform(train[['label']]))
y_train

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [None]:
from torch.optim import AdamW
from tqdm import tqdm
from statistics import mean

optim = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

for epoch in range(6):
    epoch_losses = []
    for x, y in tqdm(load_data(x_train, y_train, batch_size=10)):
        out = model(x, attention_mask=apply_attention_mask(x), labels=y)
        epoch_losses.append(out.loss.item())
        out.loss.backward()
        optim.step()
    print(f"epoch {epoch + 1} loss: {mean(epoch_losses)}")


89it [01:17,  1.23it/s]

In [None]:
model.eval()

preds = []
batch_size = 20

for i in range()