# Fasttext and Logistic Regression, 20 April 2023
This notebook applies two of the models that were applied in the previous iteration of TRAM.

In [1]:
import json
import pandas as pd

data_path = '/projects/TRAM2023/tram-private/data/training/refreshed_dataset_march_2023.json'
with open(data_path) as f:
    data = json.loads(f.read())

raw = pd.DataFrame(data['sentences'])
mappings = raw['mappings'].explode().dropna().apply(pd.Series)
df = pd.concat((raw['text'], mappings['technique_name'], mappings['attack_id'].str.extract(r"(?P<attack_id>T\d+)\.(?P<subclass_id>\d+)")), axis=1)
df

Unnamed: 0,text,technique_name,attack_id,subclass_id
0,has overwritten the function pointer in the ex...,Extra Window Memory Injection,T1055,011
1,overwrites Explorers Shell_TrayWnd extra windo...,Extra Window Memory Injection,T1055,011
2,has used scheduled tasks to maintain persistence.,Scheduled Task,T1053,005
3,has the ability to launch scheduled tasks to e...,Scheduled Task,T1053,005
4,has used scheduled tasks to maintain persistence.,Scheduled Task,T1053,005
...,...,...,...,...
24599,"""My God"" was one of the first songs recorded b...",,,
24600,It initially had seven students.,,,
24601,Vellarikundu is a hillside town and taluk head...,,,
24602,This earned the score a parental advisory warn...,,,


In [2]:
classes_of_interest = ['T1041', 'T1106', 'T1082', 'T1033', 'T1112', 'T1070', 'T1090', 'T1021', 'T1218', 'T1095', 'T1548', 'T1053', 'T1071', 'T1574', 'T1562', 'T1204', 'T1012', 'T1140', 'T1055', 'T1105', 'T1552', 'T1486', 'T1083', 'T1078', 'T1047', 'T1190', 'T1543', 'T1113', 'T1003', 'T1059', 'T1057', 'T1027', 'T1219', 'T1036', 'T1005']
positive_data = df[df['attack_id'].isin(classes_of_interest)]
negative_data = df[df['attack_id'].isna()].sample(1000).fillna('none')
data = pd.concat((positive_data, negative_data)).reset_index(drop=True)
data

Unnamed: 0,text,technique_name,attack_id,subclass_id
0,has overwritten the function pointer in the ex...,Extra Window Memory Injection,T1055,011
1,overwrites Explorers Shell_TrayWnd extra windo...,Extra Window Memory Injection,T1055,011
2,has used scheduled tasks to maintain persistence.,Scheduled Task,T1053,005
3,has the ability to launch scheduled tasks to e...,Scheduled Task,T1053,005
4,has used scheduled tasks to maintain persistence.,Scheduled Task,T1053,005
...,...,...,...,...
4522,The men's S8 50 meters freestyle competition o...,none,none,none
4523,Network data encryption Both trojans encrypt t...,none,none,none
4524,has the ability to generate new C2 domains.,Fallback Channels,none,none
4525,may create a file containing the results of th...,System Network Configuration Discovery,none,none


The first model, fasttext, requires the training data to be inputted in an unusual way, namely by reading a text file where each line is a training instance, and the line starts with `__label__TAG`, where `TAG` is the tag associated with that line.

In [3]:
data_text = '__label__' + data['attack_id'] + ' ' + data['text'].str.replace('\n', ' ')
data['fasttext_text'] = data_text
data_text

0       __label__T1055 has overwritten the function po...
1       __label__T1055 overwrites Explorers Shell_Tray...
2       __label__T1053 has used scheduled tasks to mai...
3       __label__T1053 has the ability to launch sched...
4       __label__T1053 has used scheduled tasks to mai...
                              ...                        
4522    __label__none The men's S8 50 meters freestyle...
4523    __label__none Network data encryption Both tro...
4524    __label__none has the ability to generate new ...
4525    __label__none may create a file containing the...
4526    __label__none It is most known for its associa...
Length: 4527, dtype: object

In [4]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(data, test_size=0.2, shuffle=True)

In [5]:
train_class_counts = train_df['attack_id'].value_counts(dropna=False)
test_class_counts = test_df['attack_id'].value_counts(dropna=False)
pd.concat({'train': train_class_counts, 'test': test_class_counts}, axis=1)

Unnamed: 0_level_0,train,test
attack_id,Unnamed: 1_level_1,Unnamed: 2_level_1
none,817,183
T1059,541,165
T1071,303,94
T1070,297,71
T1036,190,54
T1204,173,48
T1218,171,37
T1027,164,36
T1053,128,29
T1543,120,33


In [6]:
from pathlib import Path
import fasttext

train_data_file = Path('train_data.txt')
train_data_file.write_text(train_df['fasttext_text'].str.cat(sep='\n'))

model = fasttext.train_supervised(input=str(train_data_file), epoch=200, lr=1.0, wordNgrams=2)

Read 0M words
Number of words:  9723
Number of labels: 19
Progress: 100.0% words/sec/thread:  290784 lr:  0.000000 avg.loss:  0.086864 ETA:   0h 0m 0s


The way that the predictions are outputted is also unusual, so extra code is required to transform them into an ideal structure.

In [7]:
model.predict("hello world")

(('__label__none',), array([0.57492971]))

In [8]:
preds = test_df['text'].str.replace('\n', ' ').apply(model.predict).apply(pd.Series).rename({0: 'prediction', 1: 'confidence'}, axis=1)
preds['prediction'] = preds['prediction'].apply(lambda x: x[0].removeprefix('__label__'))
preds['confidence'] = preds['confidence'].apply(lambda x: x[0])
preds

Unnamed: 0,prediction,confidence
4066,T1070,0.616549
1751,T1562,0.877122
1580,T1059,1.000010
4168,none,0.999513
1091,T1562,0.999987
...,...,...
2678,T1071,1.000010
2220,T1059,1.000010
2538,T1070,1.000010
4046,none,0.984438


In [9]:
from sklearn.metrics import precision_recall_fscore_support as calculate_score

def calculate_scores_df(actual: list[str], predicted: list[str]):
    scores = calculate_score(actual, predicted)
    scores_df = pd.DataFrame(scores).T
    scores_df.columns = ['P', 'R', 'F1', '#']
    scores_df.index = sorted(set(actual) | set(predicted))
    scores_df.loc['(micro)'] = calculate_score(actual, predicted, average='micro')
    scores_df.loc['(macro)'] = calculate_score(actual, predicted, average='macro')
    return scores_df

In [10]:
fasttext_scores = calculate_scores_df(test_df['attack_id'].tolist(), preds['prediction'].tolist())
fasttext_scores

Unnamed: 0,P,R,F1,#
T1003,0.961538,0.961538,0.961538,26.0
T1021,0.84,0.724138,0.777778,29.0
T1027,0.744186,0.888889,0.810127,36.0
T1036,0.843137,0.796296,0.819048,54.0
T1053,0.888889,0.827586,0.857143,29.0
T1055,0.894737,0.772727,0.829268,22.0
T1059,0.89881,0.915152,0.906907,165.0
T1070,0.78481,0.873239,0.826667,71.0
T1071,0.978022,0.946809,0.962162,94.0
T1078,1.0,0.5,0.666667,4.0


The technical report for the previous iteration of TRAM says this about the parameters used for logistic regression.

> The logistic regression parameter settings include: using the Scikit Learning CountVectorizer as a feature generator, with a minimum document frequency for any given word of 3, removing “stopwords,” and retaining the case of words as they appeared in the input text (that is, not normalizing to a common case, such as lowercase).

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(lowercase=False, stop_words='english', min_df=3)
count_vectorizer

In [12]:
result = count_vectorizer.fit_transform(["this is the example sentence.", "this is the next example sentence", "this is the last example sentence"])
count_vectorizer.get_feature_names_out()

array(['example', 'sentence'], dtype=object)

Note that "example" and "sentence" are the only words considered by the count vectorizer because "this is the" are all stop words, and "next" and "last" have a document frequency of only one each (less than three). We will re-fit the count vectorizer on the actual data in the next cell.

In [13]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(count_vectorizer.fit_transform(train_df['text']), train_df['attack_id'])
preds = lr.predict(count_vectorizer.transform(test_df['text']))
lr_scores = calculate_scores_df(test_df['attack_id'], preds)
lr_scores

Unnamed: 0,P,R,F1,#
T1003,1.0,0.846154,0.916667,26.0
T1021,0.869565,0.689655,0.769231,29.0
T1027,0.794118,0.75,0.771429,36.0
T1036,0.808511,0.703704,0.752475,54.0
T1053,0.96,0.827586,0.888889,29.0
T1055,1.0,0.863636,0.926829,22.0
T1059,0.932515,0.921212,0.926829,165.0
T1070,0.871429,0.859155,0.865248,71.0
T1071,0.956989,0.946809,0.951872,94.0
T1078,1.0,0.5,0.666667,4.0


In [14]:
both = pd.concat({'fasttext': fasttext_scores, 'logistic_regression': lr_scores}, axis=1)
both

Unnamed: 0_level_0,fasttext,fasttext,fasttext,fasttext,logistic_regression,logistic_regression,logistic_regression,logistic_regression
Unnamed: 0_level_1,P,R,F1,#,P,R,F1,#
T1003,0.961538,0.961538,0.961538,26.0,1.0,0.846154,0.916667,26.0
T1021,0.84,0.724138,0.777778,29.0,0.869565,0.689655,0.769231,29.0
T1027,0.744186,0.888889,0.810127,36.0,0.794118,0.75,0.771429,36.0
T1036,0.843137,0.796296,0.819048,54.0,0.808511,0.703704,0.752475,54.0
T1053,0.888889,0.827586,0.857143,29.0,0.96,0.827586,0.888889,29.0
T1055,0.894737,0.772727,0.829268,22.0,1.0,0.863636,0.926829,22.0
T1059,0.89881,0.915152,0.906907,165.0,0.932515,0.921212,0.926829,165.0
T1070,0.78481,0.873239,0.826667,71.0,0.871429,0.859155,0.865248,71.0
T1071,0.978022,0.946809,0.962162,94.0,0.956989,0.946809,0.951872,94.0
T1078,1.0,0.5,0.666667,4.0,1.0,0.5,0.666667,4.0
