# Lab 4, Exercise 3

In [126]:
import numpy as np
import pandas as pd
import sys
import os

## Load data 

The data is separated into three folders: Attack_Data_Master, Training_Data_Master, and Validation_Data_Master
These can be found here:
data/exercise3/Training_Data_Master
data/exercise3/Validation_Data_Master
data/exercise3/Attack_Data_Master

All of the data in Training_Data_Master and Validation_Data_Master is normal, 
and all the data in Attack_Data_Master is malicious

For the purpose of this exercise, you will ignore the predefined training/validation splits, and simply use Training_Data_Master
and Validation_Data_Master as a single pool of normal data

As mentioned, each system call trace is stored as a single file.  Treat each system call trace as a separate datapoint.

In [135]:
# Load all the normal system call traces (i.e., everything in Training_Data_Master and Validation_Data_Master)

# CODE HERE
data_benign = []
data_all = []
data_malicious = []
is_malicious = []
path_training = 'data/exercise3/Training_Data_Master/'
for filename in os.listdir(path_training):
    with open(path_training + filename, 'r') as f:
        data = f.read()
        # add a 0 in front of each single digit number
        for i in range(10):
            data = data.replace(' ' + str(i) + ' ', ' 0' + str(i) + ' ')
            data = data.replace(' ' + str(i) + ' ', ' 0' + str(i) + ' ')

        # if first or last numbers are single digit, add a 0 in front
        if data[1] == ' ':
            data = '0' + data
        if data[-2] == ' ':
            data = data[:-2] + ' 0' + data[-1:]
        
        data_all.append(data)
        data_benign.append(data)
        is_malicious.append(0)
path_validation = 'data/exercise3/Validation_Data_Master/'
for filename in os.listdir(path_validation):
    with open(path_validation + filename, 'r') as f:
        data = f.read()
        # add a 0 in front of each single digit number
        for i in range(10):
            data = data.replace(' ' + str(i) + ' ', ' 0' + str(i) + ' ')

        # if first or last numbers are single digit, add a 0 in front
        if data[1] == ' ':
            data = '0' + data
        if data[-2] == ' ':
            data = data[:-2] + ' 0' + data[-1:]

        data_all.append(data)
        data_benign.append(data)
        is_malicious.append(0)

# Load all the malicious system call traces (i.e., everything in Attack_Data_Master)
# CODE HERE
path_malicious = 'data/exercise3/Attack_Data_Master/'
for folder in os.listdir(path_malicious):
    folder_path = path_malicious + folder + '/'
    for filename in os.listdir(folder_path):
        with open(folder_path + filename, 'r') as f:
            data = f.read()
            # add a 0 in front of each single digit number
            for i in range(10):
                data = data.replace(' ' + str(i) + ' ', ' 0' + str(i) + ' ')

            # if first or last numbers are single digit, add a 0 in front
            if data[1] == ' ':
                data = '0' + data
            if data[-2] == ' ':
                data = data[:-2] + ' 0' + data[-1:]
            
            data_all.append(data)
            data_malicious.append(data)
            is_malicious.append(1)

# Hint: A useful way to load this is as one or two Python lists, where each entry in the list corresponds to the text string
#       of system calls ids; feel free to use a single list for all the data, or separate lists for malicious versus normal
#       data

## Feature extraction

Tokenize and create a dataset where each datapoint corresponds to (normalized) counts of 
system call n-grams. Try various sizes of ngrams.

Reminder: A sequence of system call IDs that looks like this:
'6 6 63 6 42'

contains the following 3-grams:
'6 6 63'
'6 63 6'
'63 6 42'

Note: There are a number of ways you could code this up, but if you loaded the data
as lists of strings, you could consider using some of the feature extraction methods in 
sklearn.feature_extraction.text

In [166]:
# Look at the classdemo notebook for an example of doing this
# CODE HERE
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(analyzer='word', ngram_range=(3,3))  # character n-gram feature extraction

# Extract feature counts
raw_counts = count_vect.fit_transform(data_all)

features = count_vect.get_feature_names()

# print(raw_counts)
# print(raw_counts.toarray())
# print('Feature set: ' + str(features))
# print('Number of features: ' + str(len(features)))



## Create train/test split

In [168]:
# Use 50% of the data for the training set and the rest for the test set
# CODE HERE
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(raw_counts, is_malicious, test_size=0.5, random_state=0)

## Train a classifier

In [169]:
# Please use Logistic Regression for this exercise
# Feel free to experiment with the various hyperparameters available to you in sklearn
# CODE HERE

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=100000).fit(X_train, y_train)

## Inference and results

In [170]:
# Run inference on the test data and predict labels for each data point in the test data
# CODE HERE
y_pred = clf.predict(X_test)

# Calculate and print the following metrics: precision, recall, f1-measure, and accuracy
# CODE HERE
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
print('Precision: ' + str(precision_score(y_test, y_pred)))
print('Recall: ' + str(recall_score(y_test, y_pred)))
print('F1-measure: ' + str(f1_score(y_test, y_pred)))
print('Accuracy: ' + str(accuracy_score(y_test, y_pred)))

Precision: 0.7857142857142857
Recall: 0.8725212464589235
F1-measure: 0.8268456375838926
Accuracy: 0.9566532258064516


In [173]:
for i in range(1, 20):
    count_vect = CountVectorizer(analyzer='word', ngram_range=(i,i))  # character n-gram feature extraction
    raw_counts = count_vect.fit_transform(data_all)

    X_train, X_test, y_train, y_test = train_test_split(raw_counts, is_malicious, test_size=0.5, random_state=0)
    clf = LogisticRegression(max_iter=100000).fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print('N-gram: ' + str(i))
    print('Precision: ' + str(precision_score(y_test, y_pred)))
    print('Recall: ' + str(recall_score(y_test, y_pred)))
    print('F1-measure: ' + str(f1_score(y_test, y_pred)))
    print('Accuracy: ' + str(accuracy_score(y_test, y_pred)))
    print()

N-gram: 1
Precision: 0.7204030226700252
Recall: 0.8101983002832861
F1-measure: 0.7626666666666666
Accuracy: 0.9401881720430108

N-gram: 2
Precision: 0.8010899182561307
Recall: 0.8328611898016998
F1-measure: 0.8166666666666667
Accuracy: 0.9556451612903226

N-gram: 3
Precision: 0.7857142857142857
Recall: 0.8725212464589235
F1-measure: 0.8268456375838926
Accuracy: 0.9566532258064516

N-gram: 4
Precision: 0.7663316582914573
Recall: 0.8640226628895185
F1-measure: 0.8122503328894808
Accuracy: 0.9526209677419355

N-gram: 5
Precision: 0.780281690140845
Recall: 0.7847025495750708
F1-measure: 0.7824858757062146
Accuracy: 0.948252688172043

N-gram: 6
Precision: 0.8421052631578947
Recall: 0.7252124645892352
F1-measure: 0.7792998477929985
Accuracy: 0.9512768817204301

N-gram: 7
Precision: 0.8467153284671532
Recall: 0.6572237960339944
F1-measure: 0.7400318979266348
Accuracy: 0.9452284946236559

N-gram: 8
Precision: 0.8484848484848485
Recall: 0.6345609065155807
F1-measure: 0.7260940032414911
Accuracy

# Part 2: Varying class priors

Create several new test datasets where you have randomly subsampled the number of 
attack datapoints.

In particular, create the following datasets:
- 10 datasets where 25% of the attack datapoints are removed from the original test set
- 10 datasets where 50% of the attack datapoints are removed from the original test set
- 10 datasets where 75% of the attack datapoints are removed from the original test set
- 10 datasets where 90% of the attack datapoints are removed from the original test set
- 10 datasets where 95% of the attack datapoints are removed from the original test set

Report five sets of precision, recall, f1-measure, and accuracy corresponding to the following:
- Average precision, recall, f1-measure, accuracy for datasets where 25% of attack datapoints removed
- Average precision, recall, f1-measure, accuracy for datasets where 50% of attack datapoints removed
- Average precision, recall, f1-measure, accuracy for datasets where 75% of attack datapoints removed
- Average precision, recall, f1-measure, accuracy for datasets where 90% of attack datapoints removed
- Average precision, recall, f1-measure, accuracy for datasets where 95% of attack datapoints removed

Note: You will use the same model trained in part 1 for all of these datasets.  
All you are varying is the class priors during the inference stage.

In [159]:
# Create subsets of the test set by randomly discarding X% of points with label +1
# CODE HERE
import random

datasets_25 = []
datasets_25_y = []
for i in range(10):
    new_malicious_data = random.sample(data_malicious, int(len(data_malicious) * 0.25))
    new_set = []
    new_set = np.concatenate((data_benign, new_malicious_data))
    
    if i == 0:
        datasets_25_y = np.concatenate(([0] * len(data_benign), [1] * len(new_malicious_data)))

    datasets_25.append(new_set)   

datasets_50 = []
datasets_50_y = []
for i in range(10):
    new_malicious_data = random.sample(data_malicious, int(len(data_malicious) * 0.50))
    new_set = []
    new_set = np.concatenate((data_benign, new_malicious_data))
    
    if i == 0:
        datasets_50_y = np.concatenate(([0] * len(data_benign), [1] * len(new_malicious_data)))

    datasets_50.append(new_set)

datasets_75 = []
datasets_75_y = []
for i in range(10):
    new_malicious_data = random.sample(data_malicious, int(len(data_malicious) * 0.75))
    new_set = []
    new_set = np.concatenate((data_benign, new_malicious_data))
    
    if i == 0:
        datasets_75_y = np.concatenate(([0] * len(data_benign), [1] * len(new_malicious_data)))

    datasets_75.append(new_set)

datasets_90 = []
datasets_90_y = []
for i in range(10):
    new_malicious_data = random.sample(data_malicious, int(len(data_malicious) * 0.90))
    new_set = []
    new_set = np.concatenate((data_benign, new_malicious_data))
    
    if i == 0:
        datasets_90_y = np.concatenate(([0] * len(data_benign), [1] * len(new_malicious_data)))

    datasets_90.append(new_set)

datasets_95 = []
datasets_95_y = []
for i in range(10):
    new_malicious_data = random.sample(data_malicious, int(len(data_malicious) * 0.95))
    new_set = []
    new_set = np.concatenate((data_benign, new_malicious_data))
    
    if i == 0:
        datasets_95_y = np.concatenate(([0] * len(data_benign), [1] * len(new_malicious_data)))

    datasets_95.append(new_set)

In [164]:
def get_metrics(x, y):
    raw_counts = count_vect.fit_transform(x)
    X_train, X_test, y_train, y_test = train_test_split(raw_counts, y, test_size=0.5, random_state=0)
    clf = LogisticRegression(max_iter=100000).fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    return [precision_score(y_test, y_pred), recall_score(y_test, y_pred), f1_score(y_test, y_pred), accuracy_score(y_test, y_pred)]

# 25%
precisions = []
recalls = []
f1s = []
accuracies = []
for i in range(10):
    metrics = get_metrics(datasets_25[i], datasets_25_y)
    precisions.append(metrics[0])
    recalls.append(metrics[1])
    f1s.append(metrics[2])
    accuracies.append(metrics[3])

print('25%')
print('Precision: ' + str(np.mean(precisions)))
print('Recall: ' + str(np.mean(recalls)))
print('F1-measure: ' + str(np.mean(f1s)))
print('Accuracy: ' + str(np.mean(accuracies)))

# 50%
precisions = []
recalls = []
f1s = []
accuracies = []
for i in range(10):
    metrics = get_metrics(datasets_50[i], datasets_50_y)
    precisions.append(metrics[0])
    recalls.append(metrics[1])
    f1s.append(metrics[2])
    accuracies.append(metrics[3])

print('50%')
print('Precision: ' + str(np.mean(precisions)))
print('Recall: ' + str(np.mean(recalls)))
print('F1-measure: ' + str(np.mean(f1s)))
print('Accuracy: ' + str(np.mean(accuracies)))

# 75%
precisions = []
recalls = []
f1s = []
accuracies = []
for i in range(10):
    metrics = get_metrics(datasets_75[i], datasets_75_y)
    precisions.append(metrics[0])
    recalls.append(metrics[1])
    f1s.append(metrics[2])
    accuracies.append(metrics[3])

print('75%')
print('Precision: ' + str(np.mean(precisions)))
print('Recall: ' + str(np.mean(recalls)))
print('F1-measure: ' + str(np.mean(f1s)))
print('Accuracy: ' + str(np.mean(accuracies)))

# 90%
precisions = []
recalls = []
f1s = []
accuracies = []
for i in range(10):
    metrics = get_metrics(datasets_90[i], datasets_90_y)
    precisions.append(metrics[0])
    recalls.append(metrics[1])
    f1s.append(metrics[2])
    accuracies.append(metrics[3])

print('90%')
print('Precision: ' + str(np.mean(precisions)))
print('Recall: ' + str(np.mean(recalls)))
print('F1-measure: ' + str(np.mean(f1s)))
print('Accuracy: ' + str(np.mean(accuracies)))

# 95%
precisions = []
recalls = []
f1s = []
accuracies = []
for i in range(10):
    metrics = get_metrics(datasets_95[i], datasets_95_y)
    precisions.append(metrics[0])
    recalls.append(metrics[1])
    f1s.append(metrics[2])
    accuracies.append(metrics[3])

print('95%')
print('Precision: ' + str(np.mean(precisions)))
print('Recall: ' + str(np.mean(recalls)))
print('F1-measure: ' + str(np.mean(f1s)))
print('Accuracy: ' + str(np.mean(accuracies)))

25%
Precision: 0.6501269603107982
Recall: 0.6210526315789474
F1-measure: 0.6344260188813885
Accuracy: 0.9748516320474778
50%
Precision: 0.7358059349461118
Recall: 0.7522471910112359
F1-measure: 0.7434868265590182
Accuracy: 0.9668698458228755
75%
Precision: 0.7583795825859747
Recall: 0.8333333333333334
F1-measure: 0.7939675055844712
Accuracy: 0.9599236641221374
90%
Precision: 0.7941176265900957
Recall: 0.8379939209726445
F1-measure: 0.8153137897773742
Accuracy: 0.9574880871341047
95%
Precision: 0.8035611860788896
Recall: 0.8463343108504399
F1-measure: 0.8242989025372921
Accuracy: 0.9584037876225906


# Questions

1) In Part 1, what size of ngrams gives the best performance? What are the tradeoffs as you change the size?

n=3 gives the highest f1 score.
As n increases, the precision increases, but the recall decreases, due to the loss of generality. The accuracy also decreases.

2) In Part 1, how does performance change if we use simple counts as features (i.e., 1-grams) as opposed to counts of 2-grams? What does this tell you about the role of sequences in prediction for this dataset?

There is a lower score in all aspects when using 1-grams. This tells us that the sequences are important to the predictions for this dataset. We would have trouble predicting the correct class if we only looked at 1 system call at a time.

3) How does performance change as a function of class prior in Part 2?

As the class prior increases, the F1 scores increases. The precision and recall also increases, while the accuracy decreases. As more attack data is removed, the model is more likely to predict the normal class.