
### The Data as organized by the preprocessing:

| Column     | Description              |
|------------|--------------------------|
|`processId`|The unique identifier for the process that generated the event - int64 |
|`threadId`|ID for the thread spawning the log - int64|
|`parentProcessId`|Label for the process spawning this log - int64|
|`userId`|ID of user spawning the log (numerical) - int64|
|`mountNamespace`|Mounting restrictions the process log works within - int64|
|`argsNum`|Number of arguments passed to the event - int64|
|`returnValue`|Value returned from the event log (usually 0) - int64|
|`sus_label`|Binary label marking the event as suspicious (1 is suspicious, 0 is not) - int64|


In [1]:
# Make sure to run this cell to use torchmetrics. If you cannot use pip install to install the torchmetrics, you can use sklearn.
!pip install torchmetrics

Defaulting to user installation because normal site-packages is not writeable


In [2]:
# Import required libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.nn.functional as functional
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
from torchmetrics import Accuracy
# from sklearn.metrics import accuracy_score  # uncomment to use sklearn

In [3]:
# Read the three pre-split, labelled CSV files (order: train, test, validation)
train_df, test_df, val_df = map(
    pd.read_csv,
    ('labelled_train.csv', 'labelled_test.csv', 'labelled_validation.csv'),
)

# Show the first 5 rows of the training set
train_df.head()

Unnamed: 0,processId,threadId,parentProcessId,userId,mountNamespace,argsNum,returnValue,sus_label
0,381,7337,1,100,4026532231,5,0,1
1,381,7337,1,100,4026532231,1,0,1
2,381,7337,1,100,4026532231,0,0,1
3,7347,7347,7341,0,4026531840,2,-2,1
4,7347,7347,7341,0,4026531840,4,0,1


## We can see that the features have very different scales, so standardization is needed:

In [4]:
# The feature columns span very different ranges, so they are standardized
# before being fed to the network.
def _features_and_labels(frame):
    """Split a labelled frame into (feature DataFrame, label array).

    The first column is skipped, the last column is the label, and every
    column in between is a feature.
    """
    columns = frame.columns
    return frame[columns[1:-1]], frame[columns[-1]].to_numpy()


train_features, train_labels = _features_and_labels(train_df)
test_features, test_labels = _features_and_labels(test_df)
val_features, val_labels = _features_and_labels(val_df)

# Fit the scaler on the training split only, then apply the same transform
# to the test and validation splits.
scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)
test_features, val_features = map(scaler.transform, (test_features, val_features))

# Quick look at the scaled validation features
val_features[:5]

array([[-3.32423454, -0.84909243,  2.63843038,  1.79005541,  0.24455003,
        -0.00791259],
       [-3.32578327, -0.84909243,  2.61170424,  1.78399123,  0.24455003,
        -0.00791259],
       [-3.52040741, -0.84954378, -0.06090978, -0.58710151,  0.99031479,
        -0.0549941 ],
       [-3.52040741, -0.84954378, -0.06090978, -0.58710151,  0.99031479,
        -0.01732889],
       [-3.52040741, -0.84954378, -0.06090978, -0.58710151, -0.50121474,
        -0.0549941 ]])

## Build data loaders. Because the datasets are small, they will not be needed later: the whole training sample can be passed as a single **batch**:

In [5]:
# Convert the scaled features and the labels to float32 torch tensors.
def _to_float_tensor(values):
    """Copy array-like data into a float32 torch tensor."""
    return torch.tensor(values, dtype=torch.float32)


# Features keep their 2-D layout; labels are reshaped into a single column.
train_features_tensor = _to_float_tensor(train_features)
train_labels_tensor = _to_float_tensor(train_labels).view(-1, 1)
test_features_tensor = _to_float_tensor(test_features)
test_labels_tensor = _to_float_tensor(test_labels).view(-1, 1)
val_features_tensor = _to_float_tensor(val_features)
val_labels_tensor = _to_float_tensor(val_labels).view(-1, 1)

# TensorDataset / DataLoader wrappers for each split. The data is simple
# enough that the cells below work on the full tensors instead.
dataset_train = TensorDataset(train_features_tensor, train_labels_tensor)
dataset_test = TensorDataset(test_features_tensor, test_labels_tensor)
dataset_val = TensorDataset(val_features_tensor, val_labels_tensor)

_loader_options = {"shuffle": True, "batch_size": 1}
train_dataloader = DataLoader(dataset_train, **_loader_options)
test_dataloader = DataLoader(dataset_test, **_loader_options)
val_dataloader = DataLoader(dataset_val, **_loader_options)

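## Building a simple model: ReLU/ELU activation functions were tried but only slowed down or disturbed the training, so they were removed. Without them, the stacked linear layers are equivalent to a single linear layer followed by the sigmoid.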

In [6]:
# Build our model.
# Seed PyTorch's global RNG first so that the random weight initialization
# below is the same every time the notebook is re-run from a fresh kernel.
SEED = 42
torch.manual_seed(SEED)

# One input unit per feature column.
num_features = train_features_tensor.shape[1]
# Three linear layers with no activation between them; the final Sigmoid
# squashes the single output into (0, 1).
model = nn.Sequential(nn.Linear(num_features,256),
                      nn.Linear(256,128),
                      nn.Linear(128,1),
                      nn.Sigmoid()
                     )

## Onto the training loop: use binary cross entropy as a loss function and Adam as an optimizer with a learning rate of 0.001

In [7]:
# Training setup: binary cross-entropy loss on the model's sigmoid output,
# optimized with Adam at a learning rate of 0.001.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 20
# Full-batch training: each epoch is a single forward/backward pass over the
# whole training tensor, followed by one optimizer step.
for epoch in range(num_epochs):
    prediction = model(train_features_tensor)
    loss = criterion(prediction, train_labels_tensor)
    # Clear gradients left over from the previous step before backpropagating
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")


Epoch 1, Loss: 0.6385
Epoch 2, Loss: 0.5084
Epoch 3, Loss: 0.3991
Epoch 4, Loss: 0.3094
Epoch 5, Loss: 0.2373
Epoch 6, Loss: 0.1805
Epoch 7, Loss: 0.1365
Epoch 8, Loss: 0.1027
Epoch 9, Loss: 0.0772
Epoch 10, Loss: 0.0581
Epoch 11, Loss: 0.0439
Epoch 12, Loss: 0.0336
Epoch 13, Loss: 0.0260
Epoch 14, Loss: 0.0204
Epoch 15, Loss: 0.0163
Epoch 16, Loss: 0.0133
Epoch 17, Loss: 0.0110
Epoch 18, Loss: 0.0092
Epoch 19, Loss: 0.0079
Epoch 20, Loss: 0.0069


## Evaluate the performance on the test and validation samples:

In [8]:
# Estimate accuracy on the test and validation samples.
# The metric binarizes the model's outputs at the 0.5 threshold itself.
accu = Accuracy(threshold=0.5, task = 'binary')

# Inference only: run the forward passes in eval mode and without autograd,
# so the predictions do not keep a computation graph alive. The previous
# train/eval mode is restored afterwards so this cell leaves the model as it
# found it.
was_training = model.training
model.eval()
with torch.no_grad():
    test_preds = model(test_features_tensor)
    val_preds = model(val_features_tensor)
model.train(was_training)

test_accuracy = accu(test_preds,test_labels_tensor).item()
print("test accuracy: ", test_accuracy)
val_accuracy = accu(val_preds,val_labels_tensor).item()
print("val_accuracy: ", val_accuracy)

test accuracy:  0.9459800124168396
val_accuracy:  0.9999523758888245


## The accuracies are very high. Let's also look at precision and recall:

In [9]:
from torchmetrics import Precision, Recall

# Binary precision and recall, both binarizing the predictions at 0.5.
precision = Precision(task="binary", threshold=0.5)
recall = Recall(task="binary", threshold=0.5)

# Test sample: precision first, then recall
test_precision, test_recall = (
    metric(test_preds, test_labels_tensor).item() for metric in (precision, recall)
)
print("test precision: ", test_precision,"recall: ",test_recall)

# Validation sample: same two metrics, same order
val_precision, val_recall = (
    metric(val_preds, val_labels_tensor).item() for metric in (precision, recall)
)
print("validation precision: ", val_precision,"recall: ",val_recall)



test precision:  1.0 recall:  0.9404639005661011
validation precision:  1.0 recall:  0.9885495901107788


## The performance on these figures of merit is also very good!