# Import all the required libraries

In [1]:
import numpy as np
import pandas as pd
import os
import re
import string
import zipfile
import ast
import gensim
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, BatchNormalization
from keras.regularizers import l2
from keras.optimizers import Adam
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.utils import shuffle

# The Dataset

## Download the dataset

In [2]:
!wget https://raw.githubusercontent.com/verazuo/a-labelled-version-of-the-ADFA-LD-dataset/refs/heads/master/ADFA-LD.zip

--2024-12-02 17:01:39--  https://raw.githubusercontent.com/verazuo/a-labelled-version-of-the-ADFA-LD-dataset/refs/heads/master/ADFA-LD.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2381193 (2.3M) [application/zip]
Saving to: ‘ADFA-LD.zip’


2024-12-02 17:01:39 (42.7 MB/s) - ‘ADFA-LD.zip’ saved [2381193/2381193]



## Clean and store the data in CSV file

### Unzip the dataset file

In [3]:
zip_file_path = "/content/ADFA-LD.zip"
# zip_file_path = "/kaggle/working/ADFA-LD.zip"

# Extract the zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall()

### Store the data in CSV file with respective attack label

In [4]:
#Paths to your dataset folders
train_data_dir = '/content/ADFA-LD/Training_Data_Master'
attack_data_dir = '/content/ADFA-LD/Attack_Data_Master'

# train_data_dir = '/kaggle/working/ADFA-LD/Training_Data_Master'
# attack_data_dir = '/kaggle/working/ADFA-LD/Attack_Data_Master'

#Initialize a list to store the data
data = []

# Function to read system calls from a text file
def read_system_calls(file_path):
    with open(file_path, 'r') as file:
        # Read, strip, split, and convert to integers
        system_calls = list(map(int, file.read().strip().split()))
    return system_calls

#Function to extract the base attack name (removes last "_" and numbers)
def extract_attack_name(attack_folder_name):
    return re.sub(r'_\d+$', '', attack_folder_name)  # Remove the last "_" and following digits

#Process normal system calls (label = 0)
for file_name in os.listdir(train_data_dir):
    file_path = os.path.join(train_data_dir, file_name)
    system_calls = read_system_calls(file_path)
    data.append([system_calls, 'normal', 0])  # Label 0 for normal system calls, 'normal' for the name

#Process attack system calls
#Mapping attack types to unique labels
attack_label_map = {}
current_label = 1


for attack_folder in os.listdir(attack_data_dir):
    attack_type = extract_attack_name(attack_folder)  #Extract the base attack name

    if attack_type not in attack_label_map:
        attack_label_map[attack_type] = current_label
        current_label += 1  #Increment the label for the next attack type

    attack_label = attack_label_map[attack_type]
    folder_path = os.path.join(attack_data_dir, attack_folder)

    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        system_calls = read_system_calls(file_path)
        data.append([system_calls, attack_type, attack_label])  #Use the mapped label and attack type for this attack


#Create a DataFrame and save it as CSV
df = pd.DataFrame(data, columns=['System_Calls', 'Attack_Type', 'Label'])

#Save as CSV
df.to_csv('system_calls_with_labels.csv', index=False)



In [5]:
attack_label_map

{'Java_Meterpreter': 1,
 'Hydra_FTP': 2,
 'Hydra_SSH': 3,
 'Meterpreter': 4,
 'Adduser': 5,
 'Web_Shell': 6}

## Load validation data

In [6]:
val = []
# val_data_dir = "/kaggle/working/ADFA-LD/Validation_Data_Master"
val_data_dir = "/content/ADFA-LD/Validation_Data_Master"
for file_name in os.listdir(val_data_dir):
    file_path = os.path.join(val_data_dir, file_name)
    system_calls = read_system_calls(file_path)
    val.append(system_calls)

#Convert each integer to a string
val = [[str(num) for num in sublist] for sublist in val]

## Know the Dataset....

In [7]:
#Print unique attack types and label counts
label_counts = df['Attack_Type'].value_counts()
print("\nUnique attack types and their counts:")
print(label_counts)


Unique attack types and their counts:
Attack_Type
normal              833
Hydra_SSH           176
Hydra_FTP           162
Java_Meterpreter    124
Web_Shell           118
Adduser              91
Meterpreter          75
Name: count, dtype: int64


Data is imbalanced

# LSTM based binary classifier

## Load and process the dataset

In [8]:
df_binary = pd.read_csv('system_calls_with_labels.csv')

#Set Label to 1 where Label is not 0 for binary classification(label 0 - normal sequences, label 1 - attack sequences)
df_binary.loc[df['Label'] != 0, 'Label'] = 1

label_counts = df_binary['Label'].value_counts()
print("\nUnique attack types and their counts:")
print(label_counts)

#Convert the System_Calls from string representation to lists of strings, and strip spaces
df_binary['System_Calls'] = df_binary['System_Calls'].apply(lambda x: [elem.strip() for elem in x.strip('[]').split(',')])

#Drop the Attack_Type column
df_binary = df_binary.drop(columns=['Attack_Type'])

df_binary


Unique attack types and their counts:
Label
0    833
1    746
Name: count, dtype: int64


Unnamed: 0,System_Calls,Label
0,"[240, 311, 78, 240, 120, 221, 33, 120, 168, 24...",0
1,"[6, 78, 4, 118, 38, 197, 6, 196, 233, 196, 15,...",0
2,"[6, 11, 45, 33, 192, 33, 5, 197, 192, 6, 33, 5...",0
3,"[6, 91, 11, 45, 221, 221, 221, 33, 33, 192, 33...",0
4,"[174, 174, 174, 174, 174, 174, 221, 221, 221, ...",0
...,...,...
1574,"[3, 168, 168, 168, 265, 168, 3, 168, 265, 168,...",1
1575,"[162, 162, 162, 162, 114, 162, 162, 114, 162, ...",1
1576,"[142, 265, 265, 104, 3, 3, 175, 175, 175, 175,...",1
1577,"[168, 3, 168, 265, 168, 168, 265, 168, 168, 3,...",1


## Set aside some attack instances in validation dataset

In [9]:
#Sample attack instances for validation
num_attack_instances = 100
attack_df = df_binary[df_binary['Label'] != 0].sample(n=num_attack_instances, random_state=42)

#Remove sampled attack instances from the main dataset
df_binary = df_binary.drop(attack_df.index)

#Extract system calls and labels from the sampled attack instances
attack_traces = attack_df['System_Calls'].tolist()
attack_labels = attack_df['Label'].tolist()

val_binary = val.copy()

#Append attack instances to the validation dataset
val_binary.extend(attack_traces)

## Create custom Word2Vec model

### Create n-grams(n = 3 - trigram)

In [10]:
review_text = df_binary['System_Calls']

#Create bigrams
bigram = Phrases(review_text, min_count=10, threshold=5)  # Adjusted min_count and threshold
bigram_phraser = Phraser(bigram)

#Apply the bigram transformation
sequences_with_bigrams = [bigram_phraser[sequence] for sequence in review_text]

#Create trigrams
trigram = Phrases(sequences_with_bigrams, min_count=10, threshold=5)
trigram_phraser = Phraser(trigram)

#Apply trigrams
sequences_with_trigrams = [trigram_phraser[sequence] for sequence in sequences_with_bigrams]

### Define and train the Word2Vec model

In [11]:
custom_word2vec = Word2Vec(
    vector_size=100,   #Embedding size
    window=15,         #Context window size
    min_count=5,       #Minimum frequency for system calls to be included
    workers=10,        #Number of parallel workers
    sg=0               #cbow
)

custom_word2vec.build_vocab(sequences_with_trigrams)
custom_word2vec.train(sequences_with_trigrams, total_examples=len(sequences_with_trigrams), epochs=35)

print(custom_word2vec)

Word2Vec<vocab=149, vector_size=100, alpha=0.025>


## Create the training, testing and validation datasets

In [12]:
#Split data into training and testing
train_data, test_data = train_test_split(df_binary, test_size=0.20, random_state=42)

#Tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['System_Calls'])
train_sequences = tokenizer.texts_to_sequences(train_data['System_Calls'])
test_sequences = tokenizer.texts_to_sequences(test_data['System_Calls'])

val_sequences_binary = tokenizer.texts_to_sequences(val_binary)

#Padding sequences
max_len = 1000  # 90th percentile of max length
X_train = pad_sequences(train_sequences, maxlen=max_len, padding='post')
X_test = pad_sequences(test_sequences, maxlen=max_len, padding='post')
y_train = train_data['Label']
y_test = test_data['Label']

X_val_binary = pad_sequences(val_sequences_binary, maxlen=max_len, padding='post')
y_val_binary = np.zeros(4372)

y_val_binary = np.append(y_val_binary, attack_labels)

#Shuffle validation data and labels
X_val_binary, y_val_binary = shuffle(X_val_binary, y_val_binary, random_state=42)

print("Training data dimensions:",X_train.shape)
print("Testing data dimensions:",X_test.shape)
print("Validation data dimensions:",X_val_binary.shape)

Training data dimensions: (1183, 1000)
Testing data dimensions: (296, 1000)
Validation data dimensions: (4472, 1000)


### Create embedding matrix

In [13]:
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100

#Prepare embedding matrix
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in custom_word2vec.wv:
        embedding_matrix[i] = custom_word2vec.wv[word]

## Defining the model architecture

In [14]:
#Build Bidirectional LSTM model with Word2Vec embeddings
model1 = Sequential()

#Add the embedding layer using the pre-trained Word2Vec embeddings
model1.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                     weights=[embedding_matrix], input_length=1000, trainable=False))

#Add Bidirectional LSTM layers
model1.add(Bidirectional(LSTM(128, return_sequences=True)))  #Bidirectional LSTM layerr
model1.add(BatchNormalization())
model1.add(Dropout(0.5))  #Dropout for regularization
model1.add(Bidirectional(LSTM(64)))
model1.add(BatchNormalization())

#Add fully connected layers
model1.add(Dense(64, activation='relu'))
model1.add(Dropout(0.4))

#Output layer
model1.add(Dense(1, activation='sigmoid'))

#Compile the model
optimizer = Adam(learning_rate=0.0005)
model1.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

#Print the model summary
model1.summary()



In [15]:
#generating model architecture diagram
# from keras.utils import plot_model

# # Assuming `model` is your Keras model
# plot_model(model1, to_file='model.png', show_shapes=True, show_layer_names=True)

In [16]:
model1.fit(X_train, y_train, epochs=15, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/15
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 204ms/step - accuracy: 0.6018 - loss: 0.7136 - val_accuracy: 0.7500 - val_loss: 0.5754
Epoch 2/15
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 191ms/step - accuracy: 0.7852 - loss: 0.4503 - val_accuracy: 0.7736 - val_loss: 0.5051
Epoch 3/15
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 185ms/step - accuracy: 0.8370 - loss: 0.3426 - val_accuracy: 0.8176 - val_loss: 0.4534
Epoch 4/15
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 178ms/step - accuracy: 0.8750 - loss: 0.2801 - val_accuracy: 0.8345 - val_loss: 0.4023
Epoch 5/15
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 177ms/step - accuracy: 0.8767 - loss: 0.2830 - val_accuracy: 0.8412 - val_loss: 0.3719
Epoch 6/15
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 199ms/step - accuracy: 0.9028 - loss: 0.2385 - val_accuracy: 0.8480 - val_loss: 0.3360
Epoch 7/15
[1m19/19[0m [

<keras.src.callbacks.history.History at 0x7b0933caa4d0>

### Test the trained model on validation dataset

In [17]:
# Evaluate the model on the validation data
loss, accuracy = model1.evaluate(X_val_binary, y_val_binary, verbose=1)

print("Validation Accuracy:", accuracy)
print("Validation Loss:", loss)

[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 56ms/step - accuracy: 0.7994 - loss: 0.8699
Validation Accuracy: 0.7989713549613953
Validation Loss: 0.8430337309837341


# LSTM based multi-class classifier

## Load and process the dataset

In [25]:
df_multi = pd.read_csv('system_calls_with_labels.csv')

label_counts = df_multi['Label'].value_counts()
print("\nUnique attack types and their counts:")
print(label_counts)

#Convert the System_Calls from string representation to lists of strings, and strip spaces
df_multi['System_Calls'] = df_multi['System_Calls'].apply(lambda x: [elem.strip() for elem in x.strip('[]').split(',')])

#Drop the Attack_Type column
df_multi = df_multi.drop(columns=['Attack_Type'])


Unique attack types and their counts:
Label
0    833
3    176
2    162
1    124
6    118
5     91
4     75
Name: count, dtype: int64


In [26]:
df_multi

Unnamed: 0,System_Calls,Label
0,"[240, 311, 78, 240, 120, 221, 33, 120, 168, 24...",0
1,"[6, 78, 4, 118, 38, 197, 6, 196, 233, 196, 15,...",0
2,"[6, 11, 45, 33, 192, 33, 5, 197, 192, 6, 33, 5...",0
3,"[6, 91, 11, 45, 221, 221, 221, 33, 33, 192, 33...",0
4,"[174, 174, 174, 174, 174, 174, 221, 221, 221, ...",0
...,...,...
1574,"[3, 168, 168, 168, 265, 168, 3, 168, 265, 168,...",5
1575,"[162, 162, 162, 162, 114, 162, 162, 114, 162, ...",5
1576,"[142, 265, 265, 104, 3, 3, 175, 175, 175, 175,...",5
1577,"[168, 3, 168, 265, 168, 168, 265, 168, 168, 3,...",5


### Set aside some attack instances(make val dataset)

In [27]:
X_val_multi = val.copy()
y_val_multi = np.zeros(4372)

#Separate attack instances (labels 1-6) from normal instances (label 0)
attack_data = df_multi[df_multi['Label'] != 0]

#Set aside 10% attack data for validation
attack_data_for_val = attack_data.sample(frac=0.1, random_state=42)

#Remove the attack data instances set aside for validation from the training set
df_multi = df_multi.drop(attack_data_for_val.index)

#add the attack system calls (from attack_data_for_val) to X_val_multi
X_val_multi.extend(attack_data_for_val['System_Calls'].tolist())

#add the attack labels (from attack_data_for_val) to y_val_multi
y_val_multi = np.append(y_val_multi, attack_data_for_val['Label'].values)

#Shuffle the X_val_multi and y_val_multi together
X_val_multi, y_val_multi = shuffle(X_val_multi, y_val_multi, random_state=42)

print("Updated validation data dimensions:")
print("attack_data_for_vali length:", len(attack_data_for_val))
print("X_val_multi length:", len(X_val_multi))
print("y_val_multi length:", len(y_val_multi))

Updated validation data dimensions:
attack_data_for_vali length: 75
X_val_multi length: 4447
y_val_multi length: 4447


### Train test split

In [28]:
#Split data into training and testing
train_data, test_data = train_test_split(df_multi, test_size=0.20, random_state=42)

#Tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['System_Calls'])
train_sequences = tokenizer.texts_to_sequences(train_data['System_Calls'])
test_sequences = tokenizer.texts_to_sequences(test_data['System_Calls'])

val_sequences_multi = tokenizer.texts_to_sequences(X_val_multi)

#Padding sequences
max_len = 1000  # 90th percentile of max length
X_train = pad_sequences(train_sequences, maxlen=max_len, padding='post')
X_test = pad_sequences(test_sequences, maxlen=max_len, padding='post')
y_train = train_data['Label']
y_test = test_data['Label']


X_val_multi = pad_sequences(val_sequences_multi, maxlen=max_len, padding='post')

### Resampling the dataset

In [29]:
#Undersample class 0 to 350 samples in the training set
undersample = RandomUnderSampler(sampling_strategy={0: 350}, random_state=42)
X_train_under, y_train_under = undersample.fit_resample(X_train, y_train)

#Oversample other classes to 280 samples in the training set
oversample = RandomOverSampler(sampling_strategy={1: 280, 2: 280, 3: 280, 4: 280, 5: 280, 6: 280}, random_state=42)
X_train_balanced, y_train_balanced = oversample.fit_resample(X_train_under, y_train_under)

#Display new class distribution in the training set
print("New class distribution in training set:", Counter(y_train_balanced))

New class distribution in training set: Counter({0: 350, 1: 280, 2: 280, 3: 280, 4: 280, 5: 280, 6: 280})


## Defining the model architecture

In [30]:
model2 = Sequential()

#Add the embedding layer using the pre-trained Word2Vec embeddings
model2.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                     weights=[embedding_matrix], input_length=1000, trainable=False))

#Add Bidirectional LSTM layers
model2.add(Bidirectional(LSTM(128, return_sequences=True)))
model2.add(BatchNormalization())
model2.add(Dropout(0.5))  # Dropout for regularization
model2.add(Bidirectional(LSTM(64)))
model2.add(BatchNormalization())

#Add fully connected layers
model2.add(Dense(64, activation='relu', kernel_regularizer=l2(0.04)))
model2.add(Dropout(0.4))

#Output layer
model2.add(Dense(7, activation='softmax'))

#Compile the model
optimizer = Adam(learning_rate=0.0005, clipvalue=1.0)
model2.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

#Print the model summary
model2.summary()



In [31]:
model2.fit(X_train_balanced, y_train_balanced, epochs=30, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/30
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 205ms/step - accuracy: 0.2186 - loss: 5.6514 - val_accuracy: 0.4286 - val_loss: 4.7817
Epoch 2/30
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 182ms/step - accuracy: 0.3239 - loss: 4.6907 - val_accuracy: 0.4651 - val_loss: 4.2681
Epoch 3/30
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 181ms/step - accuracy: 0.4021 - loss: 4.1068 - val_accuracy: 0.5017 - val_loss: 3.8019
Epoch 4/30
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 185ms/step - accuracy: 0.4814 - loss: 3.6253 - val_accuracy: 0.5382 - val_loss: 3.3463
Epoch 5/30
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 182ms/step - accuracy: 0.5242 - loss: 3.2222 - val_accuracy: 0.5382 - val_loss: 2.9798
Epoch 6/30
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 180ms/step - accuracy: 0.5730 - loss: 2.8693 - val_accuracy: 0.5947 - val_loss: 2.6282
Epoch 7/30
[1m32/32[0m

<keras.src.callbacks.history.History at 0x7b08a96c6d10>

In [32]:
# # generating model architecture diagram
# from keras.utils import plot_model

# # Assuming `model` is your Keras model
# plot_model(model2, to_file='model2.png', show_shapes=True, show_layer_names=True)


### Test the trained model on validation dataset

In [35]:
print("Unique labels in y_val_multi:", np.unique(y_val_multi))

Unique labels in y_val_multi: [0. 1. 2. 3. 4. 5. 6.]


In [36]:
#Evaluate the model on the validation data
loss, accuracy = model2.evaluate(X_val_multi, y_val_multi, verbose=1)

print("Validation Accuracy:", accuracy)
print("Validation Loss:", loss)

[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 52ms/step - accuracy: 0.7769 - loss: 1.0709
Validation Accuracy: 0.7760288119316101
Validation Loss: 1.0881074666976929
