In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# List every file available under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))



/kaggle/input/NF-UQ-NIDS-V2-Sample.csv
/kaggle/input/NF-UQ-NIDS-V2_Sample-2.csv


In [2]:
# Load the second NF-UQ-NIDS-V2 sample file into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/NF-UQ-NIDS-V2_Sample-2.csv',
)

In [3]:
# Number of rows loaded.
len(df)

3799399

In [4]:
# Sorted distinct values of the 'Attack' column.
np.unique(df['Attack'].to_numpy())

array(['Analysis', 'Backdoor', 'Benign', 'Bot', 'Brute Force', 'DDoS',
       'DoS', 'Exploits', 'Fuzzers', 'Generic', 'Infilteration',
       'Reconnaissance', 'Shellcode', 'Theft', 'Worms', 'injection',
       'mitm', 'password', 'ransomware', 'scanning', 'xss'], dtype=object)

In [5]:
from sklearn.preprocessing import LabelEncoder
# Label encoder used inside processX, which refits it on each IPv4 address
# column in turn (source first, then destination).
LE1 = LabelEncoder()

In [6]:
def processX(X):
    """Encode, impute and min-max scale a raw feature frame.

    Steps, in order:
      1. Label-encode 'IPV4_SRC_ADDR' and 'IPV4_DST_ADDR' with the
         notebook-level ``LE1`` encoder. ``LE1`` is refit for each column,
         so after the call it holds only the destination-address classes.
      2. Mean-impute missing values in every non-object column.
      3. Replace missing values in any remaining object column with
         the string "unknown".
      4. Rescale all columns with ``MinMaxScaler``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing the two IPv4 address columns. The caller's
        frame is left unmodified; the function works on a copy.

    Returns
    -------
    pandas.DataFrame
        Scaled features with the same row index as ``X``. Object columns
        (if any) come first, followed by the non-object columns in their
        original order.

    Notes
    -----
    * Every intermediate frame is built with ``X``'s own index. Building
      them with a fresh 0..n-1 index and then aligning/joining on index
      only works when ``X`` happens to have a default RangeIndex.
    * NOTE(review): any object column still present at step 4 is passed to
      ``MinMaxScaler`` as strings, which is expected to raise; in this
      notebook only the two address columns appear to be non-numeric.
    * NOTE(review): the imputer and scaler are fit on whatever frame is
      passed in. The notebook calls this on the full dataset before the
      train/validation/test split, so held-out rows influence the fitted
      statistics.
    """
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import MinMaxScaler

    # Work on a copy so the caller's DataFrame is not modified.
    X = X.copy()

    # ip addresses are in format xxx.xxx.xxx.xxx which is not compatible with
    # the models. Assigning the encoder's ndarray is positional, so no index
    # alignment is involved.
    for address_col in ('IPV4_SRC_ADDR', 'IPV4_DST_ADDR'):
        X[address_col] = LE1.fit_transform(X[address_col])

    X_object = X.select_dtypes("object").fillna("unknown")
    X_int = X.select_dtypes(exclude="object")

    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    X_int = pd.DataFrame(
        imputer.fit_transform(X_int),
        columns=X_int.columns,  # the imputer returns a bare ndarray
        index=X_int.index,
    )

    # Both pieces share X's index, so a column-wise concat lines rows up
    # exactly (object columns first, as before).
    X = pd.concat([X_object, X_int], axis=1)

    scaler = MinMaxScaler()
    X_scaled = pd.DataFrame(
        scaler.fit_transform(X),
        columns=X.columns,
        index=X.index,
    )

    return X_scaled

In [7]:
from sklearn.preprocessing import LabelEncoder
# Label encoder for the 'Attack' target; it is fit on the filtered labels
# in a later cell.
LE = LabelEncoder()

In [8]:
# Target: the attack name. Predictors: every other column except the ones
# dropped below, passed through processX (encode / impute / scale).
y = df['Attack']
X = processX(
    df.drop(columns=['Attack', 'Dataset', 'Label', 'Attack_Class'])
)

In [9]:

# Restrict the data set to the attack classes under study.
target_classes = ['Benign', 'DDoS', 'DoS', 'Brute Force','injection']

# Boolean row mask (positional), computed once and applied to both X and y
# so the two stay row-for-row consistent.
keep_rows = np.isin(y, target_classes)
X = X[keep_rows]
y = y[keep_rows]


In [10]:
# Integer-encode the attack labels.
# The encoded Series keeps y's existing row index. After the class filter
# that index has gaps, and X carries the same gapped labels; wrapping the
# codes in a fresh 0..n-1 index would leave X and y with different labels and
# break any later index-aligned operation between them.
multilbl = LE.fit_transform(y)
y = pd.Series(multilbl, index=y.index)
# Access the original classes
classes = LE.classes_
# Print the mapping from integer code to class name
for i, cls in enumerate(classes):
    print(f"{i}: {cls}")

0: Benign
1: Brute Force
2: DDoS
3: DoS
4: injection


In [11]:
from tensorflow.keras.utils import to_categorical
# One-hot encode the integer class labels.
# NOTE(review): y_cat is not referenced by any other cell visible here; the
# train/test split that follows uses the integer-coded y. Confirm it is needed.
y_cat = to_categorical(y)

In [12]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing, then carve 25% of the remaining rows
# off as a validation set (roughly 60/20/20 overall). The same seed is used
# for both splits.
x_train_dl, x_test_dl, y_train_dl, y_test_dl = train_test_split(
    X, y,
    random_state=55,
    test_size=0.2,
)
x_train_dl, x_val_dl, y_train_dl, y_val_dl = train_test_split(
    x_train_dl, y_train_dl,
    random_state=55,
    test_size=0.25,
)

In [13]:
# Distinct encoded class labels present in y, in ascending order.
unique_attacks = sorted(set(y))

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

In [15]:
from sklearn.feature_selection import RFE

In [16]:
# # Feature Selection for each attack type (OvR)
# for i in range(0,3):
    
#     # Create OvR target variable
#     y_train_ovr = (y_train_dl == i).astype(int)
#     y_test_ovr = (y_test_dl == i).astype(int)
#     y_val_ovr = (y_val_dl == i).astype(int)
    
#     # Create and train the OvR model
#     print("Feature Importance for",i)
#     estimator = RandomForestClassifier()
#     selector = RFE(estimator,n_features_to_select=15,verbose=1)
#     selector.fit(x_train_dl, y_train_ovr)
    
#     print("Selcting features")
#     X_train_selected = selector.transform(x_train_dl)
#     X_test_selected = selector.transform(x_test_dl)
#     X_val_selected = selector.transform(x_val_dl)

#     print("Seletected Features: ",selector.get_feature_names_out(input_features=None))
#     print(X_train_selected)

In [17]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

from sklearn.linear_model import LinearRegression

In [18]:
# Feature Selection for each attack type (OvR)
#
# For every encoded class, build a one-vs-rest 0/1 target and run forward
# sequential feature selection for 15 features on the training split.
# The loop iterates over `classes` instead of a hard-coded count, and the
# features chosen for each class are kept in `selected_features_by_class`
# (class name -> list of column names). Without that record, each iteration
# overwrites the previous selection and only the last class's survives.
#
# NOTE(review): the estimator is a LinearRegression scored by negative MSE
# against a binary target; confirm this is intended rather than a classifier.
selected_features_by_class = {}
for i, class_name in enumerate(classes):

    # Create OvR target variable: 1 where the row belongs to class i, else 0
    y_train_ovr = (y_train_dl == i).astype(int)
    y_test_ovr = (y_test_dl == i).astype(int)
    y_val_ovr = (y_val_dl == i).astype(int)

    # Create and train the OvR model
    print("Feature Importance for",class_name)
    lreg=LinearRegression()
    selector=SFS(lreg,k_features=15,forward=True,verbose=1,scoring="neg_mean_squared_error")
    selector.fit(x_train_dl, y_train_ovr)

    selected_features=list(selector.k_feature_names_)
    selected_features_by_class[str(class_name)] = selected_features
    print("Selcting features")
    # Project every split onto the columns chosen for this class
    X_train_selected = x_train_dl[selected_features]
    X_test_selected = x_test_dl[selected_features]
    X_val_selected = x_val_dl[selected_features]

    print("Seletected Features: ",selected_features)
    print(X_train_selected)

Feature Importance for Benign


Features: 15/15

Selcting features
Seletected Features:  ['IPV4_SRC_ADDR', 'L4_DST_PORT', 'L7_PROTO', 'TCP_FLAGS', 'CLIENT_TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS', 'DURATION_IN', 'MAX_TTL', 'SHORTEST_FLOW_PKT', 'MIN_IP_PKT_LEN', 'RETRANSMITTED_OUT_PKTS', 'TCP_WIN_MAX_IN', 'TCP_WIN_MAX_OUT', 'ICMP_TYPE', 'FTP_COMMAND_RET_CODE']
         IPV4_SRC_ADDR  L4_DST_PORT  L7_PROTO  TCP_FLAGS  CLIENT_TCP_FLAGS  \
179087        0.538815     0.001221  0.028226   0.008969          0.008969   
285799        0.538860     0.001221  0.758065   0.000000          0.000000   
3016282       0.538815     0.001221  0.758065   0.000000          0.000000   
2875841       0.538838     0.001221  0.028226   0.008969          0.008969   
615171        0.538472     0.030671  0.000000   0.098655          0.008969   
...                ...          ...       ...        ...               ...   
3356501       0.285891     0.000809  0.000000   0.000000          0.000000   
912430        0.538838     0.001221  0.758065   0.000000       

Features: 15/15

Selcting features
Seletected Features:  ['IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'L4_DST_PORT', 'PROTOCOL', 'L7_PROTO', 'CLIENT_TCP_FLAGS', 'SERVER_TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS', 'MAX_TTL', 'LONGEST_FLOW_PKT', 'SHORTEST_FLOW_PKT', 'MIN_IP_PKT_LEN', 'NUM_PKTS_128_TO_256_BYTES', 'TCP_WIN_MAX_IN', 'DNS_QUERY_ID']
         IPV4_SRC_ADDR  IPV4_DST_ADDR  L4_DST_PORT  PROTOCOL  L7_PROTO  \
179087        0.538815       0.274445     0.001221  0.023529  0.028226   
285799        0.538860       0.274445     0.001221  0.066667  0.758065   
3016282       0.538815       0.274746     0.001221  0.066667  0.758065   
2875841       0.538838       0.274746     0.001221  0.023529  0.028226   
615171        0.538472       0.272036     0.030671  0.023529  0.000000   
...                ...            ...          ...       ...       ...   
3356501       0.285891       0.199925     0.000809  0.066667  0.000000   
912430        0.538838       0.274445     0.001221  0.066667  0.758065   
3619840       0.5

Features: 15/15

Selcting features
Seletected Features:  ['IPV4_SRC_ADDR', 'L4_SRC_PORT', 'L4_DST_PORT', 'PROTOCOL', 'L7_PROTO', 'TCP_FLAGS', 'CLIENT_TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS', 'MAX_TTL', 'LONGEST_FLOW_PKT', 'SHORTEST_FLOW_PKT', 'RETRANSMITTED_OUT_BYTES', 'TCP_WIN_MAX_IN', 'ICMP_IPV4_TYPE', 'DNS_QUERY_ID']
         IPV4_SRC_ADDR  L4_SRC_PORT  L4_DST_PORT  PROTOCOL  L7_PROTO  \
179087        0.538815     0.744564     0.001221  0.023529  0.028226   
285799        0.538860     0.290349     0.001221  0.066667  0.758065   
3016282       0.538815     0.948531     0.001221  0.066667  0.758065   
2875841       0.538838     0.457389     0.001221  0.023529  0.028226   
615171        0.538472     0.794827     0.030671  0.023529  0.000000   
...                ...          ...          ...       ...       ...   
3356501       0.285891     0.796612     0.000809  0.066667  0.000000   
912430        0.538838     0.019806     0.001221  0.066667  0.758065   
3619840       0.538792     0.019928     0.0012

Features: 15/15

Selcting features
Seletected Features:  ['L4_SRC_PORT', 'L4_DST_PORT', 'PROTOCOL', 'L7_PROTO', 'IN_PKTS', 'TCP_FLAGS', 'CLIENT_TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS', 'DURATION_IN', 'SHORTEST_FLOW_PKT', 'MIN_IP_PKT_LEN', 'TCP_WIN_MAX_IN', 'ICMP_IPV4_TYPE', 'DNS_QUERY_ID', 'FTP_COMMAND_RET_CODE']
         L4_SRC_PORT  L4_DST_PORT  PROTOCOL  L7_PROTO   IN_PKTS  TCP_FLAGS  \
179087      0.744564     0.001221  0.023529  0.028226  0.000003   0.008969   
285799      0.290349     0.001221  0.066667  0.758065  0.000003   0.000000   
3016282     0.948531     0.001221  0.066667  0.758065  0.000003   0.000000   
2875841     0.457389     0.001221  0.023529  0.028226  0.000000   0.008969   
615171      0.794827     0.030671  0.023529  0.000000  0.000000   0.098655   
...              ...          ...       ...       ...       ...        ...   
3356501     0.796612     0.000809  0.066667  0.000000  0.000000   0.000000   
912430      0.019806     0.001221  0.066667  0.758065  0.000005   0.000000   

Features: 15/15

Selcting features
Seletected Features:  ['IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'L4_DST_PORT', 'TCP_FLAGS', 'CLIENT_TCP_FLAGS', 'MAX_TTL', 'LONGEST_FLOW_PKT', 'MIN_IP_PKT_LEN', 'RETRANSMITTED_OUT_BYTES', 'RETRANSMITTED_OUT_PKTS', 'SRC_TO_DST_AVG_THROUGHPUT', 'TCP_WIN_MAX_IN', 'TCP_WIN_MAX_OUT', 'ICMP_TYPE', 'DNS_QUERY_ID']
         IPV4_SRC_ADDR  IPV4_DST_ADDR  L4_DST_PORT  TCP_FLAGS  \
179087        0.538815       0.274445     0.001221   0.008969   
285799        0.538860       0.274445     0.001221   0.000000   
3016282       0.538815       0.274746     0.001221   0.000000   
2875841       0.538838       0.274746     0.001221   0.008969   
615171        0.538472       0.272036     0.030671   0.098655   
...                ...            ...          ...        ...   
3356501       0.285891       0.199925     0.000809   0.000000   
912430        0.538838       0.274445     0.001221   0.000000   
3619840       0.538792       0.274445     0.001221   0.098655   
3603017       0.289066       0