In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from category_encoders import TargetEncoder
from sklearn.feature_extraction.text import TfidfVectorizer


In [10]:
#################################################################################
# Load the pickled dataset
# NOTE(review): unpickling can execute arbitrary code; only load 'final_df.pkl'
# when it comes from a trusted source.
df = pd.read_pickle('final_df.pkl')

#################################################################################
# Normalise the string '0.0' to '0' in every object-dtype column
# (numeric columns are not touched by this step)
zero_aliases = {'0.0': '0', '0': '0'}
object_columns = df.select_dtypes(include=['object']).columns
for obj_col in object_columns:
    df[obj_col] = df[obj_col].replace(zero_aliases)

#################################################################################
# Drop columns that hold exactly one distinct value
# (nunique ignores NaN by default, so an all-NaN column counts 0 and is kept)
distinct_counts = df.nunique()
constant_columns = [name for name, n_distinct in distinct_counts.items() if n_distinct == 1]
df = df.drop(columns=constant_columns)
print(f"Dropped constant columns: {constant_columns}")

#################################################################################
# Columns removed by hand, grouped by the reason for removing them
high_corr_columns = ['tcp.payload']

low_info_columns = [
    'arp.dst.proto_ipv4',
    'arp.src.proto_ipv4',
    'mqtt.conack.flags',
    'mqtt.conflag.cleansess',
    'mqtt.conflags',
    'mqtt.hdrflags',
    'mqtt.len',
    'mqtt.msg',
    'mqtt.msgtype',
    'mqtt.proto_len',
    'mqtt.protoname',
    'mqtt.topic',
    'mqtt.topic_len',
    'mqtt.ver',
]

inconsistent_columns = [
    'http.referer',
    'http.request.full_uri',
    'icmp.transmit_timestamp',
]

#################################################################################
# Drop high correlation columns
df = df.drop(columns=high_corr_columns)
print(f"Dropped high correlation columns: {high_corr_columns}")

#################################################################################
# Drop low information columns
df = df.drop(columns=low_info_columns)
print(f"Dropped low information columns: {low_info_columns}")

#################################################################################
# Drop inconsistent data columns
df = df.drop(columns=inconsistent_columns)
print(f"Dropped inconsistent columns: {inconsistent_columns}")
#################################################################################
# Drop outliers: keep only rows inside both caps.
# Comparisons against NaN are False, so rows missing either value are removed too.
within_seq_cap = df['tcp.seq'] <= 2500000
within_length_cap = df['http.content_length'] <= 2000
df = df[within_seq_cap & within_length_cap]

#################################################################################
# Cast the TCP connection / ACK flag columns to int
bool_col = [
    'tcp.connection.fin',
    'tcp.connection.rst',
    'tcp.connection.syn',
    'tcp.connection.synack',
    'tcp.flags.ack',
]
for flag_col in bool_col:
    df[flag_col] = df[flag_col].astype(int)

#################################################################################
# Replace tcp.options with the length of its string value
def _options_length(value):
    """Return len(value) for a string and 0 for any other kind of value."""
    if isinstance(value, str):
        return len(value)
    return 0

df['tcp.options'] = df['tcp.options'].map(_options_length)

#################################################################################
# Convert frame.time to datetime and round it to the nearest whole second
# Characters 6..20 of the raw string are parsed as an 'HH:MM:SS.ffffff' time of day
# (assumes every value is a string with a fixed-width prefix -- TODO confirm against the raw data).
frame_clock = df['frame.time'].apply(lambda t: t[6:21])
# The format carries no date, so every timestamp lands on pandas' default date.
frame_clock = pd.to_datetime(frame_clock, format='%H:%M:%S.%f')
# Lowercase 's' is the supported alias for seconds; the uppercase 'S' spelling is
# deprecated in recent pandas and emits a FutureWarning.
df['frame.time'] = frame_clock.dt.round('s')

#################################################################################
# Change Attack_types to Integers
type_mapping = {
    'Normal': 12,  # Assuming 12 is the label for Normal
    'Backdoor': 0,
    'DDoS_HTTP': 1,
    'DDoS_ICMP': 2,
    'DDoS_TCP': 3,
    'OS_Fingerprinting': 4,
    'Password': 5,
    'Port_Scanning': 6,
    'Ransomware': 7,
    'SQL_injection': 8,
    'Uploading': 9,
    'Vulnerability_scanner': 10,
    'XSS': 11}

# Series.map is a plain dictionary lookup. Series.replace on an object column
# depends on implicit downcasting to end up with integers, which pandas has
# deprecated (FutureWarning), and it silently keeps any label that is not a key.
attack_codes = df['Attack_type'].map(type_mapping)

# map() yields NaN for every label without an entry; fail loudly instead of
# passing a partially encoded target on to the split and the encoders.
unmapped_labels = df.loc[attack_codes.isna(), 'Attack_type'].unique().tolist()
if unmapped_labels:
    raise ValueError(f"Attack_type contains labels missing from type_mapping: {unmapped_labels}")

df['Attack_type'] = attack_codes.astype('int64')

#################################################################################
# Hold out 30% of the rows as the test set, stratified on the attack type
label_columns = ['Attack_type', 'Attack_label']
X = df.drop(columns=label_columns)
y = df['Attack_type'].to_frame()
y_label = df['Attack_label']  # kept for reference; not passed to the split below
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=8,
)

#################################################################################
# Target Encoding
# Target-encode the source/destination host and port columns.
col_to_target_encode = ['ip.src_host', 
                         'ip.dst_host', 
                         'tcp.dstport', 
                         'tcp.srcport']

# The encoder is fitted on the training split only (with y_train as the target) and
# then applied unchanged to the test split, so test labels never enter the fit.
# NOTE(review): y_train holds the integer codes produced by type_mapping, so the
# encoder receives class ids as its target -- confirm that this is intended.
te = TargetEncoder(cols=col_to_target_encode)
X_train[col_to_target_encode] = te.fit_transform(X_train[col_to_target_encode], y_train)
X_test[col_to_target_encode] = te.transform(X_test[col_to_target_encode])
#################################################################################
# Normalisation
# Standardise every column of X_train / X_test except the flag columns, the
# non-numeric columns and the columns listed for one-hot encoding.
bool_col = [
    'tcp.connection.fin',
    'tcp.connection.rst',
    'tcp.connection.syn',
    'tcp.connection.synack',
    'tcp.flags.ack',
]
cols_ohe = [
    'arp.opcode',
    'arp.hw.size',
    'http.request.method',
    'http.request.version',
    'tcp.flags',
    'tcp.options',
]
cols_non_numeric = [
    'frame.time',
    'http.request.uri.query',
    'http.file_data',
]
cols_to_not_normalize = bool_col + cols_non_numeric + cols_ohe

# Set lookup for the exclusion test; column order follows X_train.columns.
excluded_columns = set(cols_to_not_normalize)
cols_to_normalize = [col for col in X_train.columns if col not in excluded_columns]

# Mean and variance are learned from the training split only.
scaler = StandardScaler()
scaler.fit(X_train[cols_to_normalize])
X_train[cols_to_normalize] = scaler.transform(X_train[cols_to_normalize])
X_test[cols_to_normalize] = scaler.transform(X_test[cols_to_normalize])

#################################################################################
# One Hot Encoding
# The encoder learns its categories from the training split only. With
# handle_unknown='ignore', a category that occurs only in the test split is encoded
# as all zeros for that feature instead of making transform() raise a ValueError.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_train_encoded = ohe.fit_transform(X_train[cols_ohe])
X_test_encoded = ohe.transform(X_test[cols_ohe])

# Convert the encoded arrays to DataFrames with appropriate column names.
# The original row index is reused so the join below aligns row by row.
ohe_feature_names = ohe.get_feature_names_out(cols_ohe)
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=ohe_feature_names, index=X_train.index)
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=ohe_feature_names, index=X_test.index)

# Drop the original columns and concatenate the new encoded columns
X_train = X_train.drop(columns=cols_ohe).join(X_train_encoded_df)
X_test = X_test.drop(columns=cols_ohe).join(X_test_encoded_df)

Dropped constant columns: ['icmp.unused', 'http.tls_port', 'udp.port', 'udp.stream', 'udp.time_delta', 'dns.qry.name', 'dns.qry.name.len', 'dns.qry.qu', 'dns.qry.type', 'dns.retransmission', 'dns.retransmit_request', 'dns.retransmit_request_in', 'mqtt.msg_decoded_as', 'mbtcp.len', 'mbtcp.trans_id', 'mbtcp.unit_id']
Dropped high correlation columns: ['tcp.payload']
Dropped low information columns: ['arp.dst.proto_ipv4', 'arp.src.proto_ipv4', 'mqtt.conack.flags', 'mqtt.conflag.cleansess', 'mqtt.conflags', 'mqtt.hdrflags', 'mqtt.len', 'mqtt.msg', 'mqtt.msgtype', 'mqtt.proto_len', 'mqtt.protoname', 'mqtt.topic', 'mqtt.topic_len', 'mqtt.ver']
Dropped inconsistent columns: ['http.referer', 'http.request.full_uri', 'icmp.transmit_timestamp']


  df['Attack_type'] = df['Attack_type'].replace(type_mapping)


In [11]:
#################################################################################
# Tokenisation
cols_to_tokenize = ['http.request.uri.query', 'http.file_data']

# Concatenate the columns to tokenize into a single string for each row
def concatenate_columns(row):
    """Merge the text columns of one row into a single document string.

    The string '0' is treated as the "no value" placeholder in each column.

    Parameters
    ----------
    row : mapping-like (for example a pandas Series)
        Must be indexable by every name in ``cols_to_tokenize``.

    Returns
    -------
    '0' when every column holds the placeholder, the single real value when
    only one column has one, otherwise the real values joined by one space.
    """
    present = [row[col] for col in cols_to_tokenize if row[col] != '0']
    if not present:
        return '0'
    if len(present) == 1:
        return present[0]
    # Join with a space: gluing the texts together directly fuses the last token
    # of one column with the first token of the next before vectorisation.
    return ' '.join(present)

# Build one text document per row from the columns to tokenize (row-wise apply)
df_concatenated_train = X_train[cols_to_tokenize].apply(concatenate_columns, axis=1)
df_concatenated_test = X_test[cols_to_tokenize].apply(concatenate_columns, axis=1)

# Everything except the raw text columns is carried over as-is
normal_features_train = X_train.drop(columns=cols_to_tokenize)
normal_features_test = X_test.drop(columns=cols_to_tokenize)

# The vocabulary and IDF weights are learned from the training documents only
tfidf_vectorizer = TfidfVectorizer(min_df = 0.01)
tfidf_matrix_train = tfidf_vectorizer.fit_transform(df_concatenated_train)
tfidf_matrix_test = tfidf_vectorizer.transform(df_concatenated_test)

# One dense column per vocabulary term
tfidf_columns = tfidf_vectorizer.get_feature_names_out()
tfidf_df_train = pd.DataFrame(tfidf_matrix_train.toarray(), columns=tfidf_columns)
tfidf_df_test = pd.DataFrame(tfidf_matrix_test.toarray(), columns=tfidf_columns)

# Join the tfidf features back with the normal features.
# Both sides get a fresh positional index so the rows line up one to one.
X_train_combined = pd.concat(
    [normal_features_train.reset_index(drop=True), tfidf_df_train.reset_index(drop=True)],
    axis=1,
)
X_test_combined = pd.concat(
    [normal_features_test.reset_index(drop=True), tfidf_df_test.reset_index(drop=True)],
    axis=1,
)

# The targets receive the same fresh index as the combined feature frames
X_train, X_test = X_train_combined, X_test_combined
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Save the preprocessed train/test splits to disk

In [12]:
# Write each preprocessed split to its own pickle file in the working directory
for split_frame, pickle_name in (
    (X_train, 'X_train.pkl'),
    (X_test, 'X_test.pkl'),
    (y_train, 'y_train.pkl'),
    (y_test, 'y_test.pkl'),
):
    split_frame.to_pickle(pickle_name)