In [None]:
import pandas as pd
import numpy as np

In [None]:
from sklearn.utils import shuffle
from hashlib import md5
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [None]:
# Location of the DNN-EdgeIIoT CSV. A raw string keeps every backslash literal;
# the earlier non-raw literal contained the unrecognised escape "\S"
# ("dataset\Selected"), which newer Python versions warn about.
DATASET_CSV_PATH = r"~\kaggle\input\edgeiiotset-cyber-security-dataset-of-iot-iiot\Edge-IIoTset dataset\Selected dataset for ML and DL\DNN-EdgeIIoT-dataset.csv"

# low_memory=False: infer each column's dtype from the whole file instead of
# chunk by chunk, which avoids mixed-type columns.
df = pd.read_csv(DATASET_CSV_PATH, low_memory=False)

In [None]:
# Short aliases for the application-layer columns that are encoded later on.
column_aliases = {
    'http.request.method': 'http1',
    'http.referer': 'http2',
    'http.request.version': 'http3',
    'dns.qry.name.len': 'dns',
    'mqtt.conack.flags': 'mqtt1',
    'mqtt.protoname': 'mqtt2',
    'mqtt.topic': 'mqtt3',
}
df = df.rename(columns=column_aliases)

In [None]:
# Show every column name after the renaming step.
column_names = df.columns.tolist()
print(column_names)

In [None]:
# Layer 7 (application-layer) feature columns, referred to by their short aliases.
layer_7_columns = ['http1', 'http2', 'http3', 'dns', 'mqtt1', 'mqtt2', 'mqtt3']

# Keep only rows that carry at least one non-null application-layer field.
df_layer_7 = df[df[layer_7_columns].notna().any(axis=1)]

# Attack_type values retained for the layer-7 subset.
layer_7_attacks = ['SQL_injection', 'Password', 'DDoS_HTTP', 'Uploading', 'Backdoor', 'XSS', 'Ransomware', 'MITM', 'Normal']

# Filter by attack type.
df_layer_7 = df_layer_7[df_layer_7["Attack_type"].isin(layer_7_attacks)]

# Shuffle first, THEN reset the index. Resetting before shuffling leaves the
# frame with a permuted index, and any later column-wise pd.concat with a frame
# that has a default 0..n-1 index is aligned by label, silently attaching the
# new columns to the wrong rows. A fixed seed keeps the row order reproducible.
df_layer_7 = shuffle(df_layer_7, random_state=42)
df_layer_7 = df_layer_7.reset_index(drop=True)

# Class balance of the filtered subset.
print(df_layer_7['Attack_type'].value_counts())


In [None]:
# One independent LabelEncoder per application-layer column.
(le_http1, le_http2, le_http3, le_dns,
 le_mqtt1, le_mqtt2, le_mqtt3) = (LabelEncoder() for _ in range(7))

In [None]:
# Fit each encoder on its own column and store the integer codes in a new
# "<column>_encoded" column (same order as the original assignments).
for column, encoder in (
    ('http1', le_http1),
    ('http2', le_http2),
    ('http3', le_http3),
    ('dns', le_dns),
    ('mqtt1', le_mqtt1),
    ('mqtt2', le_mqtt2),
    ('mqtt3', le_mqtt3),
):
    df_layer_7[column + '_encoded'] = encoder.fit_transform(df_layer_7[column])

In [None]:
# One independent OneHotEncoder per label-encoded column.
(http1_, http2_, http3_, dns_,
 mqtt1_, mqtt2_, mqtt3_) = (OneHotEncoder() for _ in range(7))

In [None]:
def _dense_one_hot(encoder, codes):
    """Fit ``encoder`` on a Series of label codes and return the dense one-hot matrix."""
    # The encoder expects a 2-D input, so present the codes as a single column.
    column_vector = codes.values.reshape(-1, 1)
    return encoder.fit_transform(column_vector).toarray()


X1 = _dense_one_hot(http1_, df_layer_7.http1_encoded)
X2 = _dense_one_hot(http2_, df_layer_7.http2_encoded)
X3 = _dense_one_hot(http3_, df_layer_7.http3_encoded)
X4 = _dense_one_hot(dns_, df_layer_7.dns_encoded)
X5 = _dense_one_hot(mqtt1_, df_layer_7.mqtt1_encoded)
X6 = _dense_one_hot(mqtt2_, df_layer_7.mqtt2_encoded)
X7 = _dense_one_hot(mqtt3_, df_layer_7.mqtt3_encoded)

In [None]:
# Attach the one-hot matrices to df_layer_7 as "<prefix>_<k>" columns.
#
# The matrices were built from df_layer_7's rows in positional order, so each
# one-hot frame must carry df_layer_7's own index. pd.concat(axis=1) aligns on
# index labels; a default 0..n-1 index would pair the one-hot rows with the
# wrong records whenever df_layer_7's index is not exactly 0..n-1 in order
# (for example after a shuffle).
one_hot_blocks = (
    ("http1", X1),
    ("http2", X2),
    ("http3", X3),
    ("dns", X4),
    ("mqtt1", X5),
    ("mqtt2", X6),
    ("mqtt3", X7),
)
one_hot_frames = [
    pd.DataFrame(
        matrix,
        columns=[prefix + "_" + str(i) for i in range(matrix.shape[1])],
        index=df_layer_7.index,
    )
    for prefix, matrix in one_hot_blocks
]

# A single concat keeps the same column order as the original seven concats
# without copying the growing frame seven times.
df_layer_7 = pd.concat([df_layer_7, *one_hot_frames], axis=1)

In [None]:
# The raw application-layer columns are no longer needed once encoded.
raw_layer_7_columns = ['http1', 'http2', 'http3', 'dns', 'mqtt1', 'mqtt2', 'mqtt3']
df_layer_7.drop(labels=raw_layer_7_columns, axis=1, inplace=True)
df_layer_7.shape

In [None]:
# Number of missing values per column.
df_layer_7.isna().sum()

In [None]:
# Number of rows that repeat an earlier row exactly.
df_layer_7.duplicated(keep="first").sum()

In [None]:
# Keep the first occurrence of every fully duplicated row.
df_layer_7 = df_layer_7.drop_duplicates(subset=None, keep="first")

In [None]:
def hash_column(series):
    """Return an MD5 hex digest that fingerprints the values of one column.

    Each element is hashed with ``pd.util.hash_pandas_object`` (the index is
    ignored via ``index=False``) and the resulting array of per-row hashes is
    fed to MD5, so two columns with the same values in the same order produce
    the same digest.

    Parameters
    ----------
    series : pandas.Series
        Column to fingerprint.

    Returns
    -------
    str
        32-character hexadecimal MD5 digest.
    """
    row_hashes = pd.util.hash_pandas_object(series, index=False).values
    return md5(row_hashes).hexdigest()


def find_identical_columns_by_hash(df):
    """Group the columns of ``df`` whose contents hash to the same digest.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are compared. The function now inspects this
        argument; it previously ignored it and read a global frame instead.

    Returns
    -------
    list[list]
        One list of column labels per group of two or more columns sharing a
        digest, in order of first appearance. Empty when no columns match.
    """
    columns_by_digest = {}
    for col in df.columns:
        columns_by_digest.setdefault(hash_column(df[col]), []).append(col)

    return [cols for cols in columns_by_digest.values() if len(cols) > 1]

# Run the duplicate-column search on the layer-7 frame and print every group
# of columns whose hashed contents match.
identical_column_groups = find_identical_columns_by_hash(df_layer_7)
print("Groups of identical columns:", identical_column_groups)

In [None]:
# Hard-coded groups of columns treated as identical to one another.
identical_column_groups = [
    ['icmp.unused', 'http.tls_port', 'dns.qry.type', 'mqtt.msg_decoded_as'],
    ['mqtt.conflag.cleansess', 'mqtt2_2'],
    ['mqtt.proto_len', 'mqtt.ver'],
    ['mqtt1_1', 'mqtt2_1', 'mqtt3_1'],
]

# In every group the first column is kept as the representative and the
# remaining ones are removed.
for duplicate_group in identical_column_groups:
    _representative, *redundant_columns = duplicate_group
    df_layer_7 = df_layer_7.drop(columns=redundant_columns)

In [None]:
# Summary of the frame (columns, dtypes, non-null counts) after removing the
# redundant identical columns.
df_layer_7.info()

In [None]:
# Class balance after removing the redundant identical columns.
attack_type_counts = df_layer_7['Attack_type'].value_counts()
print(attack_type_counts)

In [None]:
# Summarise the non-numeric columns only.
df_cat = df_layer_7.select_dtypes(exclude=np.number)
df_cat.describe(include='all')

In [None]:
# Columns excluded from the exported feature set.
drop_columns = ["frame.time", "ip.src_host", "ip.dst_host", "arp.dst.proto_ipv4", "arp.src.proto_ipv4",
                "http.file_data", "http.request.uri.query",
                "http.request.full_uri", "tcp.options", "tcp.payload",
                "tcp.srcport", "mqtt.msg"]

# Remove those columns, then any row with a missing value, then exact
# duplicate rows (keeping the first occurrence). Written as a chain rather
# than inplace mutations so each step's input and output are explicit.
df_layer_7 = (
    df_layer_7
    .drop(columns=drop_columns)
    .dropna(axis=0, how='any')
    .drop_duplicates(subset=None, keep="first")
)

# Fixed seed so the row order of the exported file is reproducible across runs.
df_layer_7 = shuffle(df_layer_7, random_state=42)

In [None]:
# Summary of the frame after dropping the unused columns, rows with missing
# values, and duplicate rows.
df_layer_7.info()

In [None]:
# Class balance after the final row-level cleaning.
attack_type_counts = df_layer_7['Attack_type'].value_counts()
print(attack_type_counts)

In [None]:
# Remove the 'icmp.unused' column from the frame.
columns_to_drop = ['icmp.unused']
df_layer_7 = df_layer_7.drop(labels=columns_to_drop, axis=1)

In [None]:
# Summary of the frame after dropping 'icmp.unused'.
df_layer_7.info()

In [None]:
# Class balance of the frame that is about to be exported.
attack_type_counts = df_layer_7['Attack_type'].value_counts()
print(attack_type_counts)

In [None]:
# Write the cleaned layer-7 DataFrame (df_layer_7) to CSV; index=False omits
# the row index from the file.
df_layer_7.to_csv('Edge-IIoTset_112.csv', index=False)