In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind, chi2_contingency

# Define dtypes for all known problematic columns
dtypes = {
    'frame.time': 'object',
    'ip.src_host': 'object',
    'ip.dst_host': 'object',
    'arp.dst.proto_ipv4': 'object',
    'arp.src.proto_ipv4': 'object',
    'icmp.checksum': 'object',
    'tcp.checksum': 'object',
    'tcp.flags': 'object',
    'tcp.flags.ack': 'object',
    'tcp.options': 'object',
    'tcp.payload': 'object',
    'http.request.method': 'object',
    'http.request.full_uri': 'object',
    'http.referer': 'object',
    'http.request.version': 'object',
    'http.host': 'object',
    'dns.qry.name': 'object',
    'mqtt.conack.flags': 'object',
    'mqtt.msg': 'object',
    'mqtt.protoname': 'object',
    'mqtt.topic': 'object',
    'Attack_type': 'object',
    'Attack_label': 'float32'
}

# Load datasets and assign Attack_type
df1 = pd.read_csv('./createddata/attack.csv', dtype=dtypes, low_memory=False)
df1['Attack_type'] = 'DDoS_scripted'
df1['Attack_label'] = 1.0
df2 = pd.read_csv('./createddata/normal.csv', dtype=dtypes, low_memory=False)
df2['Attack_type'] = 'Normal'
df2['Attack_label'] = 0.0
df3 = pd.read_csv('./createddata/EdgeIIoT_99K_subset.csv', dtype=dtypes, low_memory=False)

# Concatenate
df = pd.concat([df1, df2, df3], ignore_index=True)
print("Shape:", df.shape)

# Drop rows with NaN in Attack_label
df = df.dropna(subset=['Attack_label'])
print("Shape after dropping NaN Attack_label:", df.shape)

# Inspect frame.time for invalid values
print("\nUnique frame.time values (sample):")
print(df['frame.time'].head(10))
print("\nNull or empty frame.time values:")
print(df['frame.time'].isnull().sum())
print(df['frame.time'].str.strip().eq('').sum())

# Convert timestamp with stricter error handling
df['frame.time'] = pd.to_datetime(df['frame.time'], errors='coerce', format='%b %d, %Y %H:%M:%S.%f %Z')
df['hour'] = df['frame.time'].dt.hour.fillna(0).astype('int32')
df['minute'] = df['frame.time'].dt.minute.fillna(0).astype('int32')
df['second'] = df['frame.time'].dt.second.fillna(0).astype('int32')
df = df.drop('frame.time', axis=1)

# Handle object columns
# arp.dst.proto_ipv4, arp.src.proto_ipv4: Convert to binary (0 for '0', 1 for IP)
df['arp.dst.proto_ipv4'] = df['arp.dst.proto_ipv4'].apply(lambda x: 0 if x == '0' else 1 if isinstance(x, str) else 0).astype('int32')
df['arp.src.proto_ipv4'] = df['arp.src.proto_ipv4'].apply(lambda x: 0 if x == '0' else 1 if isinstance(x, str) else 0).astype('int32')

# tcp.flags.ack: Convert to binary (0 or 1)
if 'tcp.flags.ack' in df.columns:
    df['tcp.flags.ack'] = df['tcp.flags.ack'].apply(lambda x: 1 if x == '1' else 0 if isinstance(x, str) else 0).astype('int32')

# http.request.method: Handle missing values before converting to category
if 'http.request.method' in df.columns:
    if df['http.request.method'].notnull().sum() / len(df) < 0.1:
        print("Dropping http.request.method due to sparsity")
        df = df.drop('http.request.method', axis=1)
    else:
        # Fill NaN with 'unknown' to avoid invalid category
        df['http.request.method'] = df['http.request.method'].fillna('unknown')
        top_methods = df['http.request.method'].value_counts().head(5).index
        df['http.request.method'] = df['http.request.method'].apply(lambda x: x if x in top_methods else 'other').astype('category')
        print("Value counts for http.request.method:")
        print(df['http.request.method'].value_counts())

# Drop complex object columns
drop_cols = ['icmp.checksum', 'tcp.checksum', 'tcp.flags', 'tcp.options', 'tcp.payload', 
             'http.request.full_uri', 'http.referer', 'http.request.version', 'http.host', 
             'dns.qry.name', 'mqtt.conack.flags', 'mqtt.msg', 'mqtt.protoname', 'mqtt.topic']
df = df.drop([col for col in drop_cols if col in df.columns], axis=1)

# Handle IP addresses: Convert to segments
df['ip.src_host_segment'] = df['ip.src_host'].apply(lambda x: '.'.join(x.split('.')[:2]) if isinstance(x, str) else 'unknown')
df['ip.dst_host_segment'] = df['ip.dst_host'].apply(lambda x: '.'.join(x.split('.')[:2]) if isinstance(x, str) else 'unknown')
df = df.drop(['ip.src_host', 'ip.dst_host'], axis=1)

# Convert float64 to float32 for memory efficiency
for col in df.columns:
    if df[col].dtype == 'float64':
        df[col] = df[col].astype('float32')

# Fill missing numerical values with 0
# Handle categorical columns separately to avoid TypeError
for col in df.columns:
    if df[col].dtype.name == 'category':
        # Add 'unknown' to categories if not present
        if 'unknown' not in df[col].cat.categories:
            df[col] = df[col].cat.add_categories(['unknown'])
        df[col] = df[col].fillna('unknown')
    else:
        df[col] = df[col].fillna(0)

# Print remaining object/category columns
object_cols = df.select_dtypes(include=['object', 'category']).columns
print("Remaining object/category columns:", object_cols)

# Display basic info to confirm
print("\nFinal Dataset Info:")
print(df.info())
print("\nFinal Missing Values:")
print(df.isnull().sum())

Shape: (149000, 63)
Shape after dropping NaN Attack_label: (149000, 63)

Unique frame.time values (sample):
0    Jul 21, 2025 13:01:45.445292000 CDT
1    Jul 21, 2025 13:01:45.445453000 CDT
2    Jul 21, 2025 13:01:45.512349000 CDT
3    Jul 21, 2025 13:01:45.512483000 CDT
4    Jul 21, 2025 13:01:45.523326000 CDT
5    Jul 21, 2025 13:01:45.523428000 CDT
6    Jul 21, 2025 13:01:45.524954000 CDT
7    Jul 21, 2025 13:01:45.525036000 CDT
8    Jul 21, 2025 13:01:45.528167000 CDT
9    Jul 21, 2025 13:01:45.528292000 CDT
Name: frame.time, dtype: object

Null or empty frame.time values:
0
0
Value counts for http.request.method:
http.request.method
0.0        99000
unknown    49901
NOTIFY        92
GET            7
Name: count, dtype: int64
Remaining object/category columns: Index(['http.request.method', 'Attack_type', 'ip.src_host_segment',
       'ip.dst_host_segment'],
      dtype='object')

Final Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149000 entries, 0 to 148999
Data 

In [46]:
# Handle object columns
# arp.dst.proto_ipv4, arp.src.proto_ipv4: Convert to binary (0 for '0', 1 for IP)
df['arp.dst.proto_ipv4'] = df['arp.dst.proto_ipv4'].apply(lambda x: 0 if x == '0' else 1 if isinstance(x, str) else 0).astype('int32')
df['arp.src.proto_ipv4'] = df['arp.src.proto_ipv4'].apply(lambda x: 0 if x == '0' else 1 if isinstance(x, str) else 0).astype('int32')

# tcp.flags.ack: Convert to binary (0 or 1)
if 'tcp.flags.ack' in df.columns:
    df['tcp.flags.ack'] = df['tcp.flags.ack'].apply(lambda x: 1 if x == '1' else 0 if isinstance(x, str) else 0).astype('int32')

# http.request.method: Keep top 5 methods or drop if sparse
if 'http.request.method' in df.columns:
    if df['http.request.method'].notnull().sum() / len(df) < 0.1:
        df = df.drop('http.request.method', axis=1)
    else:
        top_methods = df['http.request.method'].value_counts().head(5).index
        df['http.request.method'] = df['http.request.method'].apply(lambda x: x if x in top_methods else 'other').astype('category')

# Drop complex object columns
drop_cols = ['icmp.checksum', 'tcp.checksum', 'tcp.flags', 'tcp.options', 'tcp.payload', 
             'http.request.full_uri', 'http.referer', 'http.request.version', 'http.host', 
             'dns.qry.name', 'mqtt.conack.flags', 'mqtt.msg', 'mqtt.protoname', 'mqtt.topic']
df = df.drop([col for col in drop_cols if col in df.columns], axis=1)

# Handle IP addresses: Convert to segments
df['ip.src_host_segment'] = df['ip.src_host'].apply(lambda x: '.'.join(x.split('.')[:2]) if isinstance(x, str) else 'unknown')
df['ip.dst_host_segment'] = df['ip.dst_host'].apply(lambda x: '.'.join(x.split('.')[:2]) if isinstance(x, str) else 'unknown')
df = df.drop(['ip.src_host', 'ip.dst_host'], axis=1)

# Convert float64 to float32
for col in df.columns:
    if df[col].dtype == 'float64':
        df[col] = df[col].astype('float32')

# Fill missing numerical values with 0
df.fillna(0, inplace=True)

# Print remaining object/category columns
object_cols = df.select_dtypes(include=['object', 'category']).columns
print("Remaining object/category columns:", object_cols)

TypeError: Cannot setitem on a Categorical with a new category (0), set the categories first