# DATA EXPLORATION

In [75]:
import pandas as pd  # For working with tables/data
import numpy as np   # For mathematical operations
import matplotlib.pyplot as plt  # For creating graphs
import seaborn as sns  # For beautiful visualizations

In [76]:
#defining paths for both benign and attack data 
benign_path = '../data/benign_data/benign_samples_1sec.csv'
attack_path = '../data/attack_data/attack_samples_1sec.csv'


In [77]:
attack_df = pd.read_csv(attack_path)
benign_df = pd.read_csv(benign_path)
# Add a label column to distinguish between benign and attack
benign_df['is_attack'] = 0  # 0 = benign
attack_df['is_attack'] = 1  # 1 = attack
# Basic exploration
print("Dataset shape:", attack_df.shape)
print("\nFirst few rows:")
attack_df.head()
benign_df.head()

Dataset shape: (90391, 95)

First few rows:


Unnamed: 0,device_name,device_mac,label_full,label1,label2,label3,label4,timestamp,timestamp_start,timestamp_end,...,network_time-delta_std_deviation,network_ttl_avg,network_ttl_max,network_ttl_min,network_ttl_std_deviation,network_window-size_avg,network_window-size_max,network_window-size_min,network_window-size_std_deviation,is_attack
0,router,28:87:ba:bd:c6:6c,benign_whole-network3,benign,benign,benign,benign,2025-09-09T14:09:40.400000Z_2025-09-09T14:09:4...,2025-09-09T14:09:40.400000Z,2025-09-09T14:09:41.400000Z,...,0.006059,62.8,64.0,61.0,1.469694,1870.5,3081.0,660.0,1210.5,0
1,router,28:87:ba:bd:c6:6c,benign_whole-network3,benign,benign,benign,benign,2025-09-09T14:09:41.400000Z_2025-09-09T14:09:4...,2025-09-09T14:09:41.400000Z,2025-09-09T14:09:42.400000Z,...,0.016469,62.5,64.0,61.0,1.5,1870.5,3081.0,660.0,1210.5,0
2,router,28:87:ba:bd:c6:6c,benign_whole-network3,benign,benign,benign,benign,2025-09-09T14:09:42.400000Z_2025-09-09T14:09:4...,2025-09-09T14:09:42.400000Z,2025-09-09T14:09:43.400000Z,...,0.034312,61.571429,64.0,53.0,3.736199,2441.285714,4736.0,135.0,1813.237335,0
3,router,28:87:ba:bd:c6:6c,benign_whole-network3,benign,benign,benign,benign,2025-09-09T14:09:43.400000Z_2025-09-09T14:09:4...,2025-09-09T14:09:43.400000Z,2025-09-09T14:09:44.400000Z,...,0.01279,62.5,64.0,61.0,1.5,1870.5,3081.0,660.0,1210.5,0
4,router,28:87:ba:bd:c6:6c,benign_whole-network3,benign,benign,benign,benign,2025-09-09T14:09:44.400000Z_2025-09-09T14:09:4...,2025-09-09T14:09:44.400000Z,2025-09-09T14:09:45.400000Z,...,0.017764,62.8,64.0,61.0,1.469694,2112.6,3081.0,660.0,1186.042933,0


In [78]:
# Combine both datasets
df = pd.concat([benign_df, attack_df], ignore_index=True)

# Shuffle the rows randomly
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [74]:
df.head(2)

Unnamed: 0,device_name,device_mac,label_full,label1,label2,label3,label4,timestamp,timestamp_start,timestamp_end,...,network_time-delta_std_deviation,network_ttl_avg,network_ttl_max,network_ttl_min,network_ttl_std_deviation,network_window-size_avg,network_window-size_max,network_window-size_min,network_window-size_std_deviation,is_attack
0,flame-sensor,08:b6:1f:82:ee:cc,attack_recon_host-disc-udp-ping_whole-network,attack,recon,host-disc-udp-ping,recon_host-disc-udp-ping,2025-01-15T13:23:21.055000Z_2025-01-15T13:23:2...,2025-01-15T13:23:21.055000Z,2025-01-15T13:23:22.055000Z,...,0.002282,149.714286,255.0,38.0,102.391007,3316.571429,5744.0,1024.0,2359.036538,1
1,plug-all-sensors,d4:a6:51:82:98:a8,benign_whole-network3,benign,benign,benign,benign,2025-09-09T14:47:54.400000Z_2025-09-09T14:47:5...,2025-09-09T14:47:54.400000Z,2025-09-09T14:47:55.400000Z,...,0.005997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [79]:
# Get detailed info about the dataset
print("📊 Dataset Information:")
print("="*50)
df.info()

📊 Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227191 entries, 0 to 227190
Data columns (total 95 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   device_name                           227191 non-null  object 
 1   device_mac                            227191 non-null  object 
 2   label_full                            227191 non-null  object 
 3   label1                                227191 non-null  object 
 4   label2                                227191 non-null  object 
 5   label3                                227191 non-null  object 
 6   label4                                227191 non-null  object 
 7   timestamp                             227191 non-null  object 
 8   timestamp_start                       227191 non-null  object 
 9   timestamp_end                         227191 non-null  object 
 10  log_data-ranges_avg                   227191 

In [80]:
df.describe()

Unnamed: 0,log_data-ranges_avg,log_data-ranges_max,log_data-ranges_min,log_data-ranges_std_deviation,log_data-types_count,log_interval-messages,log_messages_count,network_fragmentation-score,network_fragmented-packets,network_header-length_avg,...,network_time-delta_std_deviation,network_ttl_avg,network_ttl_max,network_ttl_min,network_ttl_std_deviation,network_window-size_avg,network_window-size_max,network_window-size_min,network_window-size_std_deviation,is_attack
count,227191.0,227191.0,227191.0,227191.0,227191.0,227191.0,227191.0,227191.0,227191.0,227191.0,...,227191.0,227191.0,227191.0,227191.0,227191.0,227191.0,227191.0,227191.0,227191.0,227191.0
mean,55.375012,56.110518,53.880292,0.672938,0.260926,1.452949,0.534097,0.398783,250.628722,11.704751,...,0.005553,72.026003,117.337474,43.074308,30.16164,9747.344636,22039.686062,1369.114252,8469.102742,0.397863
std,206.965792,208.576679,202.493839,14.058134,0.53599,25.622204,1.804779,2.024332,1279.984971,9.855393,...,0.012187,75.24013,118.495225,55.097936,42.914745,15490.955824,29966.94342,3591.103767,12674.47961,0.489458
min,0.0,0.0,-0.94,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,...,0.0001,64.0,64.0,38.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,...,0.005083,150.857143,255.0,64.0,93.964042,20224.919072,64062.0,1024.0,25533.022578,1.0
max,2000.0,2000.0,2000.0,574.959575,2.0,1000.0,191.0,10.79,6959.0,24.0,...,0.22802,255.0,255.0,255.0,127.0,65535.0,65535.0,65535.0,32767.5,1.0


In [81]:
# Check if we have label columns
print("🎯 Analyzing Target Variables (Labels):")
print("="*50)

# Simple binary label (attack vs benign)
print("\n1. Binary Classification (is_attack):")
print(df['is_attack'].value_counts())
print(f"\nPercentage distribution:")
print(df['is_attack'].value_counts(normalize=True) * 100)

# If label2 exists (attack categories like dos, ddos)
if 'label2' in df.columns:
    print("\n\n2. Attack Categories (label2):")
    print(df['label2'].value_counts())
    
# If label3 exists (specific attack types)
if 'label3' in df.columns:
    print("\n\n3. Specific Attack Types (label3):")
    print(df['label3'].value_counts())

# If label4 exists (full attack scenario names)
if 'label4' in df.columns:
    print("\n\n4. Full Attack Scenarios (label4):")
    print(df['label4'].value_counts())

🎯 Analyzing Target Variables (Labels):

1. Binary Classification (is_attack):
is_attack
0    136800
1     90391
Name: count, dtype: int64

Percentage distribution:
is_attack
0    60.213653
1    39.786347
Name: proportion, dtype: float64


2. Attack Categories (label2):
label2
benign        136800
recon          33648
dos            18420
ddos           18056
mitm            8062
malware         7541
web             2796
bruteforce      1868
Name: count, dtype: int64


3. Specific Attack Types (label3):
label3
benign                    136800
arp-spoofing                4196
mirai-udp-flood             4010
os-scan                     3968
host-disc-tcp-ack-ping      3947
                           ...  
tcp-flood-port-22             62
http-flood-port-554           61
http-flood-port-9595          61
http-flood-port-443           61
syn-flood-port-557            31
Name: count, Length: 61, dtype: int64


4. Full Attack Scenarios (label4):
label4
benign                          136800
m

In [82]:
# Create a summary report
summary = f"""
{'='*60}
DATASET SUMMARY REPORT - STEP 1
{'='*60}

1. DATASET SIZE:
   - Total samples: {len(df):,}
   - Total features: {df.shape[1]}
   - Benign samples: {len(benign_df):,}
   - Attack samples: {len(attack_df):,}


3. MISSING VALUES:
   - Columns with missing values: {len(missing_df) if len(missing_df) > 0 else 0}

4. DATA TYPES:
   - Numerical columns: {len(df.select_dtypes(include=[np.number]).columns)}
   - Categorical columns: {len(df.select_dtypes(include=['object']).columns)}

5. TARGET DISTRIBUTION:
   - Benign: {(df['is_attack']==0).sum():,} ({(df['is_attack']==0).sum()/len(df)*100:.2f}%)
   - Attack: {(df['is_attack']==1).sum():,} ({(df['is_attack']==1).sum()/len(df)*100:.2f}%)

{'='*60}
"""

print(summary)

# Save to file
with open('step1_summary.txt', 'w') as f:
    f.write(summary)
print("\n✅ Summary saved to 'step1_summary.txt'")

NameError: name 'missing_df' is not defined