# TON_IoT Dataset - Exploratory Data Analysis

Analysis of missing values and label distribution for the TON_IoT network traffic dataset.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)
sns.set_style('whitegrid')

## Load TON_IoT Dataset

In [None]:
# Define path and get CSV files
ton_path = '/home/sagemaker-user/data/TON_IOT'
ton_files = sorted(Path(ton_path).glob('*.csv'))

print(f"TON_IOT files: {len(ton_files)}")
for file in ton_files:
    print(f"  - {file.name}")

TON_IOT files: 23
  - Network_dataset_1.csv
  - Network_dataset_10.csv
  - Network_dataset_11.csv
  - Network_dataset_12.csv
  - Network_dataset_13.csv
  - Network_dataset_14.csv
  - Network_dataset_15.csv
  - Network_dataset_16.csv
  - Network_dataset_17.csv
  - Network_dataset_18.csv
  - Network_dataset_19.csv
  - Network_dataset_2.csv
  - Network_dataset_20.csv
  - Network_dataset_21.csv
  - Network_dataset_22.csv
  - Network_dataset_23.csv
  - Network_dataset_3.csv
  - Network_dataset_4.csv
  - Network_dataset_5.csv
  - Network_dataset_6.csv
  - Network_dataset_7.csv
  - Network_dataset_8.csv
  - Network_dataset_9.csv


: 

In [3]:
# Load and concatenate all files
print("Loading TON_IOT dataset...")

ton_dfs = []
for i, file in enumerate(ton_files, 1):
    print(f"  [{i}/{len(ton_files)}] {file.name}")
    df = pd.read_csv(file)
    ton_dfs.append(df)

ton_data = pd.concat(ton_dfs, ignore_index=True)
print(f"\nTotal shape: {ton_data.shape}")
print(f"Rows: {ton_data.shape[0]:,}")
print(f"Columns: {ton_data.shape[1]}")

Loading TON_IOT dataset...
  [1/23] Network_dataset_1.csv


  df = pd.read_csv(file)


  [2/23] Network_dataset_10.csv
  [3/23] Network_dataset_11.csv
  [4/23] Network_dataset_12.csv
  [5/23] Network_dataset_13.csv
  [6/23] Network_dataset_14.csv
  [7/23] Network_dataset_15.csv
  [8/23] Network_dataset_16.csv
  [9/23] Network_dataset_17.csv
  [10/23] Network_dataset_18.csv
  [11/23] Network_dataset_19.csv
  [12/23] Network_dataset_2.csv
  [13/23] Network_dataset_20.csv
  [14/23] Network_dataset_21.csv
  [15/23] Network_dataset_22.csv


  df = pd.read_csv(file)


  [16/23] Network_dataset_23.csv


  df = pd.read_csv(file)


  [17/23] Network_dataset_3.csv
  [18/23] Network_dataset_4.csv
  [19/23] Network_dataset_5.csv
  [20/23] Network_dataset_6.csv
  [21/23] Network_dataset_7.csv


: 

: 

## Missing Values Analysis

In [None]:
print("TON_IOT MISSING VALUES\n")

missing = ton_data.isnull().sum()
missing_pct = (missing / len(ton_data)) * 100
missing_df = pd.DataFrame({
    'Missing Count': missing,
    'Percentage': missing_pct
}).sort_values('Missing Count', ascending=False)

print(f"Total missing values: {missing.sum():,}")
print(f"Columns with missing values: {(missing > 0).sum()}")
print(f"Total columns: {len(missing)}")

if missing.sum() > 0:
    print("\nColumns with missing values:")
    display(missing_df[missing_df['Missing Count'] > 0])
else:
    print("\nNo missing values found!")

## Label Distribution Analysis

**Label Structure**: Binary label (Normal vs Attack) + Attack type categories (DoS, DDoS, Backdoor, Injection, Reconnaissance, Scanning, XSS, Password, Ransomware, MITM)

In [None]:
print("TON_IOT LABEL DISTRIBUTION\n")

# Check for label and type columns
label_col_binary = None
type_col = None

for col in ton_data.columns:
    col_lower = col.lower().strip()
    if col_lower == 'label':
        label_col_binary = col
    elif col_lower == 'type':
        type_col = col

# Binary label (if exists)
if label_col_binary:
    print(f"1. Binary Label: '{label_col_binary}'")
    binary_dist = ton_data[label_col_binary].value_counts()
    binary_pct = (binary_dist / len(ton_data) * 100)
    
    binary_df = pd.DataFrame({
        'Count': binary_dist,
        'Percentage': binary_pct.round(2)
    })
    display(binary_df)
    print(f"\nTotal samples: {len(ton_data):,}")

# Attack type distribution
if type_col:
    print(f"\n2. Attack Types: '{type_col}'")
    type_dist = ton_data[type_col].value_counts()
    type_pct = (type_dist / len(ton_data) * 100)
    
    type_df = pd.DataFrame({
        'Count': type_dist,
        'Percentage': type_pct.round(2)
    })
    display(type_df)
    print(f"\nUnique attack types: {ton_data[type_col].nunique()}")
    
    # Identify which are Normal/Benign vs Attack types
    normal_keywords = ['normal', 'benign']
    normal_mask = ton_data[type_col].str.lower().isin(normal_keywords)
    normal_count = normal_mask.sum()
    attack_count = (~normal_mask).sum()
    
    print(f"\nBinary Classification (derived from type):")
    print(f"  Normal: {normal_count:,} ({normal_count/len(ton_data)*100:.2f}%)")
    print(f"  Attack: {attack_count:,} ({attack_count/len(ton_data)*100:.2f}%)")

In [None]:
if type_col:
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    
    # Binary distribution
    if label_col_binary:
        binary_dist.plot(kind='bar', ax=axes[0], color=['green', 'red'])
        axes[0].set_title('TON_IoT: Binary Label Distribution', fontsize=14, fontweight='bold')
        axes[0].set_xlabel('Label')
        axes[0].set_ylabel('Count')
        axes[0].tick_params(axis='x', rotation=45)
    else:
        # Use derived binary from type
        pd.Series({'Normal': normal_count, 'Attack': attack_count}).plot(
            kind='bar', ax=axes[0], color=['green', 'red']
        )
        axes[0].set_title('TON_IoT: Binary Classification (from Type)', fontsize=14, fontweight='bold')
        axes[0].set_xlabel('Class')
        axes[0].set_ylabel('Count')
        axes[0].tick_params(axis='x', rotation=0)
    
    # Attack types
    type_dist.plot(kind='barh', ax=axes[1], color='darkorange')
    axes[1].set_title('TON_IoT: Attack Type Distribution', fontsize=14, fontweight='bold')
    axes[1].set_xlabel('Count')
    axes[1].set_ylabel('Type')
    
    plt.tight_layout()
    plt.show()