# Feature Engineering
This notebook will create a small set of robust, cross-dataset features derived from the canonical features.

## Goals
1. Normalize raw flow counts by time to capture traffic intensity (e.g., packets per second, bytes per second)
2. Capture flow density characteristics (e.g., average bytes per packet)
3. Encode directional asymmetry in traffic behavior (e.g., packet-level direction ratio, byte-level direction ratio)

In [1]:
!pip -q install "PyAthena[SQLAlchemy]" sqlalchemy s3fs

In [2]:
import boto3
import sagemaker
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, text

# Display settings
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.width", None)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


## Connect to Athena

In [3]:
sess = sagemaker.Session()
region = boto3.Session().region_name

results_bucket = sess.default_bucket()
athena_results_path = f"s3://{results_bucket}/athena/staging/"

database_name = "aai540_eda"

engine = create_engine(
    f"awsathena+rest://@athena.{region}.amazonaws.com:443/{database_name}",
    connect_args={"s3_staging_dir": athena_results_path, "region_name": region},
)
print("Region:", region)
print("Athena results:", athena_results_path)

Region: us-east-1
Athena results: s3://sagemaker-us-east-1-933747558592/athena/staging/


In [4]:
# Helper functions for queries
def exec_ddl(sql: str):
    with engine.begin() as conn:
        conn.execute(text(sql))

def read_sql(sql: str) -> pd.DataFrame:
    return pd.read_sql(sql, engine)

## Verify merged dataset

In [5]:
merged_dataset = read_sql(f"""
SELECT *
FROM {database_name}.merged_canonical_normalized
Limit 5
""")
merged_dataset

Unnamed: 0,duration,pkt_total,bytes_total,pkt_fwd,pkt_bwd,bytes_fwd,bytes_bwd,label,original_attack_type,attack_category,source_dataset
0,1.110921,30,11748,12,18,1580,10168,0,,Normal,UNSW-NB15
1,1.064872,30,11748,12,18,1580,10168,0,,Normal,UNSW-NB15
2,0.004351,36,3198,22,14,1470,1728,0,,Normal,UNSW-NB15
3,0.287203,34,3844,16,18,1272,2572,0,,Normal,UNSW-NB15
4,0.444692,248,23264,122,126,7816,15448,0,,Normal,UNSW-NB15


## Engineered Features
1. **`pkt_rate`** (*`pkt_rate = pkt_total / duration`*): Number of packets transmitted per second during the flow.
2. **`byte_rate`** (*`byte_rate = bytes_total / duration`*): Total bytes transferred per second during the flow.
3. **`bytes_per_pkt`** (*`bytes_per_pkt = bytes_total / (pkt_total + 1)`*): Average number of bytes carried per packet in the flow.
4. **`pkt_ratio`** (*`pkt_ratio = pkt_fwd / (pkt_bwd + 1)`*): Ratio of forward packets to backward packets in the flow.
5. **`byte_ratio`** (*`byte_ratio = bytes_fwd / (bytes_bwd + 1)`*): Ratio of forward bytes to backward bytes in the flow.

## **`pkt_rate`** (*`pkt_rate = pkt_total / duration`*)

This creates a new table version: `merged_canonical_normalized_v1`

In [6]:
# drop table if it already exists
exec_ddl(f"DROP TABLE IF EXISTS {database_name}.merged_canonical_normalized_v1")

# create table pkt_rate
exec_ddl(f"""
CREATE TABLE {database_name}.merged_canonical_normalized_v1
WITH (
  format = 'PARQUET',
  external_location = 's3://{results_bucket}/aai540/processed/merged_canonical_normalized_v1/',
  parquet_compression = 'SNAPPY'
) AS
SELECT
  *,
  CASE
    WHEN duration IS NULL OR duration <= 0 THEN NULL
    ELSE CAST(pkt_total AS DOUBLE) / CAST(duration AS DOUBLE)
  END AS pkt_rate
FROM {database_name}.merged_canonical_normalized
""")

In [7]:
read_sql(f"""
SELECT duration, pkt_total, pkt_rate
FROM {database_name}.merged_canonical_normalized_v1
pkt_total
Limit 25
""")

Unnamed: 0,duration,pkt_total,pkt_rate
0,6e-06,2,333333.333333
1,0.005269,2,379.578668
2,64.360595,5,0.077687
3,3.025532,3,0.991561
4,60.651444,23,0.379216
5,64.295541,5,0.077766
6,60.066597,5,0.083241
7,60.170508,5,0.083097
8,0.002379,2,840.689365
9,0.0,1,


### Sanity check

In [8]:
read_sql(f"""
SELECT
  COUNT(*) AS rows_total,
  SUM(CASE WHEN duration IS NULL OR duration <= 0 THEN 1 ELSE 0 END) AS bad_duration_rows,
  SUM(CASE WHEN (duration IS NULL OR duration <= 0) AND pkt_rate IS NOT NULL THEN 1 ELSE 0 END) AS pkt_rate_should_be_null_but_isnt
FROM {database_name}.merged_canonical_normalized_v1
""")


Unnamed: 0,rows_total,bad_duration_rows,pkt_rate_should_be_null_but_isnt
0,26708942,5325854,0


## **`byte_rate`** (*`byte_rate = bytes_total / duration`*)

This creates a new table version: `merged_canonical_normalized_v2`

In [9]:
# drop v2 if it already exists
exec_ddl(f"DROP TABLE IF EXISTS {database_name}.merged_canonical_normalized_v2")

# Create v2 with byte_rate (built on v1 so pkt_rate is retained)
exec_ddl(f"""
CREATE TABLE {database_name}.merged_canonical_normalized_v2
WITH (
  format = 'PARQUET',
  external_location = 's3://{results_bucket}/aai540/processed/merged_canonical_normalized_v2/',
  parquet_compression = 'SNAPPY'
) AS
SELECT
  *,
  CASE
    WHEN duration IS NULL OR duration <= 0 THEN NULL
    ELSE CAST(bytes_total AS DOUBLE) / CAST(duration AS DOUBLE)
  END AS byte_rate
FROM {database_name}.merged_canonical_normalized_v1
""")

In [10]:
read_sql(f"""
SELECT duration, bytes_total, byte_rate, pkt_total, pkt_rate
FROM {database_name}.merged_canonical_normalized_v2
WHERE duration IS NOT NULL
LIMIT 25
""")

Unnamed: 0,duration,bytes_total,byte_rate,pkt_total,pkt_rate
0,0.0,0,,1,
1,0.0,0,,1,
2,0.0,0,,1,
3,0.0,0,,1,
4,0.0,0,,1,
5,0.0,0,,1,
6,0.0,0,,1,
7,0.0,0,,1,
8,0.0,0,,1,
9,0.0,0,,1,


### Sanity check

In [11]:
read_sql(f"""
SELECT
  COUNT(*) AS rows_total,
  SUM(CASE WHEN duration IS NULL OR duration <= 0 THEN 1 ELSE 0 END) AS bad_duration_rows,
  SUM(CASE WHEN (duration IS NULL OR duration <= 0) AND byte_rate IS NOT NULL THEN 1 ELSE 0 END) AS byte_rate_should_be_null_but_isnt
FROM {database_name}.merged_canonical_normalized_v2
""")


Unnamed: 0,rows_total,bad_duration_rows,byte_rate_should_be_null_but_isnt
0,26708942,5325854,0


## **`bytes_per_pkt`** (*`bytes_per_pkt = bytes_total / (pkt_total + 1)`*)

This creates a new table version: `merged_canonical_normalized_v3`

In [12]:
# drop v3 if it already exists
exec_ddl(f"DROP TABLE IF EXISTS {database_name}.merged_canonical_normalized_v3")

# create v3 with bytes_per_pkt (built on v2 so prior features are retained)
exec_ddl(f"""
CREATE TABLE {database_name}.merged_canonical_normalized_v3
WITH (
  format = 'PARQUET',
  external_location = 's3://{results_bucket}/aai540/processed/merged_canonical_normalized_v3/',
  parquet_compression = 'SNAPPY'
) AS
SELECT
  *,
  CASE
    WHEN pkt_total IS NULL OR pkt_total <= 0 THEN NULL
    ELSE CAST(bytes_total AS DOUBLE) / CAST(pkt_total AS DOUBLE)
  END AS bytes_per_pkt
FROM {database_name}.merged_canonical_normalized_v2
""")

In [13]:
read_sql(f"""
SELECT duration, pkt_total, bytes_total, bytes_per_pkt
FROM {database_name}.merged_canonical_normalized_v3
WHERE bytes_total > 0
LIMIT 25
""")

Unnamed: 0,duration,pkt_total,bytes_total,bytes_per_pkt
0,0.000523,2,137,68.5
1,0.00058,2,212,106.0
2,0.000392,2,130,65.0
3,0.000504,2,212,106.0
4,0.000443,2,137,68.5
5,0.000336,2,130,65.0
6,0.000373,2,178,89.0
7,0.000535,2,137,68.5
8,0.000421,2,130,65.0
9,0.00059,2,212,106.0


### Sanity check

In [14]:
read_sql(f"""
SELECT
  COUNT(*) AS rows_total,
  SUM(CASE WHEN pkt_total IS NULL OR pkt_total <= 0 THEN 1 ELSE 0 END) AS bad_pkt_total_rows,
  SUM(CASE WHEN (pkt_total IS NULL OR pkt_total <= 0) AND bytes_per_pkt IS NOT NULL THEN 1 ELSE 0 END) AS bytes_per_pkt_should_be_null_but_isnt
FROM {database_name}.merged_canonical_normalized_v3
""")

Unnamed: 0,rows_total,bad_pkt_total_rows,bytes_per_pkt_should_be_null_but_isnt
0,26708942,115260,0


## **`pkt_ratio`** (*`pkt_ratio = pkt_fwd / (pkt_bwd + 1)`*)

This creates a new table version: `merged_canonical_normalized_v4`

In [15]:
# drop v4 if it already exists
exec_ddl(f"DROP TABLE IF EXISTS {database_name}.merged_canonical_normalized_v4")

# create v4 with pkt_ratio (built on v3 so prior features are retained)
exec_ddl(f"""
CREATE TABLE {database_name}.merged_canonical_normalized_v4
WITH (
  format = 'PARQUET',
  external_location = 's3://{results_bucket}/aai540/processed/merged_canonical_normalized_v4/',
  parquet_compression = 'SNAPPY'
) AS
SELECT
  *,
  CASE
    WHEN pkt_fwd IS NULL OR pkt_fwd < 0 THEN NULL
    WHEN pkt_bwd IS NULL OR pkt_bwd < 0 THEN NULL
    ELSE CAST(pkt_fwd AS DOUBLE) / (CAST(pkt_bwd AS DOUBLE) + 1.0)
  END AS pkt_ratio
FROM {database_name}.merged_canonical_normalized_v3
""")

In [16]:
read_sql(f"""
SELECT duration, pkt_total, pkt_fwd, pkt_bwd, pkt_ratio
FROM {database_name}.merged_canonical_normalized_v4
LIMIT 25
""")

Unnamed: 0,duration,pkt_total,pkt_fwd,pkt_bwd,pkt_ratio
0,0.0,1,1,0,1.0
1,0.0,1,1,0,1.0
2,9e-06,2,1,1,0.5
3,0.000972,2,1,1,0.5
4,4.3e-05,2,1,1,0.5
5,0.0,1,1,0,1.0
6,0.000126,2,1,1,0.5
7,0.016897,3,2,1,1.0
8,0.0,1,1,0,1.0
9,1.4e-05,2,1,1,0.5


## **`byte_ratio`** (*`byte_ratio = bytes_fwd / (bytes_bwd + 1)`*)

This creates a new table version: `merged_canonical_normalized_v5`

In [17]:
# drop v5 if it already exists
exec_ddl(f"DROP TABLE IF EXISTS {database_name}.merged_canonical_normalized_v5")

# Create v5 with byte_ratio (built on v4 so prior features are retained)
exec_ddl(f"""
CREATE TABLE {database_name}.merged_canonical_normalized_v5
WITH (
  format = 'PARQUET',
  external_location = 's3://{results_bucket}/aai540/processed/merged_canonical_normalized_v5/',
  parquet_compression = 'SNAPPY'
) AS
SELECT
  *,
  CASE
    WHEN bytes_fwd IS NULL OR bytes_fwd < 0 THEN NULL
    WHEN bytes_bwd IS NULL OR bytes_bwd < 0 THEN NULL
    ELSE CAST(bytes_fwd AS DOUBLE) / (CAST(bytes_bwd AS DOUBLE) + 1.0)
  END AS byte_ratio
FROM {database_name}.merged_canonical_normalized_v4
""")

In [18]:
read_sql(f"""
SELECT
  bytes_fwd,
  bytes_bwd,
  byte_ratio
FROM {database_name}.merged_canonical_normalized_v5
LIMIT 25
""")

Unnamed: 0,bytes_fwd,bytes_bwd,byte_ratio
0,39,39,0.975
1,470,1727,0.271991
2,43,43,0.977273
3,43,43,0.977273
4,153,107,1.416667
5,39,39,0.975
6,43,43,0.977273
7,550,1727,0.318287
8,202,171,1.174419
9,43,43,0.977273


In [19]:
## Finalize Feature Engineering Table

In [20]:
# drop final table if it already exists
exec_ddl(f"DROP TABLE IF EXISTS {database_name}.feature_engineered")

# create final feature_engineered table from v5
exec_ddl(f"""
CREATE TABLE {database_name}.feature_engineered
WITH (
  format = 'PARQUET',
  external_location = 's3://{results_bucket}/aai540/processed/feature_engineered/',
  parquet_compression = 'SNAPPY'
) AS
SELECT *
FROM {database_name}.merged_canonical_normalized_v5
""")


In [21]:
read_sql(f"""
SELECT
  pkt_rate,
  byte_rate,
  bytes_per_pkt,
  pkt_ratio,
  byte_ratio
FROM {database_name}.feature_engineered
LIMIT 10
""")

Unnamed: 0,pkt_rate,byte_rate,bytes_per_pkt,pkt_ratio,byte_ratio
0,45.086552,2164.155,48.0,1.428571,1.858736
1,500000.0,28500000.0,57.0,2.0,114.0
2,666666.666667,38000000.0,57.0,2.0,114.0
3,666666.666667,38000000.0,57.0,2.0,114.0
4,666666.666667,38000000.0,57.0,2.0,114.0
5,500000.0,28500000.0,57.0,2.0,114.0
6,500000.0,28500000.0,57.0,2.0,114.0
7,666666.666667,38000000.0,57.0,2.0,114.0
8,666666.666667,38000000.0,57.0,2.0,114.0
9,500000.0,28500000.0,57.0,2.0,114.0


## Clean Athena catalog

In [22]:
# tables_to_drop = [
#     "merged_canonical_normalized_v1",
#     "merged_canonical_normalized_v2",
#     "merged_canonical_normalized_v3",
#     "merged_canonical_normalized_v4",
#     "merged_canonical_normalized_v5",
# ]

# for t in tables_to_drop:
#     exec_ddl(f"DROP TABLE IF EXISTS {database_name}.{t}")


In [23]:
invalid_rows = read_sql("""
SELECT *
FROM aai540_eda.feature_engineered
WHERE
    duration IS NULL
 OR pkt_total IS NULL
 OR bytes_total IS NULL
 OR pkt_fwd IS NULL
 OR pkt_bwd IS NULL
 OR bytes_fwd IS NULL
 OR bytes_bwd IS NULL

 OR duration < 0
 OR pkt_total < 0
 OR bytes_total < 0
 OR pkt_fwd < 0
 OR pkt_bwd < 0
 OR bytes_fwd < 0
 OR bytes_bwd < 0
LIMIT 100
""")

invalid_rows


Unnamed: 0,duration,pkt_total,bytes_total,pkt_fwd,pkt_bwd,bytes_fwd,bytes_bwd,label,original_attack_type,attack_category,source_dataset,pkt_rate,byte_rate,bytes_per_pkt,pkt_ratio,byte_ratio
0,-1e-06,2,12,1,1,6,6,0,BENIGN,Normal,CIC-IDS2017,,,6.0,0.5,0.857143
1,-1e-06,2,6,1,1,6,0,0,BENIGN,Normal,CIC-IDS2017,,,3.0,0.5,6.0
2,-1e-06,2,12,1,1,6,6,0,BENIGN,Normal,CIC-IDS2017,,,6.0,0.5,0.857143
3,-1e-06,2,12,1,1,6,6,0,BENIGN,Normal,CIC-IDS2017,,,6.0,0.5,0.857143
4,-1e-06,2,8,1,1,6,2,0,BENIGN,Normal,CIC-IDS2017,,,4.0,0.5,2.0
5,-1e-06,2,8,1,1,6,2,0,BENIGN,Normal,CIC-IDS2017,,,4.0,0.5,2.0
6,-1e-06,2,12,1,1,6,6,0,BENIGN,Normal,CIC-IDS2017,,,6.0,0.5,0.857143
7,-1e-06,2,12,1,1,6,6,0,BENIGN,Normal,CIC-IDS2017,,,6.0,0.5,0.857143
8,-1e-06,2,0,1,1,0,0,0,BENIGN,Normal,CIC-IDS2017,,,0.0,0.5,0.0
9,-1e-06,2,12,1,1,6,6,0,BENIGN,Normal,CIC-IDS2017,,,6.0,0.5,0.857143
