In [1]:
import os
import pandas as pd
import numpy as np


In [2]:
PROCESSED_DIR = os.path.join(".", "processed")

df_mon = pd.read_csv(os.path.join(PROCESSED_DIR, "monday_clean.csv"))
df_tue = pd.read_csv(os.path.join(PROCESSED_DIR, "tuesday_clean.csv"))
df_fri = pd.read_csv(os.path.join(PROCESSED_DIR, "friday_clean.csv"))

print(df_mon.shape, df_tue.shape, df_fri.shape)
df_mon.head(2)


(529918, 70) (445909, 70) (225745, 70)


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,is_attack
0,49188,4,2,0,12,0,6,6,6.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN,0
1,49188,1,2,0,12,0,6,6,6.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN,0


In [3]:
df_attack_days = pd.concat([df_tue, df_fri], ignore_index=True)
print("Combined attack-days shape:", df_attack_days.shape)

# Quick check
df_attack_days["is_attack"].mean() * 100


Combined attack-days shape: (671654, 70)


21.121291617410154

In [4]:
required = [
    "Destination Port",
    "Flow Duration",
    "Flow Bytes/s",
    "Flow Packets/s",
    "SYN Flag Count",
    "RST Flag Count",
    "FIN Flag Count",
    "is_attack",
]
missing = [c for c in required if c not in df_attack_days.columns]
assert len(missing) == 0, f"Missing required columns: {missing}"
print("All required columns present ✅")


All required columns present ✅


In [5]:
def clip01(x):
    return np.clip(x, 0.0, 1.0)

def percentile_scale(series: pd.Series, low_q=0.05, high_q=0.95) -> pd.Series:
    """
    Scales a numeric series into 0..1 based on robust quantiles.
    Values below low_q -> 0, above high_q -> 1.
    """
    s = series.astype(float)
    lo = s.quantile(low_q)
    hi = s.quantile(high_q)
    denom = (hi - lo) if (hi - lo) != 0 else 1.0
    scaled = (s - lo) / denom
    return pd.Series(clip01(scaled), index=series.index)


In [14]:
df = df_attack_days.copy()

# Robustly scale behaviour signals
df["risk_bytes_rate"]   = percentile_scale(df["Flow Bytes/s"])
df["risk_pkts_rate"]    = percentile_scale(df["Flow Packets/s"])
df["risk_duration"]     = percentile_scale(df["Flow Duration"])

# TCP flag intensity (often useful for scans / floods)
flag_sum = (df["SYN Flag Count"] + df["RST Flag Count"] + df["FIN Flag Count"]).astype(float)
df["risk_flags"] = percentile_scale(flag_sum)

# Attack presence signal (label-based, used only for risk profiling)
df["risk_attack_label"] = df["is_attack"].astype(float)
df[["risk_bytes_rate","risk_pkts_rate","risk_duration","risk_flags","risk_attack_label"]].head(10)


Unnamed: 0,risk_bytes_rate,risk_pkts_rate,risk_duration,risk_flags,risk_attack_label
0,0.741331,0.154686,6e-06,0.0,0.0
1,1.0,0.129999,9e-06,0.0,0.0
2,1.0,0.082156,1.2e-05,0.0,0.0
3,0.956401,0.193736,5e-06,0.0,0.0
4,1.0,0.151357,8e-06,0.0,0.0
5,1.0,0.136915,1e-05,0.0,0.0
6,1.0,0.144384,9e-06,0.0,0.0
7,0.005289,0.000423,0.00584,0.0,0.0
8,0.0,1.0,0.0,1.0,0.0
9,1.0,0.150696,1.1e-05,0.0,0.0


In [7]:
# Weights (explainable): traffic rate + packet rate + flags + duration + label presence
W = {
    "risk_bytes_rate": 0.25,
    "risk_pkts_rate": 0.25,
    "risk_flags": 0.20,
    "risk_duration": 0.10,
    "risk_attack_label": 0.20,
}

df["risk_score_flow_0_1"] = (
    W["risk_bytes_rate"] * df["risk_bytes_rate"] +
    W["risk_pkts_rate"]  * df["risk_pkts_rate"]  +
    W["risk_flags"]      * df["risk_flags"]      +
    W["risk_duration"]   * df["risk_duration"]   +
    W["risk_attack_label"] * df["risk_attack_label"]
)

df["risk_score_flow"] = (df["risk_score_flow_0_1"] * 100).round(2)
df[["Destination Port", "risk_score_flow", "is_attack"]].head()


Unnamed: 0,Destination Port,risk_score_flow,is_attack
0,88,22.4,0
1,88,28.25,0
2,88,27.05,0
3,88,28.75,0
4,88,28.78,0


In [8]:
group = df.groupby("Destination Port", as_index=False).agg(
    total_flows=("Destination Port", "size"),
    attack_flows=("is_attack", "sum"),
    attack_pct=("is_attack", "mean"),
    avg_risk=("risk_score_flow", "mean"),
    p95_risk=("risk_score_flow", lambda x: float(np.percentile(x, 95))),
    avg_bytes_rate=("Flow Bytes/s", "mean"),
    avg_pkts_rate=("Flow Packets/s", "mean"),
)

group["attack_pct"] = (group["attack_pct"] * 100).round(3)

# A port-level overall risk score (blend of attack% and high-risk tail)
group["risk_score_port"] = (
    0.55 * percentile_scale(group["attack_pct"]) +
    0.45 * percentile_scale(group["p95_risk"])
)
group["risk_score_port"] = (group["risk_score_port"] * 100).round(2)

group.sort_values("risk_score_port", ascending=False).head(15)


Unnamed: 0,Destination Port,total_flows,attack_flows,attack_pct,avg_risk,p95_risk,avg_bytes_rate,avg_pkts_rate,risk_score_port
1,21,9163,7937,86.62,35.243885,90.0,698879.5,110907.2,100.0
2,22,8375,5897,70.412,17.717285,29.57,10147.87,17951.93,73.89
5,80,182627,128025,70.102,16.83852,29.12,47283.95,3189.444,73.6
12317,27636,1,1,100.0,20.4,20.4,9.099211,1.516535,67.97
39804,64873,2,1,50.0,10.45,19.612,5194.036,3.608864,67.46
39800,64869,2,1,50.0,10.45,19.612,5058.158,4.042094,67.46
2014,4019,1,0,0.0,70.0,70.0,7750000.0,500000.0,45.0
7124,11000,1,0,0.0,70.0,70.0,10300000.0,666666.7,45.0
37979,62920,1,0,0.0,70.0,70.0,12300000.0,666666.7,45.0
7099,10973,1,0,0.0,70.0,70.0,31000000.0,2000000.0,45.0


In [9]:
def risk_bucket(score):
    if score >= 80: return "Critical"
    if score >= 60: return "High"
    if score >= 35: return "Medium"
    return "Low"

port_risk = group.copy()
port_risk["risk_level"] = port_risk["risk_score_port"].apply(risk_bucket)

port_risk = port_risk.sort_values(
    ["risk_level", "risk_score_port", "attack_pct", "total_flows"],
    ascending=[True, False, False, False]
)

# Show top risky ports
top_ports = port_risk.sort_values("risk_score_port", ascending=False).head(20)
top_ports


Unnamed: 0,Destination Port,total_flows,attack_flows,attack_pct,avg_risk,p95_risk,avg_bytes_rate,avg_pkts_rate,risk_score_port,risk_level
1,21,9163,7937,86.62,35.243885,90.0,698879.5,110907.2,100.0,Critical
2,22,8375,5897,70.412,17.717285,29.57,10147.87,17951.93,73.89,High
5,80,182627,128025,70.102,16.83852,29.12,47283.95,3189.444,73.6,High
12317,27636,1,1,100.0,20.4,20.4,9.099211,1.516535,67.97,High
39800,64869,2,1,50.0,10.45,19.612,5058.158,4.042094,67.46,High
39804,64873,2,1,50.0,10.45,19.612,5194.036,3.608864,67.46,High
6943,10784,1,0,0.0,70.0,70.0,10300000.0,666666.7,45.0,Medium
6935,10775,1,0,0.0,70.0,70.0,10300000.0,666666.7,45.0,Medium
6936,10777,1,0,0.0,70.0,70.0,7750000.0,500000.0,45.0,Medium
6937,10778,1,0,0.0,70.0,70.0,31000000.0,2000000.0,45.0,Medium


In [10]:
OUT_DIR = os.path.join(".", "outputs")
os.makedirs(OUT_DIR, exist_ok=True)

out_path = os.path.join(OUT_DIR, "identify_port_risk_report.csv")
top_path = os.path.join(OUT_DIR, "identify_top20_ports.csv")

port_risk.to_csv(out_path, index=False)
top_ports.to_csv(top_path, index=False)

print("Saved:")
print(out_path)
print(top_path)


Saved:
.\outputs\identify_port_risk_report.csv
.\outputs\identify_top20_ports.csv


In [11]:
explain = """
IDENTIFY (NIST CSF) - Risk Scoring Summary

In this step, we treated each Destination Port as a monitored service (asset/service perspective).
We used Tuesday and Friday traffic (attack scenarios) to measure exposure and risk.
A port is considered higher risk when it shows:
- a higher percentage of attack-labelled flows, and/or
- a higher tail of extreme flow-level risk (95th percentile risk score).

The output is a ranked table of ports with a clear risk score and risk level
that can be used to prioritise monitoring and defensive controls.
""".strip()

print(explain)


IDENTIFY (NIST CSF) - Risk Scoring Summary

In this step, we treated each Destination Port as a monitored service (asset/service perspective).
We used Tuesday and Friday traffic (attack scenarios) to measure exposure and risk.
A port is considered higher risk when it shows:
- a higher percentage of attack-labelled flows, and/or
- a higher tail of extreme flow-level risk (95th percentile risk score).

The output is a ranked table of ports with a clear risk score and risk level
that can be used to prioritise monitoring and defensive controls.


In [None]:
# Checking Done
df[df["is_attack"] == 1][
    ["risk_bytes_rate", "risk_pkts_rate", "risk_flags", "risk_attack_label"]
].describe()


Unnamed: 0,risk_bytes_rate,risk_pkts_rate,risk_flags,risk_attack_label
count,141862.0,141862.0,141862.0,141862.0
mean,0.034062,0.017655,0.028112,1.0
std,0.114641,0.104589,0.165293,0.0
min,0.0,0.0,0.0,1.0
25%,4e-06,3e-06,0.0,1.0
50%,9.5e-05,3.3e-05,0.0,1.0
75%,0.010348,0.000141,0.0,1.0
max,1.0,1.0,1.0,1.0
