In [1]:
import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu, kruskal, levene, chi2_contingency, spearmanr

In [2]:
# Load cleaned dataset
df = pd.read_csv('../outputs/cleaned_data.csv', low_memory=False)

In [3]:
# 0) Save dataset signature and basic provenance info
import hashlib, json, os
from datetime import datetime

def dataset_signature(df):
    sig = {
        'rows': int(df.shape[0]),
        'cols': int(df.shape[1]),
        'columns': list(df.columns),
        'dtypes': {c: str(df[c].dtype) for c in df.columns},
        'null_counts': {c: int(df[c].isna().sum()) for c in df.columns},
        'sample_head': df.head(3).to_dict(orient='records'),
        'timestamp': datetime.utcnow().isoformat() + 'Z'
    }
    # lightweight content hash (of a small sample)
    sample = df.sample(min(1000, max(1, len(df))), random_state=42).to_csv(index=False).encode('utf-8')
    sig['sample_sha1'] = hashlib.sha1(sample).hexdigest()
    return sig

sig = dataset_signature(df)
os.makedirs('../outputs', exist_ok=True)
with open('../outputs/dataset_signature.json', 'w') as f:
    json.dump(sig, f, indent=2)
print("Dataset signature saved to ../outputs/dataset_signature.json")
print("rows:", sig['rows'], "cols:", sig['cols'], "sample_sha1:", sig['sample_sha1'])

Dataset signature saved to ../outputs/dataset_signature.json
rows: 2522362 cols: 80 sample_sha1: e0867d20a9f9d9703a3a0fdafe50462e9e8a171e


In [5]:
df

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Attack_Binary
0,54865,3.0,2,0,12,0,6,6,6.0,0.00000,...,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN,BENIGN
1,55054,109.0,1,1,6,6,6,6,6.0,0.00000,...,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN,BENIGN
2,55055,52.0,1,1,6,6,6,6,6.0,0.00000,...,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN,BENIGN
3,46236,34.0,1,1,6,6,6,6,6.0,0.00000,...,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN,BENIGN
4,54863,3.0,2,0,12,0,6,6,6.0,0.00000,...,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2522357,53,32215.0,4,2,112,152,28,28,28.0,0.00000,...,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN,BENIGN
2522358,53,324.0,2,2,84,362,42,42,42.0,0.00000,...,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN,BENIGN
2522359,58030,82.0,2,1,31,6,31,0,15.5,21.92031,...,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN,BENIGN
2522360,53,1048635.0,6,2,192,256,32,32,32.0,0.00000,...,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN,BENIGN


In [7]:
df['Label'].unique()

array(['BENIGN', 'DDOS', 'PORTSCAN', 'BOT', 'INFILTRATION',
       'WEB ATTACK � BRUTE FORCE', 'WEB ATTACK � XSS',
       'WEB ATTACK � SQL INJECTION', 'FTP-PATATOR', 'SSH-PATATOR',
       'DOS SLOWLORIS', 'DOS SLOWHTTPTEST', 'DOS HULK', 'DOS GOLDENEYE',
       'HEARTBLEED'], dtype=object)

In [None]:
# 1) Normalize column names (strip/collapse spaces)
df.columns = [(" ".join(str(c).split())).strip() for c in df.columns]

In [8]:
# 2) Ensure label and Attack_Binary exist
label_col = None
for c in df.columns:
    if c.lower().strip() in ('label','class','attack','attack_label'):
        label_col = c
        break
if label_col is None:
    # fallback: try to auto-detect a column with BENIGN in sample
    for c in df.columns:
        if df[c].dtype == object:
            s = df[c].dropna().astype(str).str.upper().head(200)
            if any('BENIGN' in v for v in s):
                label_col = c
                break
if label_col is None:
    raise RuntimeError("Label column not found. Edit notebook to set label_col manually.")
# normalize label values
df[label_col] = df[label_col].astype(str).str.strip().str.upper()
if 'Attack_Binary' not in df.columns:
    df['Attack_Binary'] = df[label_col].apply(lambda x: 'BENIGN' if 'BENIGN' in str(x).upper() else 'ATTACK')

In [16]:
df.columns
# a = list(df.columns)


Index(['Destination Port', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags',
       'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Min Packet Length', 'Max Packet Length', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
       'SYN Flag Co

In [9]:
# # QQ) Robust Chi-square test for Protocol vs Attack (H4)
# import numpy as np
# from scipy.stats import chi2_contingency
# 
# def safe_chi2(df, col1, col2, collapse_rare=True, min_count=10, collapse_label='OTHER'):
#     # build crosstab
#     cont = pd.crosstab(df[col1].fillna('UNKNOWN'), df[col2].fillna('UNKNOWN'))
#     # remove empty rows/cols
#     cont = cont.loc[(cont.sum(axis=1) > 0), (cont.sum(axis=0) > 0)]
#     reason = []
#     if cont.shape[0] < 2 or cont.shape[1] < 2:
#         reason.append('Not enough categories after dropping empty rows/cols')
#         return None, None, cont, reason
#     # collapse very rare rows if requested (to avoid tiny expected counts)
#     if collapse_rare:
#         row_totals = cont.sum(axis=1)
#         rare = row_totals[row_totals < min_count].index.tolist()
#         if rare:
#             reason.append(f'Found rare categories (rows) <{min_count}: {rare}. Collapsing into {collapse_label}.')
#             other_row = cont.loc[rare].sum(axis=0)
#             cont = cont.drop(index=rare)
#             cont.loc[collapse_label] = other_row
#     # re-check
#     cont = cont.loc[(cont.sum(axis=1) > 0), (cont.sum(axis=0) > 0)]
#     if cont.shape[0] < 2 or cont.shape[1] < 2:
#         reason.append('Insufficient categories after collapsing rare ones.')
#         return None, None, cont, reason
#     # compute expected
#     chi2, p, dof, expected = chi2_contingency(cont)
#     # check expected cell counts
#     min_expected = expected.min()
#     if min_expected < 5:
#         reason.append(f'Min expected cell < 5 ({min_expected:.3f}) — chi-square assumptions weak.')
#     return (chi2, p, dof, expected), cont, reason
# 
# res, cont_table, reasons = safe_chi2(df, 'Protocol', 'Attack_Binary', collapse_rare=True, min_count=50)
# print("Result:", res)
# print("Contingency table shape:", cont_table.shape)
# print("Reasons/notes:", reasons)
# if res is not None:
#     chi2, p, dof, expected = res
#     print(f"Chi2 stat={chi2:.4f} p={p:.6f} dof={dof} min_expected={expected.min():.3f}")


KeyError: 'Protocol'

In [17]:
# -----------------------------
# A: Detect protocol-like column or create a fallback from ports
# -----------------------------
import pandas as pd, numpy as np
from collections import Counter

# helper to test for protocol-like values inside a series
def looks_like_protocol_series(s, check_n=500):
    # sample values and check for TCP/UDP/ICMP keywords
    sample = s.dropna().astype(str).str.upper().head(check_n).tolist()
    kw = {'TCP','UDP','ICMP'}
    found = any(any(k in v for k in kw) for v in sample)
    unique_count = len(pd.unique(sample))
    return found, unique_count, sample[:10]

# 1: look for common names that might be 'Protocol'
candidates = []
for c in df.columns:
    if df[c].dtype == object or df[c].dtype == 'O':
        found, uniq, samp = looks_like_protocol_series(df[c])
        if found:
            candidates.append((c, uniq, samp))

# check numeric columns that might be ports
port_candidates = []
for c in df.columns:
    if 'PORT' in c.upper() or 'DPORT' in c.upper() or 'DEST' in c.upper():
        if pd.api.types.is_integer_dtype(df[c]) or pd.api.types.is_float_dtype(df[c]):
            port_candidates.append(c)

print("Protocol-like object columns found:", candidates)
print("Potential port columns found (numeric):", port_candidates)

# Decide protocol_col
protocol_col = None
if len(candidates) > 0:
    protocol_col = candidates[0][0]
    print(f"Using detected protocol-like column: {protocol_col}")
else:
    print("No protocol-like textual column found.")
    # fallback: if we have Destination/Source Port, use it to create protocol proxy
    # choose best port column available
    if len(port_candidates) > 0:
        port_col = port_candidates[0]
        print("Using numeric port column as proxy:", port_col)
        protocol_col = '__Protocol_from_Port__'
        # create a simple mapping of common ports -> protocol-like labels
        def port_to_proto(p):
            try:
                p = int(p)
            except:
                return 'UNKNOWN'
            # ICMP has no port; we can't detect it via ports. We'll just mark as TCP/UDP/OTHER by well-known ports.
            if p in (80, 443, 8080, 20, 21, 22, 23, 25, 110, 143):
                return 'TCP_HTTP_FTP_SSH'
            if p in (53,):  # DNS
                return 'UDP_DNS'
            if 0 < p < 1024:
                return 'WELL_KNOWN_PORT'
            if p >= 1024:
                return 'EPHEMERAL'
            return 'OTHER'
        df[protocol_col] = df[port_col].apply(port_to_proto)
        print(f"Created fallback column '{protocol_col}' from port '{port_col}'.")
    else:
        print("No port-like column found. Protocol-based chi-square cannot be executed automatically.")
        protocol_col = None

# summary
if protocol_col is not None:
    print("Protocol column chosen:", protocol_col)
    print("Protocol value sample:", pd.Series(df[protocol_col].dropna().astype(str)).unique()[:10])
else:
    print("Protocol analysis will be skipped; please add a protocol column or choose port column manually.")


  unique_count = len(pd.unique(sample))
  unique_count = len(pd.unique(sample))


Protocol-like object columns found: []
Potential port columns found (numeric): ['Destination Port']
No protocol-like textual column found.
Using numeric port column as proxy: Destination Port
Created fallback column '__Protocol_from_Port__' from port 'Destination Port'.
Protocol column chosen: __Protocol_from_Port__
Protocol value sample: ['EPHEMERAL' 'UDP_DNS' 'TCP_HTTP_FTP_SSH' 'WELL_KNOWN_PORT' 'OTHER']


In [None]:
# -----------------------------
# B: Robust Chi-square (H4) using protocol_col if available
# -----------------------------
from scipy.stats import chi2_contingency
import numpy as np

def safe_chi2_using_col(df, protocol_col, attack_col='Attack_Binary', min_count=50, collapse_label='OTHER'):
    if protocol_col is None:
        print("No protocol column available: skipping Chi-square H4.")
        return None
    if protocol_col not in df.columns:
        print(f"Protocol column {protocol_col} missing in df. Skipping H4.")
        return None

    cont = pd.crosstab(df[protocol_col].fillna('UNKNOWN').astype(str).str.upper(), 
                       df[attack_col].fillna('UNKNOWN').astype(str).str.upper())

    # drop empty rows/cols
    cont = cont.loc[(cont.sum(axis=1) > 0), (cont.sum(axis=0) > 0)]
    if cont.shape[0] < 2 or cont.shape[1] < 2:
        print("Contingency table too small after dropping empty rows/cols. Returning None.")
        return None

    # collapse rare rows into OTHER if needed
    row_totals = cont.sum(axis=1)
    rare = row_totals[row_totals < min_count].index.tolist()
    if rare:
        print("Rare protocol categories (below min_count) found:", rare)
        other_row = cont.loc[rare].sum(axis=0)
        cont = cont.drop(index=rare)
        cont.loc[collapse_label] = other_row

    # re-evaluate
    cont = cont.loc[(cont.sum(axis=1) > 0), (cont.sum(axis=0) > 0)]
    if cont.shape[0] < 2 or cont.shape[1] < 2:
        print("Insufficient categories after collapsing rare ones.")
        return None

    try:
        chi2, p, dof, expected = chi2_contingency(cont)
    except Exception as e:
        print("chi2_contingency threw exception:", str(e))
        return None

    min_expected = expected.min()
    notes = []
    if min_expected < 5:
        notes.append(f"Min expected cell count below 5: {min_expected:.3f} (chi-square assumption weak).")

    print("Chi-square result: chi2=", chi2, "p=", p, "dof=", dof)
    print("Contingency table shape:", cont.shape)
    if notes:
        print("Notes:", notes)

    # Save contingency and expected for inspection
    cont.to_csv('../outputs/H4_contingency_table.csv')
    pd.DataFrame(expected, index=cont.index, columns=cont.columns).to_csv('../outputs/H4_expected_table.csv')
    print("Saved contingency and expected tables to ../outputs/")
kz
    return dict(chi2=chi2, p=p, dof=dof, min_expected=float(min_expected), notes=notes)

h4_result = safe_chi2_using_col(df, protocol_col, attack_col='Attack_Binary', min_count=50)
print("H4 result summary:", h4_result)


Chi-square result: chi2= 360394.2049374206 p= 0.0 dof= 4
Contingency table shape: (5, 2)
Saved contingency and expected tables to ../outputs/
H4 result summary: {'chi2': np.float64(360394.2049374206), 'p': np.float64(0.0), 'dof': 4, 'min_expected': 264.5737709337518, 'notes': []}


In [20]:
# -----------------------------
# C: Run Spearman correlations (H5) for numeric features vs Attack_Binary
# -----------------------------
from scipy.stats import spearmanr

# ensure Attack_Binary normalized and numeric mapping exists
df['Attack_Binary'] = df['Attack_Binary'].astype(str).str.strip().str.upper()
if '_attack_num' not in df.columns:
    df['_attack_num'] = df['Attack_Binary'].map({'BENIGN':0, 'ATTACK':1})

# identify numeric features (exclude the attack num column)
numeric_features = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c]) and c not in ['_attack_num']]
print("Numeric features count:", len(numeric_features))

corr_results = {}
min_n = 50
for f in numeric_features:
    valid = df[[f, '_attack_num']].dropna()
    n = len(valid)
    if n >= min_n:
        r, p = spearmanr(valid[f], valid['_attack_num'])
        # spearmanr may return nan for constant arrays; handle gracefully
        if np.isnan(r):
            corr_results[f] = {'spearman_r': None, 'p_value': None, 'n': n, 'note': 'constant_or_nan'}
        else:
            corr_results[f] = {'spearman_r': float(r), 'p_value': float(p), 'n': n}
    else:
        corr_results[f] = {'spearman_r': None, 'p_value': None, 'n': n, 'note': 'n_too_small'}

# save to CSV for inspection
pd.DataFrame.from_dict(corr_results, orient='index').to_csv('../outputs/feature_attack_correlation_fixed.csv')
print("Spearman results saved to ../outputs/feature_attack_correlation_fixed.csv")

# quick top entries to display
df_corr = pd.DataFrame.from_dict(corr_results, orient='index')
top = df_corr.dropna(subset=['p_value']).sort_values('p_value').head(10)
print("Top 10 features by Spearman p-value:")
print(top)

Numeric features count: 78


  r, p = spearmanr(valid[f], valid['_attack_num'])
  r, p = spearmanr(valid[f], valid['_attack_num'])
  r, p = spearmanr(valid[f], valid['_attack_num'])
  r, p = spearmanr(valid[f], valid['_attack_num'])
  r, p = spearmanr(valid[f], valid['_attack_num'])
  r, p = spearmanr(valid[f], valid['_attack_num'])
  r, p = spearmanr(valid[f], valid['_attack_num'])
  r, p = spearmanr(valid[f], valid['_attack_num'])


Spearman results saved to ../outputs/feature_attack_correlation_fixed.csv
Top 10 features by Spearman p-value:
                             spearman_r  p_value        n note
Destination Port               0.038293      0.0  2522362  NaN
Flow Duration                  0.211097      0.0  2522362  NaN
Total Fwd Packets              0.159270      0.0  2522362  NaN
Total Backward Packets         0.089112      0.0  2522362  NaN
Total Length of Fwd Packets   -0.080580      0.0  2522362  NaN
Total Length of Bwd Packets    0.182795      0.0  2522362  NaN
Fwd Packet Length Max         -0.084631      0.0  2522362  NaN
Fwd Packet Length Min         -0.315510      0.0  2522362  NaN
Fwd Packet Length Mean        -0.115121      0.0  2522362  NaN
Fwd Packet Length Std          0.176839      0.0  2522362  NaN


In [21]:
# --- Validation + human-friendly p-value reporting for H4 & H5 ---
import numpy as np, pandas as pd
from scipy.stats import chi2_contingency, spearmanr, norm
import math, json, os

os.makedirs('../outputs', exist_ok=True)

# 0) Basic checks
print("Rows, cols:", df.shape)
print("Attack_Binary unique values:", pd.Series(df['Attack_Binary']).unique())
print("Counts Attack_Binary:")
print(df['Attack_Binary'].value_counts(dropna=False))

# 1) H4: contingency and expected (if protocol_col exists)
protocol_col = 'Protocol' if 'Protocol' in df.columns else ('__Protocol_from_Port__' if '__Protocol_from_Port__' in df.columns else None)
print("\nProtocol column used:", protocol_col)
if protocol_col is not None:
    cont = pd.crosstab(df[protocol_col].fillna('UNKNOWN').astype(str).str.upper(), df['Attack_Binary'].fillna('UNKNOWN').astype(str).str.upper())
    cont = cont.loc[(cont.sum(axis=1) > 0), (cont.sum(axis=0) > 0)]
    print("Contingency table (top):")
    print(cont)
    try:
        chi2, p_chi, dof, expected = chi2_contingency(cont)
        print("\nChi-square: stat=", chi2, "p(raw)=", p_chi, "dof=", dof)
        print("min expected cell:", expected.min())
        # Save for viva
        cont.to_csv('../outputs/H4_contingency_table.csv')
        pd.DataFrame(expected, index=cont.index, columns=cont.columns).to_csv('../outputs/H4_expected_table.csv')
        print("Saved H4 contingency & expected to ../outputs/")
        # If p_chi == 0.0, give numeric-stable log p estimate from chi2 tail using survival function
        if p_chi == 0.0:
            # compute log p via survival function using chi2 CDF with logsf if available
            try:
                from scipy.stats import chi2
                logp = chi2.logsf(chi2, df=dof)  # logsf may underflow; but try
                if math.isfinite(logp):
                    print("chi2.logsf (log p) approx:", logp, " => p ~ 10^{%.2f}" % (logp / math.log(10)))
                else:
                    print("chi2.logsf underflowed/inf; p is extremely small (< ~1e-300).")
            except Exception as e:
                print("Could not compute chi2.logsf (exception):", e)
    except Exception as e:
        print("chi2_contingency exception:", e)
else:
    print("No protocol column available; skipped H4.")

# 2) H5: Spearman for numeric features vs Attack_Binary (human-friendly p)
print("\nRunning Spearman diagnostics (numeric features). This may take a moment.")
if '_attack_num' not in df.columns:
    df['_attack_num'] = df['Attack_Binary'].map({'BENIGN':0, 'ATTACK':1})

numeric_features = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c]) and c not in ['_attack_num']]
print("Numeric feature count:", len(numeric_features))

def spearman_with_logp(x, y):
    # compute spearman r,p from scipy
    r, p = spearmanr(x, y)
    # robust approx: convert r to z (approx) and get log p via normal tail for large n
    n = len(x)
    if np.isnan(r):
        return r, p, None, None  # r,p may be nan
    # For large n, t ~ r * sqrt((n-2)/(1-r^2)), approximate two-sided p using normal on z = r*sqrt(n-1)
    try:
        z = r * math.sqrt(max(1, n-1))
        # two-sided p via normal survival: p = 2 * norm.sf(|z|)
        # use logsf to avoid underflow
        log_p_one_side = norm.logsf(abs(z))
        log_p_two_side = math.log(2.0) + log_p_one_side
        # convert log p to decimal exponent
        if math.isfinite(log_p_two_side):
            p_approx = math.exp(log_p_two_side)
        else:
            p_approx = 0.0
        return r, p, log_p_two_side, p_approx
    except Exception as e:
        return r, p, None, None

rows = []
for f in numeric_features:
    valid = df[[f, '_attack_num']].dropna()
    n = len(valid)
    if n < 10:
        rows.append((f, None, None, n, 'n_too_small', None))
        continue
    r, p_raw, logp, p_approx = spearman_with_logp(valid[f], valid['_attack_num'])
    # Prepare human-friendly p display
    if p_raw == 0.0 or (p_approx == 0.0):
        p_display = "<1e-300"
    elif p_approx is not None:
        p_display = "{:.2e}".format(p_approx)
    else:
        p_display = "{:.2e}".format(p_raw) if p_raw is not None and not math.isnan(p_raw) else "nan"
    rows.append((f, r, p_display, n, 'ok' if not math.isnan(r) else 'constant_or_nan', logp))

# save a summary
df_rows = pd.DataFrame(rows, columns=['feature','spearman_r','p_display','n','status','logp'])
df_rows.to_csv('../outputs/feature_attack_correlation_checked.csv', index=False)
print("Saved friendly Spearman output to ../outputs/feature_attack_correlation_checked.csv")
print(df_rows.head(15))
print("\nTop small-correlations note: r indicates effect size; p (display) may be <1e-300 due to huge n.")


Rows, cols: (2522362, 82)
Attack_Binary unique values: ['BENIGN' 'ATTACK']
Counts Attack_Binary:
Attack_Binary
BENIGN    2096484
ATTACK     425878
Name: count, dtype: int64

Protocol column used: __Protocol_from_Port__
Contingency table (top):
Attack_Binary           ATTACK  BENIGN
__Protocol_from_Port__                
EPHEMERAL                76526  468419
OTHER                        6    1561
TCP_HTTP_FTP_SSH        335940  700240
UDP_DNS                    101  876783
WELL_KNOWN_PORT          13305   49481

Chi-square: stat= 360394.2049374206 p(raw)= 0.0 dof= 4
min expected cell: 264.5737709337518
Saved H4 contingency & expected to ../outputs/
Could not compute chi2.logsf (exception): unsupported operand type(s) for -: 'chi2_gen' and 'int'

Running Spearman diagnostics (numeric features). This may take a moment.
Numeric feature count: 78


  r, p = spearmanr(x, y)
  r, p = spearmanr(x, y)
  r, p = spearmanr(x, y)
  r, p = spearmanr(x, y)
  r, p = spearmanr(x, y)
  r, p = spearmanr(x, y)
  r, p = spearmanr(x, y)
  r, p = spearmanr(x, y)


Saved friendly Spearman output to ../outputs/feature_attack_correlation_checked.csv
                        feature  spearman_r p_display        n status  \
0              Destination Port    0.038293   <1e-300  2522362     ok   
1                 Flow Duration    0.211097   <1e-300  2522362     ok   
2             Total Fwd Packets    0.159270   <1e-300  2522362     ok   
3        Total Backward Packets    0.089112   <1e-300  2522362     ok   
4   Total Length of Fwd Packets   -0.080580   <1e-300  2522362     ok   
5   Total Length of Bwd Packets    0.182795   <1e-300  2522362     ok   
6         Fwd Packet Length Max   -0.084631   <1e-300  2522362     ok   
7         Fwd Packet Length Min   -0.315510   <1e-300  2522362     ok   
8        Fwd Packet Length Mean   -0.115121   <1e-300  2522362     ok   
9         Fwd Packet Length Std    0.176839   <1e-300  2522362     ok   
10        Bwd Packet Length Max    0.214730   <1e-300  2522362     ok   
11        Bwd Packet Length Min   -0.295

In [19]:
# 3) Coerce needed numeric columns (edit the list if your dataset uses different names)
features = ['Flow Duration', 'Total Fwd Packets', 'Total Backward Packets',
            'Flow Bytes/s', 'Flow Packets/s']
for f in features:
    if f in df.columns:
        df[f] = pd.to_numeric(df[f], errors='coerce')

In [5]:
# 4) Helper: safe tests that skip when missing / insufficient data
def safe_mannwhitney(col):
    if col not in df.columns:
        return None, None, 0, 0
    a = df[df['Attack_Binary']=='BENIGN'][col].dropna().astype(float)
    b = df[df['Attack_Binary']=='ATTACK'][col].dropna().astype(float)
    if len(a) < 10 or len(b) < 10:
        return None, None, len(a), len(b)
    stat, p = mannwhitneyu(a, b, alternative='two-sided')
    return stat, p, len(a), len(b)

def safe_kruskal(col):
    if col not in df.columns or label_col not in df.columns:
        return None, None
    groups = [g[col].dropna().astype(float) for _, g in df.groupby(label_col) if len(g[col].dropna()) >= 30]
    if len(groups) < 2:
        return None, None
    stat, p = kruskal(*groups)
    return stat, p

# 5) Run Hypotheses
results = []

In [6]:
# H1: Flow Duration (Benign vs Attack)
stat, p, na, nb = safe_mannwhitney('Flow Duration')
results.append(['H1', 'Flow Duration difference', 'Mann-Whitney U', 'Flow Duration', stat, p, na, nb])

In [7]:
# H2: Packets per flow differ between attack types (Kruskal-Wallis)
stat2, p2 = safe_kruskal('Total Fwd Packets')
results.append(['H2', 'Packets per flow by attack type', 'Kruskal-Wallis', 'Total Fwd Packets', stat2, p2, None, None])

In [8]:
# H3: Bytes per second variance differs (Levene)
if 'Flow Bytes/s' in df.columns:
    a = df[df['Attack_Binary']=='BENIGN']['Flow Bytes/s'].dropna().astype(float)
    b = df[df['Attack_Binary']=='ATTACK']['Flow Bytes/s'].dropna().astype(float)
    if len(a) >= 10 and len(b) >= 10:
        stat3, p3 = levene(a, b)
        results.append(['H3', 'Variance in Bytes/s', 'Levene', 'Flow Bytes/s', stat3, p3, len(a), len(b)])
    else:
        results.append(['H3', 'Variance in Bytes/s', 'Levene', 'Flow Bytes/s', None, None, len(a), len(b)])
else:
    results.append(['H3', 'Variance in Bytes/s', 'Levene', 'Flow Bytes/s', None, None, 0, 0])

In [None]:
# H4: Protocol associated with attack likelihood (Chi-square)
# if 'Protocol' in df.columns:
#     try:
#         cont = pd.crosstab(df['Protocol'].fillna('UNKNOWN'), df['Attack_Binary'])
#         stat4, p4, dof, expected = chi2_contingency(cont)
#         results.append(['H4', 'Protocol vs Attack Relation', 'Chi-Square', 'Protocol', stat4, p4, None, None])
#     except Exception as e:
#         results.append(['H4', 'Protocol vs Attack Relation', 'Chi-Square', 'Protocol', None, str(e), None, None])
# else:
#     results.append(['H4', 'Protocol vs Attack Relation', 'Chi-Square', 'Protocol', None, None, None, None])
if '__Protocol_from_Port__' in df.columns:
    try:
        cont = pd.crosstab(df['Protocol'].fillna('UNKNOWN'), df['Attack_Binary'])
        stat4, p4, dof, expected = chi2_contingency(cont)
        results.append(['H4', 'Protocol vs Attack Relation', 'Chi-Square', '__Protocol_from_Port__', stat4, p4, None, None])
    except Exception as e:
        results.append(['H4', 'Protocol vs Attack Relation', 'Chi-Square', '__Protocol_from_Port__', None, str(e), None, None])
else:
    results.append(['H4', 'Protocol vs Attack Relation', 'Chi-Square', '__Protocol_from_Port__', None, None, None, None])

In [10]:
# H5: Spearman correlation between features and Attack presence
# ensure numeric attack column
df['_attack_num'] = df['Attack_Binary'].map({'BENIGN':0, 'ATTACK':1})
corr_series = {}
for f in features:
    if f in df.columns:
        valid = df[[f, '_attack_num']].dropna()
        if len(valid) >= 50:
            corr, p_corr = spearmanr(valid[f], valid['_attack_num'])
            corr_series[f] = (float(corr), float(p_corr), len(valid))
        else:
            corr_series[f] = (None, None, len(valid))
    else:
        corr_series[f] = (None, None, 0)

In [11]:
# 6) Save and report
res_df = pd.DataFrame(results, columns=['Hypothesis','Description','Test','Column','Statistic','p-value','nA','nB'])
# Inference column: only if p-value numeric
def infer(p):
    try:
        return 'Significant' if float(p) < 0.05 else 'Not Significant'
    except:
        return 'Not Tested/Insufficient Data'
res_df['Inference'] = res_df['p-value'].apply(infer)

res_df.to_csv('../outputs/test_results.csv', index=False)
pd.DataFrame.from_dict(corr_series, orient='index', columns=['Spearman_corr','p-value','n']).to_csv('../outputs/feature_attack_correlation.csv')

print("Hypothesis tests saved to ../outputs/test_results.csv")
print(res_df)
print("\nSpearman correlations saved to ../outputs/feature_attack_correlation.csv")


Hypothesis tests saved to ../outputs/test_results.csv
  Hypothesis                      Description            Test  \
0         H1         Flow Duration difference  Mann-Whitney U   
1         H2  Packets per flow by attack type  Kruskal-Wallis   
2         H3              Variance in Bytes/s          Levene   
3         H4      Protocol vs Attack Relation      Chi-Square   

              Column     Statistic        p-value         nA        nB  \
0      Flow Duration  3.011834e+11   0.000000e+00  2096484.0  425878.0   
1  Total Fwd Packets  3.968631e+05   0.000000e+00        NaN       NaN   
2       Flow Bytes/s  1.235430e+03  1.416549e-270  2096484.0  425878.0   
3           Protocol           NaN            NaN        NaN       NaN   

         Inference  
0      Significant  
1      Significant  
2      Significant  
3  Not Significant  

Spearman correlations saved to ../outputs/feature_attack_correlation.csv
