In [1]:
import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu, kruskal, levene, chi2_contingency, spearmanr

In [2]:
# Load cleaned dataset
df = pd.read_csv('../outputs/cleaned_data.csv', low_memory=False)

# 1) Normalize column names (strip/collapse spaces)
df.columns = [(" ".join(str(c).split())).strip() for c in df.columns]

In [3]:
# 2) Ensure label and Attack_Binary exist
label_col = None
for c in df.columns:
    if c.lower().strip() in ('label','class','attack','attack_label'):
        label_col = c
        break
if label_col is None:
    # fallback: try to auto-detect a column with BENIGN in sample
    for c in df.columns:
        if df[c].dtype == object:
            s = df[c].dropna().astype(str).str.upper().head(200)
            if any('BENIGN' in v for v in s):
                label_col = c
                break
if label_col is None:
    raise RuntimeError("Label column not found. Edit notebook to set label_col manually.")
# normalize label values
df[label_col] = df[label_col].astype(str).str.strip().str.upper()
if 'Attack_Binary' not in df.columns:
    df['Attack_Binary'] = df[label_col].apply(lambda x: 'BENIGN' if 'BENIGN' in str(x).upper() else 'ATTACK')

In [4]:
# 3) Coerce needed numeric columns (edit the list if your dataset uses different names)
features = ['Flow Duration', 'Total Fwd Packets', 'Total Backward Packets',
            'Flow Bytes/s', 'Flow Packets/s']
for f in features:
    if f in df.columns:
        df[f] = pd.to_numeric(df[f], errors='coerce')

In [5]:
# 4) Helper: safe tests that skip when missing / insufficient data
def safe_mannwhitney(col):
    if col not in df.columns:
        return None, None, 0, 0
    a = df[df['Attack_Binary']=='BENIGN'][col].dropna().astype(float)
    b = df[df['Attack_Binary']=='ATTACK'][col].dropna().astype(float)
    if len(a) < 10 or len(b) < 10:
        return None, None, len(a), len(b)
    stat, p = mannwhitneyu(a, b, alternative='two-sided')
    return stat, p, len(a), len(b)

def safe_kruskal(col):
    if col not in df.columns or label_col not in df.columns:
        return None, None
    groups = [g[col].dropna().astype(float) for _, g in df.groupby(label_col) if len(g[col].dropna()) >= 30]
    if len(groups) < 2:
        return None, None
    stat, p = kruskal(*groups)
    return stat, p

# 5) Run Hypotheses
results = []

In [6]:
# H1: Flow Duration (Benign vs Attack)
stat, p, na, nb = safe_mannwhitney('Flow Duration')
results.append(['H1', 'Flow Duration difference', 'Mann-Whitney U', 'Flow Duration', stat, p, na, nb])

In [7]:
# H2: Packets per flow differ between attack types (Kruskal-Wallis)
stat2, p2 = safe_kruskal('Total Fwd Packets')
results.append(['H2', 'Packets per flow by attack type', 'Kruskal-Wallis', 'Total Fwd Packets', stat2, p2, None, None])

In [8]:
# H3: Bytes per second variance differs (Levene)
if 'Flow Bytes/s' in df.columns:
    a = df[df['Attack_Binary']=='BENIGN']['Flow Bytes/s'].dropna().astype(float)
    b = df[df['Attack_Binary']=='ATTACK']['Flow Bytes/s'].dropna().astype(float)
    if len(a) >= 10 and len(b) >= 10:
        stat3, p3 = levene(a, b)
        results.append(['H3', 'Variance in Bytes/s', 'Levene', 'Flow Bytes/s', stat3, p3, len(a), len(b)])
    else:
        results.append(['H3', 'Variance in Bytes/s', 'Levene', 'Flow Bytes/s', None, None, len(a), len(b)])
else:
    results.append(['H3', 'Variance in Bytes/s', 'Levene', 'Flow Bytes/s', None, None, 0, 0])

In [9]:
# H4: Protocol associated with attack likelihood (Chi-square)
if 'Protocol' in df.columns:
    try:
        cont = pd.crosstab(df['Protocol'].fillna('UNKNOWN'), df['Attack_Binary'])
        stat4, p4, dof, expected = chi2_contingency(cont)
        results.append(['H4', 'Protocol vs Attack Relation', 'Chi-Square', 'Protocol', stat4, p4, None, None])
    except Exception as e:
        results.append(['H4', 'Protocol vs Attack Relation', 'Chi-Square', 'Protocol', None, str(e), None, None])
else:
    results.append(['H4', 'Protocol vs Attack Relation', 'Chi-Square', 'Protocol', None, None, None, None])

In [10]:
# H5: Spearman correlation between features and Attack presence
# ensure numeric attack column
df['_attack_num'] = df['Attack_Binary'].map({'BENIGN':0, 'ATTACK':1})
corr_series = {}
for f in features:
    if f in df.columns:
        valid = df[[f, '_attack_num']].dropna()
        if len(valid) >= 50:
            corr, p_corr = spearmanr(valid[f], valid['_attack_num'])
            corr_series[f] = (float(corr), float(p_corr), len(valid))
        else:
            corr_series[f] = (None, None, len(valid))
    else:
        corr_series[f] = (None, None, 0)

In [11]:
# 6) Save and report
res_df = pd.DataFrame(results, columns=['Hypothesis','Description','Test','Column','Statistic','p-value','nA','nB'])
# Inference column: only if p-value numeric
def infer(p):
    try:
        return 'Significant' if float(p) < 0.05 else 'Not Significant'
    except:
        return 'Not Tested/Insufficient Data'
res_df['Inference'] = res_df['p-value'].apply(infer)

res_df.to_csv('../outputs/test_results.csv', index=False)
pd.DataFrame.from_dict(corr_series, orient='index', columns=['Spearman_corr','p-value','n']).to_csv('../outputs/feature_attack_correlation.csv')

print("Hypothesis tests saved to ../outputs/test_results.csv")
print(res_df)
print("\nSpearman correlations saved to ../outputs/feature_attack_correlation.csv")


Hypothesis tests saved to ../outputs/test_results.csv
  Hypothesis                      Description            Test  \
0         H1         Flow Duration difference  Mann-Whitney U   
1         H2  Packets per flow by attack type  Kruskal-Wallis   
2         H3              Variance in Bytes/s          Levene   
3         H4      Protocol vs Attack Relation      Chi-Square   

              Column     Statistic        p-value         nA        nB  \
0      Flow Duration  3.011834e+11   0.000000e+00  2096484.0  425878.0   
1  Total Fwd Packets  3.968631e+05   0.000000e+00        NaN       NaN   
2       Flow Bytes/s  1.235430e+03  1.416549e-270  2096484.0  425878.0   
3           Protocol           NaN            NaN        NaN       NaN   

         Inference  
0      Significant  
1      Significant  
2      Significant  
3  Not Significant  

Spearman correlations saved to ../outputs/feature_attack_correlation.csv
