In [1]:
#!pip install torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv -f https://data.pyg.org/whl/torch-$(python -c "import torch; print(torch.__version__)").html

Looking in links: https://data.pyg.org/whl/torch-2.5.1+cu124.html
Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.5.0%2Bcu124/torch_scatter-2.1.2%2Bpt25cu124-cp311-cp311-linux_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.5.0%2Bcu124/torch_sparse-0.6.18%2Bpt25cu124-cp311-cp311-linux_x86_64.whl (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-cluster
  Downloading https://data.pyg.org/whl/torch-2.5.0%2Bcu124/torch_cluster-1.6.3%2Bpt25cu124-cp311-cp311-linux_x86_64.whl (3.4 M

In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
import random
import os

from torch_geometric.data import Data
from torch_geometric.nn import GCNConv

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

from sklearn.neighbors import kneighbors_graph


In [4]:
DEFAULT_RANDOM_SEED = 2021

def seedBasic(seed=DEFAULT_RANDOM_SEED):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

def seedEverything(seed=DEFAULT_RANDOM_SEED):
    seedBasic(seed)

seedEverything(2021)

In [5]:
url = "https://raw.githubusercontent.com/akashvenus/Final_Project/refs/heads/main/sets3/train.csv"
df = pd.read_csv(url)
df

Unnamed: 0.1,Unnamed: 0,destination port,flow duration,total fwd packets,total backward packets,total length of fwd packets,total length of bwd packets,fwd packet length max,fwd packet length min,fwd packet length mean,...,min_seg_size_forward,active mean,active std,active max,active min,idle mean,idle std,idle max,idle min,label
0,127724,53,23855,1,1,42,114,42,42,42.000000,...,20,0.0,0.0000,0,0,0.0,0.00000,0,0,BENIGN
1,74669,53108,115,1,1,0,0,0,0,0.000000,...,20,0.0,0.0000,0,0,0.0,0.00000,0,0,BENIGN
2,52277,53,30920,1,1,54,82,54,54,54.000000,...,32,0.0,0.0000,0,0,0.0,0.00000,0,0,BENIGN
3,33733,443,116886492,17,17,946,5030,292,0,55.647059,...,20,392253.5,369954.7324,653851,130656,58000000.0,59459.90212,58000000,57900000,BENIGN
4,66283,21,8806152,9,15,109,188,31,0,12.111111,...,32,0.0,0.0000,0,0,0.0,0.00000,0,0,FTP-Patator
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88467,35942,443,5755364,7,4,599,156,517,0,85.571429,...,32,126402.0,0.0000,126402,126402,5628946.0,0.00000,5628946,5628946,BENIGN
88468,23948,53,47489,2,2,64,532,32,32,32.000000,...,32,0.0,0.0000,0,0,0.0,0.00000,0,0,BENIGN
88469,21149,443,3836920,6,7,1076,394,517,0,179.333333,...,20,0.0,0.0000,0,0,0.0,0.00000,0,0,BENIGN
88470,90942,80,5473398,3,1,0,0,0,0,0.000000,...,32,0.0,0.0000,0,0,0.0,0.00000,0,0,Web Attack � Brute Force


In [6]:
df['label'] = df['label'].apply(lambda x: 0 if x == 'BENIGN' else 1)

In [7]:
feature_cols = df.columns.drop('label')
X = df[feature_cols].values
y = df['label'].values
num_nodes = len(df)

In [8]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [9]:
train_idx_np, test_idx_np = train_test_split(
    np.arange(num_nodes),
    test_size=0.2,
    stratify=y,
    random_state=DEFAULT_RANDOM_SEED
)

In [10]:
train_mask = torch.zeros(num_nodes, dtype=torch.bool)
train_mask[train_idx_np] = True

In [11]:
test_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask[test_idx_np] = True

In [12]:
k = 5
knn_graph = kneighbors_graph(X_scaled, n_neighbors=k, include_self=False)

row_idx, col_idx = knn_graph.nonzero()
edge_index = torch.tensor([row_idx, col_idx], dtype=torch.long)

# Convert to PyG data
x = torch.tensor(X_scaled, dtype=torch.float)
y_torch = torch.tensor(y, dtype=torch.long)
data = Data(x=x, edge_index=edge_index, y=y_torch)
print(data)

Data(x=[88472, 79], edge_index=[2, 442360], y=[88472])


  edge_index = torch.tensor([row_idx, col_idx], dtype=torch.long)


In [13]:
class GraphNet(nn.Module):
    def __init__(self, in_channels, gcn_hidden, dnn_hidden, out_channels):
        super().__init__()
        # -- GNN layers --
        self.gcn1 = GCNConv(in_channels, gcn_hidden)
        self.gcn2 = GCNConv(gcn_hidden, gcn_hidden)

        # -- DNN layers --
        self.lin1 = nn.Linear(gcn_hidden, dnn_hidden)
        self.lin2 = nn.Linear(dnn_hidden, out_channels)

    def forward(self, x, edge_index):
        # ----- GNN stage -----
        x = self.gcn1(x, edge_index)   # shape: [num_nodes, gcn_hidden]
        x = F.relu(x)
        x = self.gcn2(x, edge_index)   # shape: [num_nodes, gcn_hidden]
        x = F.relu(x)

        # ----- DNN stage -----
        x = self.lin1(x)              # shape: [num_nodes, dnn_hidden]
        x = F.relu(x)
        x = self.lin2(x)              # shape: [num_nodes, out_channels]

        return F.log_softmax(x, dim=1) # for multi-class or binary classification

model = GraphNet(
    in_channels=x.shape[1],  # number of features
    gcn_hidden=128,           # size of GCN hidden embeddings
    dnn_hidden=64,           # size of hidden layer in the MLP
    out_channels=2           # final classes (0=Benign, 1=Attack)
)
optimizer = optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.NLLLoss()

In [14]:
epochs = 80
for epoch in range(1, epochs+1):
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)  # shape: [num_nodes, 2]

    # Compute loss on train_mask only
    loss = loss_fn(out[train_mask], data.y[train_mask])
    loss.backward()
    optimizer.step()

    # Evaluate on test set
    if epoch % 10 == 0 or epoch == epochs:
        model.eval()
        with torch.no_grad():
            out_test = out[test_mask]
            pred_test = out_test.argmax(dim=1)
            y_true_test = data.y[test_mask]

            acc = accuracy_score(y_true_test.cpu(), pred_test.cpu())
            print(f"Epoch {epoch}/{epochs} | Loss: {loss.item():.4f} | Test Accuracy: {acc:.4f}")


Epoch 10/80 | Loss: 0.1795 | Test Accuracy: 0.9277
Epoch 20/80 | Loss: 0.1163 | Test Accuracy: 0.9538
Epoch 30/80 | Loss: 0.0948 | Test Accuracy: 0.9595
Epoch 40/80 | Loss: 0.0909 | Test Accuracy: 0.9560
Epoch 50/80 | Loss: 0.0784 | Test Accuracy: 0.9665
Epoch 60/80 | Loss: 0.0713 | Test Accuracy: 0.9706
Epoch 70/80 | Loss: 0.0725 | Test Accuracy: 0.9707
Epoch 80/80 | Loss: 0.0668 | Test Accuracy: 0.9726


In [15]:
model.eval()
with torch.no_grad():
    out_all = model(data.x, data.edge_index)
    pred_all = out_all.argmax(dim=1)

    # Evaluate on test_mask
    y_true_test = data.y[test_mask]
    y_pred_test = pred_all[test_mask]

    # Basic metrics
    acc_test  = accuracy_score(y_true_test.cpu(), y_pred_test.cpu())
    prec_test = precision_score(y_true_test.cpu(), y_pred_test.cpu(), average='binary')
    rec_test  = recall_score(y_true_test.cpu(), y_pred_test.cpu(), average='binary')
    f1_test   = f1_score(y_true_test.cpu(), y_pred_test.cpu(), average='binary')

    tn, fp, fn, tp = confusion_matrix(y_true_test.cpu(), y_pred_test.cpu()).ravel()
    fpr_test = fp / (fp + tn)

    print("\nFinal Test Metrics (Node-Level Classification):")
    print("-----------------------------------------------")
    print(f"Accuracy:    {acc_test:.4f}")
    print(f"Precision:   {prec_test:.4f}")
    print(f"Recall:      {rec_test:.4f}")
    print(f"F1-Score:    {f1_test:.4f}")
    print(f"False Pos Rate (FPR): {fpr_test:.4f}")
    print(classification_report(y_true_test.cpu(), y_pred_test.cpu(), target_names=['Benign (0)', 'Attack (1)']))


Final Test Metrics (Node-Level Classification):
-----------------------------------------------
Accuracy:    0.9731
Precision:   0.9457
Recall:      0.9221
F1-Score:    0.9338
False Pos Rate (FPR): 0.0137
              precision    recall  f1-score   support

  Benign (0)       0.98      0.99      0.98     23185
  Attack (1)       0.95      0.92      0.93      6011

    accuracy                           0.97     29196
   macro avg       0.96      0.95      0.96     29196
weighted avg       0.97      0.97      0.97     29196

