In [162]:
import gym
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import os

In [181]:
DEFAULT_RANDOM_SEED = 2021

def seedBasic(seed=DEFAULT_RANDOM_SEED):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

def seedEverything(seed=DEFAULT_RANDOM_SEED):
    seedBasic(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seedEverything(2021)

In [182]:
# ---------------------------
# Custom Environment
# ---------------------------
class TabularClassificationEnv(gym.Env):
    def __init__(self, df):
        super(TabularClassificationEnv, self).__init__()
        self.df = df.reset_index(drop=True)
        self.current_index = 0

        # Action space: two actions (0 and 1)
        self.action_space = gym.spaces.Discrete(2)
        # Observation space: features (assumes label is dropped)
        num_features = df.shape[1] - 1
        self.observation_space = gym.spaces.Box(
            low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)

    def reset(self):
        self.current_index = 0
        return self._get_observation()

    def step(self, action):
        if self.current_index >= len(self.df):
            return self._get_observation(), 0.0, True, {}

        # Retrieve the true label; assumes column name 'label'
        true_label = self.df.iloc[self.current_index]['label']

        # Modified reward structure:
        # If the sample is positive (label == 1), reward correctly.
        # Penalize misclassification more severely.
        if action == true_label:
            reward = 2.0 if true_label == 0 else 1.0
        else:
            reward = -3.0 if true_label == 1 else -1.0

        self.current_index += 1
        done = self.current_index >= len(self.df)
        if done:
            return self._get_observation(), reward, done, {}
        else:
            return self._get_observation(), reward, done, {}

    def _get_observation(self):
        if self.current_index < len(self.df):
            row = self.df.iloc[self.current_index]
            if 'label' in self.df.columns:
                obs = row.drop('label').values.astype(np.float32)
            else:
                obs = row.values[:-1].astype(np.float32)
            return obs
        else:
            return np.zeros(self.observation_space.shape, dtype=np.float32)

In [183]:
# ---------------------------
# Q-Network Definition
# ---------------------------
class QNetwork(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_sizes=(32, 32)):
        super(QNetwork, self).__init__()
        layers = []
        in_dim = input_dim
        for h in hidden_sizes:
            layers.append(nn.Linear(in_dim, h))
            layers.append(nn.ReLU())
            in_dim = h
        layers.append(nn.Linear(in_dim, output_dim))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)

In [184]:
# ---------------------------
# DQN Agent
# ---------------------------
class DQNAgent:
    def __init__(self, obs_dim, action_dim, hidden_sizes=(32, 32), lr=1e-3,
                 gamma=0.99, epsilon=0.1, device='cpu'):
        self.q_net = QNetwork(obs_dim, action_dim, hidden_sizes).to(device)
        self.optimizer = optim.Adam(self.q_net.parameters(), lr=lr)
        self.gamma = gamma
        self.epsilon = epsilon
        self.action_dim = action_dim
        self.device = device

    def select_action(self, state):
        # Epsilon-greedy action selection
        if random.random() < self.epsilon:
            return random.randint(0, self.action_dim - 1)
        state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        with torch.no_grad():
            q_values = self.q_net(state_tensor)
        return q_values.argmax().item()

    def train_step(self, batch):
        # Unpack a batch of transitions
        states, actions, rewards, next_states, dones = zip(*batch)
        states = torch.FloatTensor(np.array(states)).to(self.device)
        actions = torch.LongTensor(actions).unsqueeze(1).to(self.device)
        rewards = torch.FloatTensor(rewards).unsqueeze(1).to(self.device)
        next_states = torch.FloatTensor(np.array(next_states)).to(self.device)
        dones = torch.FloatTensor(dones).unsqueeze(1).to(self.device)

        # Compute current Q-values for taken actions
        q_values = self.q_net(states).gather(1, actions)
        # Compute the next state's maximum Q-value
        next_q_values = self.q_net(next_states).max(1)[0].unsqueeze(1)
        # Compute TD target
        target = rewards + self.gamma * next_q_values * (1 - dones)
        loss = F.mse_loss(q_values, target)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

In [185]:
# ---------------------------
# Simple Replay Buffer
# ---------------------------
class ReplayBuffer:
    def __init__(self):
        self.buffer = []

    def add(self, transition):
        self.buffer.append(transition)

    def clear(self):
        self.buffer = []

    def sample_all(self):
        return self.buffer.copy()

    def __len__(self):
        return len(self.buffer)

In [186]:
# ---------------------------
# Training Function
# ---------------------------
def train_agent(agent, env, replay_buffer, num_iterations=1000, batch_size=32, verbose=True):
    state = env.reset()
    print("Training...")
    for i in range(num_iterations):
        action = agent.select_action(state)
        next_state, reward, done, _ = env.step(action)
        replay_buffer.add((state, action, reward, next_state, float(done)))
        state = next_state
        if done:
            state = env.reset()

        # Train only if we have enough samples in the buffer
        if len(replay_buffer) >= batch_size:
            # Sample a mini-batch randomly from the replay buffer
            batch = random.sample(replay_buffer.buffer, batch_size)
            loss = agent.train_step(batch)
            if i % 100 == 0 and verbose:
                  print(f"Iteration {i}, Loss: {loss}")


In [191]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

def test_agent(agent, env):
    # Set epsilon to 0 for deterministic behavior during testing
    original_epsilon = agent.epsilon
    agent.epsilon = 0.0

    state = env.reset()
    total_reward = 0
    correct_predictions = 0
    total_predictions = 0
    y_true = []
    y_pred = []
    done = False

    while not done:
        action = agent.select_action(state)
        next_state, reward, done, _ = env.step(action)
        total_reward += reward

        # After calling step, the environment index is incremented.
        # This allows us to get the true label corresponding to the state from which the action was taken.
        idx = env.current_index - 1 if env.current_index > 0 else 0
        true_label = env.df.iloc[idx]['label']

        y_true.append(true_label)
        y_pred.append(action)

        if action == true_label:
            correct_predictions += 1
        total_predictions += 1

        state = next_state

    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0

    precision = precision_score(y_true, y_pred, pos_label=1)
    recall = recall_score(y_true, y_pred, pos_label=1)
    f1 = f1_score(y_true, y_pred, pos_label=1)
    cm = confusion_matrix(y_true, y_pred)
    TN, FP, FN, TP = cm.ravel()
    false_positive_rate = FP / (FP + TN) if (FP + TN) > 0 else 0.0

    print(f"Total Reward: {total_reward}, Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    print(f"False Positive Rate: {false_positive_rate}")

    # Restore the original epsilon
    agent.epsilon = original_epsilon


In [192]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load your DataFrame
df = pd.read_csv('train.csv')

In [193]:
df['label'] = df['label'].apply(lambda x: 0 if x == 'BENIGN' else 1)
df.drop('Unnamed: 0',axis=1,inplace=True)

In [194]:
# Split into training and testing sets (67% train, 33% test)
train_df, test_df = train_test_split(df, test_size=0.33, random_state=DEFAULT_RANDOM_SEED)

features = train_df.drop('label', axis=1).columns
scaler = StandardScaler()
train_df[features] = scaler.fit_transform(train_df[features])
test_df[features] = scaler.transform(test_df[features])

In [195]:
train_df.head(10)

Unnamed: 0,destination port,flow duration,total fwd packets,total backward packets,total length of fwd packets,total length of bwd packets,fwd packet length max,fwd packet length min,fwd packet length mean,fwd packet length std,...,min_seg_size_forward,active mean,active std,active max,active min,idle mean,idle std,idle max,idle min,label
21195,-0.446089,-0.308277,-0.009887,-0.008496,-0.015199,-0.008606,0.383403,-0.320313,0.354924,0.626143,...,0.004105,-0.128337,-0.12586,-0.163884,-0.10152,-0.370582,-0.127538,-0.378319,-0.354375,0
2365,-0.447524,-0.471959,-0.013253,-0.010965,-0.055662,-0.008845,-0.262003,0.200909,-0.166412,-0.260931,...,0.004105,-0.128337,-0.12586,-0.163884,-0.10152,-0.370582,-0.127538,-0.378319,-0.354375,0
40915,-0.310397,-0.472846,-0.013253,-0.013434,-0.059769,-0.009009,-0.29355,-0.216069,-0.287594,-0.260931,...,0.004105,-0.128337,-0.12586,-0.163884,-0.10152,-0.370582,-0.127538,-0.378319,-0.354375,0
72727,-0.447524,-0.472153,-0.013253,-0.010965,-0.052754,-0.008864,-0.239657,0.496267,-0.080574,-0.260931,...,0.00411,-0.128337,-0.12586,-0.163884,-0.10152,-0.370582,-0.127538,-0.378319,-0.354375,0
19230,-0.447524,-0.472842,-0.013253,-0.010965,-0.054636,-0.008932,-0.254116,0.305153,-0.136116,-0.260931,...,0.00411,-0.128337,-0.12586,-0.163884,-0.10152,-0.370582,-0.127538,-0.378319,-0.354375,0
39504,-0.447524,-0.432069,-0.014936,-0.0122,-0.056261,-0.008891,-0.23177,0.600512,-0.050279,-0.260931,...,0.004105,-0.128337,-0.12586,-0.163884,-0.10152,-0.370582,-0.127538,-0.378319,-0.354375,0
11632,-0.446089,-0.472505,-0.009887,-0.007261,-0.030768,-0.002405,0.159943,-0.320313,0.125183,0.341283,...,0.00411,-0.128337,-0.12586,-0.163884,-0.10152,-0.370582,-0.127538,-0.378319,-0.354375,1
20569,-0.447524,-0.470661,-0.013253,-0.010965,-0.053609,-0.008782,-0.246229,0.409397,-0.105821,-0.260931,...,0.004105,-0.128337,-0.12586,-0.163884,-0.10152,-0.370582,-0.127538,-0.378319,-0.354375,0
14324,-0.426796,-0.310773,-0.003155,-0.007261,-0.02786,-0.007224,-0.033285,-0.320313,-0.074894,-0.001317,...,0.004105,0.240467,-0.12586,0.115644,0.29347,-0.153256,-0.127538,-0.168469,-0.134445,0
27909,-0.447524,-0.471112,-0.009887,-0.010965,-0.046081,-0.008896,-0.244914,0.426771,-0.100771,-0.260931,...,0.004105,-0.128337,-0.12586,-0.163884,-0.10152,-0.370582,-0.127538,-0.378319,-0.354375,0


In [196]:
# Create training and testing environments
train_env = TabularClassificationEnv(train_df)
test_env = TabularClassificationEnv(test_df)

# Get observation and action dimensions from the training environment
obs_dim = train_env.observation_space.shape[0]
action_dim = train_env.action_space.n

# Instantiate the DQN Agent and Replay Buffer
agent = DQNAgent(obs_dim, action_dim, hidden_sizes=(128, 128), lr=0.001,
                 gamma=0.99, epsilon=0.1, device='cpu')
replay_buffer = ReplayBuffer()

In [197]:
# Train the agent on the training environment
train_agent(agent, train_env, replay_buffer, num_iterations=4000, verbose=False)

# Test the agent on the testing environment
test_agent(agent, test_env)

Training...
Total Reward: 47357.0, Accuracy: 0.9478010686395396
Precision: 0.8334309991210079
Recall: 0.9363067807768268
F1 Score: 0.881878778483956
False Positive Rate: 0.04917820069204152
