In [1]:
import gc
import os
import random
import zipfile
from io import BytesIO, StringIO

import kagglehub
import numpy as np
import pandas as pd
import requests
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

In [2]:
# Pin every random number generator up front so reruns reproduce the same results
DEFAULT_RANDOM_SEED = 2021


def seedBasic(seed=DEFAULT_RANDOM_SEED):
    """Seed Python's ``random`` module and NumPy's global RNG, and export PYTHONHASHSEED.

    Parameters
    ----------
    seed : int, optional
        Seed value applied to every generator; defaults to ``DEFAULT_RANDOM_SEED``.
    """
    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here
    # only influences child processes launched afterwards, not this kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)


def seedEverything(seed=DEFAULT_RANDOM_SEED):
    """Seed all RNG sources used by this notebook (currently delegates to ``seedBasic``)."""
    seedBasic(seed)


seedEverything(DEFAULT_RANDOM_SEED)

In [3]:
# Fetch the latest published version of the dataset; the call returns the
# local directory that holds the downloaded files.
KAGGLE_DATASET = "chethuhn/network-intrusion-dataset"
path = kagglehub.dataset_download(KAGGLE_DATASET)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/chethuhn/network-intrusion-dataset?dataset_version_number=1...


100%|██████████| 230M/230M [00:03<00:00, 64.8MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/chethuhn/network-intrusion-dataset/versions/1


In [4]:
def load_and_preprocess_data(path):
    """Load every CSV file directly inside ``path`` into one cleaned DataFrame.

    Steps, in order: read each ``*.csv`` file (in sorted filename order),
    concatenate the rows, strip and lower-case the column names, turn
    +/-inf into NaN, drop rows containing any NaN, then drop fully
    duplicated rows.

    Parameters
    ----------
    path : str or os.PathLike
        Directory to scan for ``.csv`` files (not recursive).

    Returns
    -------
    pandas.DataFrame
        The cleaned rows. The index is not reset after filtering, so it may
        contain gaps.

    Raises
    ------
    ValueError
        If ``path`` contains no ``.csv`` files.
    """
    # os.listdir() yields entries in arbitrary, platform-dependent order.
    # Sorting makes the concatenated row order -- and therefore any seeded
    # sampling done on the result -- reproducible across machines.
    csv_paths = sorted(
        os.path.join(path, name) for name in os.listdir(path) if name.endswith(".csv")
    )
    if not csv_paths:
        raise ValueError(f"No .csv files found in {path!r}")

    frames = [pd.read_csv(csv_path) for csv_path in csv_paths]
    df = pd.concat(frames, ignore_index=True)
    # Release the per-file frames before the cleaning passes allocate more memory
    del frames

    df.columns = df.columns.str.strip().str.lower()

    # Treat infinities as missing so the dropna() below removes them too.
    # In-place operations are kept on this freshly built local frame to limit copies.
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Drop rows with missing values and exact duplicates
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)

    return df

In [5]:
# Read every CSV in the downloaded dataset directory and clean the rows into a single frame
df = load_and_preprocess_data(path)

In [6]:
# Free up RAM: run a full garbage-collection pass after loading the data
gc.collect()

9

In [24]:
# Target roughly 10% of the cleaned rows, but never fewer than min_sample_size
subset_percentage = 0.1
min_sample_size = 1000
total_rows = len(df)
subset_size = max(int(total_rows * subset_percentage), min_sample_size)

# One draw per label: its proportional share of subset_size, raised to
# min_sample_size, then capped at the class size because sampling is
# done without replacement.
subset_list = []
for _, label_rows in df.groupby('label'):
    label_share = len(label_rows) / total_rows
    quota = max(int(subset_size * label_share), min_sample_size)
    n_take = min(quota, len(label_rows))
    subset_list.append(
        resample(
            label_rows,
            replace=False,
            n_samples=n_take,
            random_state=DEFAULT_RANDOM_SEED,
        )
    )

# Shuffle so rows are no longer grouped by label, then renumber from 0
subset_df = (
    pd.concat(subset_list)
    .sample(frac=1, random_state=DEFAULT_RANDOM_SEED)
    .reset_index(drop=True)
)

In [25]:
# Hold out 33% of the shuffled subset for testing (seeded for repeatability)
train_df, test_df = train_test_split(
    subset_df,
    test_size=0.33,
    random_state=DEFAULT_RANDOM_SEED,
)

# Report how many rows and columns landed in each partition
print("Train Dataset Shape:", train_df.shape)
print("Test Dataset Shape:", test_df.shape)

# Preview the training partition
train_df.head()

Train Dataset Shape: (171792, 79)
Test Dataset Shape: (84615, 79)


Unnamed: 0,destination port,flow duration,total fwd packets,total backward packets,total length of fwd packets,total length of bwd packets,fwd packet length max,fwd packet length min,fwd packet length mean,fwd packet length std,...,min_seg_size_forward,active mean,active std,active max,active min,idle mean,idle std,idle max,idle min,label
87113,443,3528693,12,11,999,3950,267,0,83.25,95.785864,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
74766,443,2569280,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
204834,61478,89,1,2,6,12,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
160785,65091,79,1,3,0,18,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
236828,53,214,2,2,68,484,34,34,34.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [26]:
# Additional labelled rows hosted on GitHub, appended to the held-out split below
github_url = "https://github.com/akashvenus/Final_Project/raw/main/data.csv"

# Bound the wait and fail loudly on HTTP errors; otherwise an error page
# would be handed to read_csv as if it were the CSV payload.
response = requests.get(github_url, timeout=60)
response.raise_for_status()

github_df = pd.read_csv(StringIO(response.text))
# The file carries a saved row index as 'Unnamed: 0'; discard it if present
github_df = github_df.drop(columns='Unnamed: 0', errors='ignore')
# Normalize header names the same way load_and_preprocess_data does
github_df.columns = github_df.columns.str.strip().str.lower()

# pd.concat would silently NaN-pad any column present in only one frame,
# so verify both frames share the same columns before stacking them.
mismatched_columns = set(test_df.columns).symmetric_difference(github_df.columns)
if mismatched_columns:
    raise ValueError(
        f"Column mismatch between test_df and github_df: {sorted(mismatched_columns)}"
    )

# Combine with test_df
combined_test_df = pd.concat([test_df, github_df], ignore_index=True)
print("Updated Combined Test Dataset Shape:", combined_test_df.shape)

Updated Combined Test Dataset Shape: (94615, 79)


In [28]:
# Preview the held-out split on its own (without the GitHub rows)
test_df.head(5)

Unnamed: 0,destination port,flow duration,total fwd packets,total backward packets,total length of fwd packets,total length of bwd packets,fwd packet length max,fwd packet length min,fwd packet length mean,fwd packet length std,...,min_seg_size_forward,active mean,active std,active max,active min,idle mean,idle std,idle max,idle min,label
49358,53,73193,2,2,104,220,52,52,52.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
140585,443,117100929,15,15,1065,4598,406,0,71.0,109.846646,...,20,193702.5,151651.07,300936,86469,58300000.0,302136.8281,58500000,58100000,BENIGN
66279,53,188,2,2,150,278,75,75,75.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
94987,80,98226771,9,6,331,11595,331,0,36.777778,110.333333,...,32,2002.0,0.0,2002,2002,98200000.0,0.0,98200000,98200000,DoS Hulk
155916,80,1228303,3,2,538,133,538,0,179.333333,310.614445,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [29]:
# Preview the rows fetched from GitHub, after dropping 'Unnamed: 0' and normalizing column names
github_df.head(5)

Unnamed: 0,destination port,flow duration,total fwd packets,total backward packets,total length of fwd packets,total length of bwd packets,fwd packet length max,fwd packet length min,fwd packet length mean,fwd packet length std,...,min_seg_size_forward,active mean,active std,active max,active min,idle mean,idle std,idle max,idle min,label
0,50,59355,12,4,41,0,740,0,4.042365,0.790155,...,-282,0.0,0.0,0,513,66744.54,0.0,0,28865,SSH-Patator
1,52,30580407,7,5,452,6605,392,0,123.025224,171.642192,...,-702,1201.57023,0.0,0,1652,10623130.0,4498.683927,20514533,5997158,DoS GoldenEye
2,112,102992405,2,2,35,0,18,6,6.49313,0.0,...,-745,659.316284,0.0,0,1311,101215100.0,6542.295628,98763141,100435419,DoS slowloris
3,453,60682560,1,13,53,806,32,1,37.33318,13.381788,...,-409,101479.907127,0.0,23399,44,69326520.0,2442.130102,58719465,75697953,BENIGN
4,29,6332515,13,11,92,1693,29,1,7.817241,7.635458,...,-451,0.0,0.0,1364,207,71431.09,0.0,0,39215,FTP-Patator


In [30]:
# Preview the combined frame; test_df was listed first in the concat, so its rows lead, renumbered from 0
combined_test_df.head(10)

Unnamed: 0,destination port,flow duration,total fwd packets,total backward packets,total length of fwd packets,total length of bwd packets,fwd packet length max,fwd packet length min,fwd packet length mean,fwd packet length std,...,min_seg_size_forward,active mean,active std,active max,active min,idle mean,idle std,idle max,idle min,label
0,53,73193,2,2,104,220,52,52,52.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,443,117100929,15,15,1065,4598,406,0,71.0,109.846646,...,20,193702.5,151651.07,300936,86469,58300000.0,302136.8281,58500000,58100000,BENIGN
2,53,188,2,2,150,278,75,75,75.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,80,98226771,9,6,331,11595,331,0,36.777778,110.333333,...,32,2002.0,0.0,2002,2002,98200000.0,0.0,98200000,98200000,DoS Hulk
4,80,1228303,3,2,538,133,538,0,179.333333,310.614445,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
5,80,1398091,6,5,1228,1272,615,0,204.666667,317.068867,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
6,53,162,2,2,72,196,36,36,36.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
7,62152,1111002,1,6,1375,30,1375,1375,1375.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
8,443,5294767,9,6,350,4946,193,0,38.888889,71.129186,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
9,53,396,2,2,62,164,31,31,31.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [31]:
# Persist the training split; the row index is written as an unnamed first column
train_df.to_csv("train.csv", index=True)

In [32]:
# Persist the held-out split; the row index is written as an unnamed first column
test_df.to_csv("test.csv", index=True)

In [33]:
# Persist the held-out split plus the GitHub rows; the row index is written as an unnamed first column
combined_test_df.to_csv("combined_test_df.csv", index=True)