In [6]:
import os
import pandas as pd
import kagglehub
from tqdm import tqdm
import numpy as np
import zipfile
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time

In [2]:
def reduce_mem_usage(df, verbose=True, allow_float16=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Integer columns (int16/int32/int64) are converted to the first of
    int8, int16, int32, int64 whose limits contain the column's min and max
    (limits inclusive). Float columns (float16/float32/float64) are converted
    to float16 or float32 when their range fits, otherwise to float64.
    Columns of any other dtype are left untouched.

    The decision is based on value *range* only, not precision. float16 tops
    out at 65504 and carries few significant digits, so aggregates over
    float16 columns can overflow or lose accuracy; pass ``allow_float16=False``
    to use float32 as the smallest float type.

    Args:
        df: DataFrame to optimise. Its columns are replaced in place.
        verbose: When True, print the final memory usage and the reduction.
        allow_float16: When False, never downcast floats below float32.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column spanning exactly [-128, 127] fits int8.
            # An empty column yields NaN min/max, matches nothing and is left as is.
            target = next(
                (t for t in int_candidates
                 if np.iinfo(t).min <= c_min and c_max <= np.iinfo(t).max),
                None,
            )
            if target is not None:
                df[col] = df[col].astype(target)
        else:
            # All-NaN or infinite columns match no candidate and fall back to float64.
            target = next(
                (t for t in float_candidates
                 if np.finfo(t).min <= c_min and c_max <= np.finfo(t).max),
                np.float64,
            )
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        # Skip the percentage when there was nothing to measure against.
        if start_mem > 0:
            print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [3]:
# Path to the cached dataset (pinned to the version directory this notebook was run against)
cache_path = os.path.expanduser("~/.cache/kagglehub/datasets/mryanm/luflow-network-intrusion-detection-data-set/versions/240")

if not os.path.exists(cache_path):
    # Not cached: download the latest version and use the path kagglehub hands back.
    # NOTE(review): "latest" may differ from the pinned version above.
    cache_path = kagglehub.dataset_download("mryanm/luflow-network-intrusion-detection-data-set")


def _sorted_subdirs(parent):
    """Return the full paths of the directories directly inside `parent`, sorted by name."""
    candidates = (os.path.join(parent, name) for name in sorted(os.listdir(parent)))
    return [path for path in candidates if os.path.isdir(path)]


# One optimised DataFrame per daily CSV
df_list = []

# Walk <cache_path>/<year>/<month>/<day>/*.csv in a deterministic (sorted) order
for year in tqdm(sorted(os.listdir(cache_path))):
    year_path = os.path.join(cache_path, year)
    if not os.path.isdir(year_path):
        continue
    for month_path in _sorted_subdirs(year_path):
        for day_path in _sorted_subdirs(month_path):
            # Sort the files too, so row order is reproducible across runs
            for file in sorted(os.listdir(day_path)):
                if not file.endswith(".csv"):
                    continue
                full_path = os.path.join(day_path, file)
                data = pd.read_csv(full_path)

                # The date is parsed from the file name, which must start with
                # three dot-separated integers (year.month.day)
                y, m, d = map(int, file.split(".")[:3])
                data["Year"] = y
                data["Month"] = m
                data["Day"] = d

                # Reduce memory usage BEFORE adding to list
                data = reduce_mem_usage(data, verbose=False)
                df_list.append(data)

if not df_list:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate"
    raise FileNotFoundError(f"No CSV files found under {cache_path}")

# Final concatenation
df = pd.concat(df_list, ignore_index=True)

# Memory summary
print(f"Final dataset memory usage: {df.memory_usage().sum() / 1024**2:.2f} MB")
# info() prints its report itself and returns None, so it must not be wrapped in print()
df.info()  # Quick check

100%|██████████| 4/4 [02:18<00:00, 34.66s/it] 


Final dataset memory usage: 15365.04 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206556547 entries, 0 to 206556546
Data columns (total 19 columns):
 #   Column         Dtype  
---  ------         -----  
 0   avg_ipt        float32
 1   bytes_in       int32  
 2   bytes_out      int32  
 3   dest_ip        int32  
 4   dest_port      float64
 5   entropy        float16
 6   num_pkts_out   int16  
 7   num_pkts_in    int16  
 8   proto          int16  
 9   src_ip         int32  
 10  src_port       float64
 11  time_end       int64  
 12  time_start     int64  
 13  total_entropy  float32
 14  label          object 
 15  duration       float16
 16  Year           int16  
 17  Month          int8   
 18  Day            int8   
dtypes: float16(2), float32(2), float64(2), int16(4), int32(4), int64(2), int8(2), object(1)
memory usage: 15.0+ GB
None


The full dataset is too large to compute descriptive statistics in a timely manner, so we inspect only a subset (the 2022 rows). The mean is NaN for the float16 columns (`entropy` and `duration`) because their sums overflow, a consequence of the downcasting applied when loading the data.

In [4]:
# Summary statistics for the 2022 rows only (a subset of the full frame)
df.loc[df['Year'].eq(2022)].describe()

  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan
  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan


Unnamed: 0,avg_ipt,bytes_in,bytes_out,dest_ip,dest_port,entropy,num_pkts_out,num_pkts_in,proto,src_ip,src_port,time_end,time_start,total_entropy,duration,Year,Month,Day
count,1068376.0,1068376.0,1068376.0,1068376.0,964168.0,1068376.0,1068376.0,1068376.0,1068376.0,1068376.0,964168.0,1068376.0,1068376.0,1068376.0,1068376.0,1068376.0,1068376.0,1068376.0
mean,4964985.0,699.9835,3620.235,786.0,14856.036961,,7.804426,4.931449,5.60987,786.0,36898.051885,1504349000000000.0,1505360000000000.0,13035.08,,2022.0,6.0,13.52783
std,84310510.0,3288.582,8257.19,0.0,16039.005055,0.0,21.69485,14.91951,1.870952,0.0,17895.153177,452106800000000.0,450745200000000.0,65941.2,0.0,0.0,0.0,0.5460865
min,0.0,0.0,0.0,786.0,1.0,0.0,0.0,0.0,1.0,786.0,11.0,16550760000.0,16550890000.0,0.0,0.0,2022.0,6.0,12.0
25%,0.0,0.0,0.0,786.0,5900.0,1.020508,1.0,0.0,6.0,786.0,19780.0,1655092000000000.0,1655092000000000.0,35.01955,0.0,2022.0,6.0,13.0
50%,0.0,0.0,43.0,786.0,9200.0,3.0,3.0,1.0,6.0,786.0,45332.0,1655163000000000.0,1655163000000000.0,323.8136,0.0001959801,2022.0,6.0,14.0
75%,35.71429,34.0,2904.0,786.0,9300.0,5.019531,7.0,5.0,6.0,786.0,47613.0,1655182000000000.0,1655182000000000.0,20671.65,0.2279053,2022.0,6.0,14.0
max,4294967000.0,65483.0,65535.0,786.0,65535.0,134.25,255.0,255.0,47.0,786.0,65535.0,1655251000000000.0,1655251000000000.0,3979174.0,41.03125,2022.0,6.0,14.0


Feature descriptions, taken from the LUFlow GitHub repository (https://github.com/ruzzzzz/luflow):

#### src_ip:
The source IP address associated with the flow. This feature is anonymised to the corresponding Autonomous System

#### src_port:
The source port number associated with the flow.

#### dest_ip:
The destination IP address associated with the flow. The feature is also anonymised in the same manner as before.

#### dest_port:
The destination port number associated with the flow

#### protocol:
The protocol number associated with the flow (this appears as the `proto` column in the loaded data). For example, TCP is 6.

#### bytes_in:
The number of bytes transmitted from source to destination

#### bytes_out:
The number of bytes transmitted from destination to source.

#### num_pkts_in:
The packet count from source to destination

#### num_pkts_out:
The packet count from destination to source

#### entropy:
The entropy in bits per byte of the data fields within the flow. This number ranges from 0 to 8.

#### total_entropy:
The total entropy in bytes over all of the bytes in the data fields of the flow

#### mean_ipt:
The mean of the inter-packet arrival times of the flow (this appears to correspond to the `avg_ipt` column in the loaded data).

#### time_start:
The start time of the flow in seconds since the epoch.

#### time_end:
The end time of the flow in seconds since the epoch

#### duration:
The flow duration time, with microsecond precision

#### label:
The label of the flow, as decided by Tangerine. Either benign, outlier, or malicious

The NetFlow dataset used below (NF-UNSW-NB15-v2) was downloaded from https://staff.itee.uq.edu.au/marius/NIDS_datasets/#RA6

In [3]:
# Unpack the downloaded archive into the local "data" directory
archive_name = 'fe6cb615d161452c_MOHANAD_A4706.zip'
with zipfile.ZipFile(archive_name) as archive:
    archive.extractall(path='data')

In [4]:
# Location of the NF-UNSW-NB15-v2 CSV inside the extracted archive
nf_path = 'data/fe6cb615d161452c_MOHANAD_A4706/data/NF-UNSW-NB15-v2.csv'

# Read the whole file into memory, then list its columns and dtypes
df_nf = pd.read_csv(filepath_or_buffer=nf_path)
df_nf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2390275 entries, 0 to 2390274
Data columns (total 45 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   IPV4_SRC_ADDR                object 
 1   L4_SRC_PORT                  int64  
 2   IPV4_DST_ADDR                object 
 3   L4_DST_PORT                  int64  
 4   PROTOCOL                     int64  
 5   L7_PROTO                     float64
 6   IN_BYTES                     int64  
 7   IN_PKTS                      int64  
 8   OUT_BYTES                    int64  
 9   OUT_PKTS                     int64  
 10  TCP_FLAGS                    int64  
 11  CLIENT_TCP_FLAGS             int64  
 12  SERVER_TCP_FLAGS             int64  
 13  FLOW_DURATION_MILLISECONDS   int64  
 14  DURATION_IN                  int64  
 15  DURATION_OUT                 int64  
 16  MIN_TTL                      int64  
 17  MAX_TTL                      int64  
 18  LONGEST_FLOW_PKT             int64  
 19  

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of df_nf
df_nf.describe()

Unnamed: 0,L4_SRC_PORT,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,NUM_PKTS_1024_TO_1514_BYTES,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label
count,2390275.0,2390275.0,2390275.0,2390275.0,2390275.0,2390275.0,2390275.0,2390275.0,2390275.0,2390275.0,...,2390275.0,2390275.0,2390275.0,2390275.0,2390275.0,2390275.0,2390275.0,2390275.0,2390275.0,2390275.0
mean,32549.61,11423.09,9.151913,3.038721,4621.664,36.03686,35265.65,45.70363,19.89969,19.61149,...,23.82468,17387.63,12221.6,18944.43,74.00138,4949.701,2.970099,27256.37,37.65351,0.03976655
std,19162.12,18539.15,11.01865,13.58876,70214.09,79.95682,156515.5,117.4495,11.13381,11.05082,...,111.4788,16930.17,10975.92,20371.2,79.57477,13780.88,124.034,8958054.0,84.30295,0.1954103
min,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,15831.0,25.0,6.0,0.0,481.0,4.0,312.0,4.0,19.0,19.0,...,0.0,0.0,5792.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,32650.0,80.0,6.0,0.0,1684.0,18.0,2456.0,18.0,27.0,27.0,...,0.0,14480.0,13032.0,11008.0,43.0,0.0,0.0,0.0,0.0,0.0
75%,49076.0,15887.0,6.0,1.0,3286.0,44.0,14968.0,44.0,27.0,27.0,...,12.0,24616.0,14480.0,33792.0,132.0,0.0,0.0,0.0,0.0,0.0
max,65535.0,65535.0,255.0,248.0,30241410.0,22894.0,14658520.0,11078.0,31.0,31.0,...,19676.0,65533.0,65531.0,65317.0,255.0,65535.0,55937.0,4283358000.0,501.0,1.0


In [7]:
# Base URL of the dataset: the directory listing the crawler starts from
BASE_URL = "http://malnet.cc.gatech.edu/image-data/"
# Local directory under which the crawled files and subdirectories are saved
SAVE_DIR = "data/MalNet_Dataset"

def ensure_directory(path):
    """Create ``path`` (and any missing parent directories) if it does not exist.

    ``exist_ok=True`` makes the call idempotent without a separate existence
    check, so there is no gap between "check" and "create" in which another
    process could create the directory and make ``makedirs`` fail.

    Raises:
        FileExistsError: if ``path`` exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)

def get_links(url):
    """Return the absolute URLs of entries listed beneath the directory page at ``url``.

    Only links on the same host that point strictly below the directory of
    ``url`` are returned. This drops parent-directory links in every form
    (``../``, ``/``, absolute paths), links back to the page itself (for
    example column-sort links that differ only in the query string), and
    links to other sites, any of which would otherwise let a recursive
    crawler loop or leave the dataset tree. Duplicates are removed while
    keeping first-seen order.

    Args:
        url: URL of a directory listing page.

    Returns:
        A list of absolute URLs. Empty if the page cannot be fetched; a
        message is printed in that case.
    """
    try:
        # A timeout keeps a stalled server from hanging the crawl indefinitely.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to access {url}: {exc}")
        return []
    if response.status_code != 200:
        print(f"Failed to access {url}")
        return []

    base = urlparse(url)
    # Relative hrefs resolve against the directory part of the base path.
    if base.path.endswith("/"):
        base_dir = base.path
    else:
        base_dir = base.path.rsplit("/", 1)[0] + "/"

    soup = BeautifulSoup(response.text, "html.parser")
    links = []
    seen = set()
    for link in soup.find_all("a"):
        href = link.get("href")
        if not href:
            continue
        full_url = urljoin(url, href)
        target = urlparse(full_url)
        if target.netloc != base.netloc:
            continue  # off-site link
        if not target.path.startswith(base_dir) or target.path == base_dir:
            continue  # parent directory, sibling tree, or the page itself
        if full_url in seen:
            continue
        seen.add(full_url)
        links.append(full_url)
    return links

def download_file(url, save_path):
    """Download ``url`` to ``save_path`` with a progress bar.

    The body is streamed into ``save_path + ".part"`` and renamed to
    ``save_path`` only after the transfer finishes. An interrupted or failed
    download therefore never leaves a truncated file at ``save_path``, where
    a later "file already exists" check would mistake it for a complete one.

    A non-200 response is reported with a printed message and nothing is
    written. Network errors and interrupts propagate to the caller after the
    partial file has been removed.

    Args:
        url: URL of the file to fetch.
        save_path: Destination path; its parent directory must already exist.
    """
    partial_path = save_path + ".part"
    try:
        # The context manager closes the connection; the timeout bounds
        # connection setup and each read.
        with requests.get(url, stream=True, timeout=30) as response:
            if response.status_code != 200:
                # Do not save an error page as if it were the dataset file.
                print(f"Failed to download {url} (HTTP {response.status_code})")
                return
            total_size = int(response.headers.get("content-length", 0))
            with open(partial_path, "wb") as file, tqdm(
                desc=os.path.basename(save_path),
                total=total_size or None,  # unknown length -> open-ended bar
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
            ) as bar:
                # 1 MiB chunks: far fewer Python-level iterations on ~1 GB files
                for data in response.iter_content(chunk_size=1024 * 1024):
                    file.write(data)
                    bar.update(len(data))
        # Publish the file only once it is complete.
        os.replace(partial_path, save_path)
    except BaseException:
        # Also covers KeyboardInterrupt: never leave a half-written file behind.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise

def crawl_and_download(url, save_path, exclude=("6GB",)):
    """Recursively mirror the directory listing at ``url`` into ``save_path``.

    Links whose path ends in "/" are treated as subdirectories and crawled
    into a matching local subdirectory. Every other link is treated as a file
    and downloaded unless a file of that name already exists locally.

    Args:
        url: URL of the directory listing to crawl.
        save_path: Local directory to populate; created if missing.
        exclude: Substrings; a subdirectory whose URL path contains any of
            them is skipped entirely. Defaults to skipping the "6GB" directory.
    """
    ensure_directory(save_path)

    for link in get_links(url):
        parsed = urlparse(link)
        if parsed.path.endswith("/"):
            # Skip excluded directories explicitly rather than letting them
            # fall through to the file branch with an empty file name.
            if any(token in parsed.path for token in exclude):
                continue
            subdir_name = os.path.basename(os.path.normpath(parsed.path))
            crawl_and_download(link, os.path.join(save_path, subdir_name), exclude)
        else:  # Otherwise, it's a file
            filename = os.path.basename(parsed.path)
            file_path = os.path.join(save_path, filename)
            # Nothing to fetch for an empty name or an already-downloaded file
            if not filename or os.path.exists(file_path):
                continue
            download_file(link, file_path)

        time.sleep(1)  # Be polite: pause only after a request was actually made

# Mirror everything under BASE_URL into SAVE_DIR
crawl_and_download(url=BASE_URL, save_path=SAVE_DIR)

print("Download complete!")

malnet-image00: 100%|██████████| 0.98G/0.98G [01:01<00:00, 16.9MB/s]
malnet-image01: 100%|██████████| 0.98G/0.98G [01:42<00:00, 10.2MB/s]
malnet-image02: 100%|██████████| 0.98G/0.98G [01:51<00:00, 9.45MB/s]
malnet-image03: 100%|██████████| 0.98G/0.98G [01:39<00:00, 10.6MB/s]
malnet-image04: 100%|██████████| 0.98G/0.98G [01:49<00:00, 9.60MB/s]
malnet-image05: 100%|██████████| 0.98G/0.98G [01:47<00:00, 9.72MB/s]
malnet-image06: 100%|██████████| 0.98G/0.98G [01:42<00:00, 10.2MB/s]
malnet-image07: 100%|██████████| 0.98G/0.98G [01:41<00:00, 10.3MB/s]
malnet-image08: 100%|██████████| 0.98G/0.98G [01:40<00:00, 10.4MB/s]
malnet-image09: 100%|██████████| 0.98G/0.98G [01:39<00:00, 10.5MB/s]
malnet-image10: 100%|██████████| 0.98G/0.98G [01:39<00:00, 10.5MB/s]
malnet-image11: 100%|██████████| 0.98G/0.98G [01:38<00:00, 10.7MB/s]
malnet-image12:  27%|██▋       | 275M/0.98G [00:31<01:23, 9.13MB/s] 


KeyboardInterrupt: 