In [3]:
!pip install -q imbalanced-learn sentence-transformers faiss-cpu torch scikit-learn transformers pandas numpy -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
# STEP 1 — Imports
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sentence_transformers import SentenceTransformer
import faiss
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

In [5]:
!wget -nc -O MachineLearningCVE.zip http://205.174.165.80/CICDataset/CIC-IDS-2017/Dataset/CIC-IDS-2017/CSVs/MachineLearningCSV.zip

# Download MachineLearningCSV.md5 file to check the integrity of the downloaded file.
!wget -nc http://205.174.165.80/CICDataset/CIC-IDS-2017/Dataset/CIC-IDS-2017/CSVs/MachineLearningCSV.md5

# Checking the file integrity.
!md5sum -c MachineLearningCSV.md5

--2025-10-28 21:47:12--  http://205.174.165.80/CICDataset/CIC-IDS-2017/Dataset/CIC-IDS-2017/CSVs/MachineLearningCSV.zip
Connecting to 205.174.165.80:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 235102953 (224M) [application/zip]
Saving to: ‘MachineLearningCVE.zip’


2025-10-28 21:47:29 (12.8 MB/s) - ‘MachineLearningCVE.zip’ saved [235102953/235102953]

--2025-10-28 21:47:29--  http://205.174.165.80/CICDataset/CIC-IDS-2017/Dataset/CIC-IDS-2017/CSVs/MachineLearningCSV.md5
Connecting to 205.174.165.80:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 57
Saving to: ‘MachineLearningCSV.md5’


2025-10-28 21:47:30 (5.57 MB/s) - ‘MachineLearningCSV.md5’ saved [57/57]

MachineLearningCVE.zip: OK


In [6]:
# Save the zip and extracted files to Google Drive at CICIDS2017 folder.
!mkdir -p "/content/drive/My Drive/CICIDS2017/"

!cp MachineLearningCVE.zip "/content/drive/My Drive/CICIDS2017/"

In [7]:
# Unzip files
!unzip -n "/content/drive/My Drive/CICIDS2017/MachineLearningCVE.zip"

Archive:  /content/drive/My Drive/CICIDS2017/MachineLearningCVE.zip
   creating: MachineLearningCVE/
  inflating: MachineLearningCVE/Wednesday-workingHours.pcap_ISCX.csv  
  inflating: MachineLearningCVE/Tuesday-WorkingHours.pcap_ISCX.csv  
  inflating: MachineLearningCVE/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv  
  inflating: MachineLearningCVE/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv  
  inflating: MachineLearningCVE/Monday-WorkingHours.pcap_ISCX.csv  
  inflating: MachineLearningCVE/Friday-WorkingHours-Morning.pcap_ISCX.csv  
  inflating: MachineLearningCVE/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv  
  inflating: MachineLearningCVE/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv  


In [2]:
# STEP 2 — Load & Merge All CSV Files
csv_files = [
    "Monday-WorkingHours.pcap_ISCX.csv",
    "Tuesday-WorkingHours.pcap_ISCX.csv",
    "Wednesday-workingHours.pcap_ISCX.csv",
    "Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
    "Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv",
    "Friday-WorkingHours-Morning.pcap_ISCX.csv",
    "Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
    "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv"
]

data_list = []
for f in csv_files:
    path = os.path.join("MachineLearningCVE", f)
    df = pd.read_csv(path)
    df.rename(columns={col: col.strip() for col in df.columns}, inplace=True)
    data_list.append(df)

data = pd.concat(data_list, ignore_index=True)
print(f"Full dataset shape: {data.shape}")

Full dataset shape: (2830743, 79)


In [3]:
# STEP 3 — Basic Cleanup
data.drop_duplicates(inplace=True)

# Replace inf/-inf with NaN
data.replace([np.inf, -np.inf], np.nan, inplace=True)

# Get numeric columns
numeric_cols = data.select_dtypes(include=np.number).columns

for col in numeric_cols:
    # Fill NaNs with median safely
    data[col] = data[col].fillna(data[col].median())

    # Convert types
    if np.issubdtype(data[col].dtype, np.floating):
        data[col] = data[col].astype(np.float32)
    elif np.issubdtype(data[col].dtype, np.integer):
        data[col] = data[col].astype(np.int32)

# Remove constant columns
data = data[[col for col in data.columns if data[col].nunique() > 1]]

print(f"Dataset shape after cleaning: {data.shape}")


Dataset shape after cleaning: (2522362, 71)


In [4]:
# STEP 4 — Encode label columns
if 'Label' in data.columns:
    data['Label_encoded'] = data['Label'].apply(lambda x: 0 if x == 'BENIGN' else 1)


In [5]:
# STEP 5 — Train/Test split
X = data.drop('Label', axis=1)
y = data['Label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training size: {X_train.shape}, Test size: {X_test.shape}")


Training size: (2017889, 71), Test size: (504473, 71)


In [6]:
# STEP 6 - Compute class weights for memory-efficient training
classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weight_dict = dict(zip(classes, class_weights))
print("Class weights:", class_weight_dict)

Class weights: {'BENIGN': np.float64(0.08020926308952629), 'Bot': np.float64(86.12415706359369), 'DDoS': np.float64(1.3135630567733914), 'DoS GoldenEye': np.float64(16.347786284279174), 'DoS Hulk': np.float64(0.9728587372871754), 'DoS Slowhttptest': np.float64(32.16784632552208), 'DoS slowloris': np.float64(31.22700402352213), 'FTP-Patator': np.float64(28.34511869644613), 'Heartbleed': np.float64(14947.325925925927), 'Infiltration': np.float64(4638.8252873563215), 'PortScan': np.float64(1.8515715825935357), 'SSH-Patator': np.float64(52.24308090614887), 'Web Attack � Brute Force': np.float64(114.39280045351474), 'Web Attack � Sql Injection': np.float64(7913.290196078431), 'Web Attack � XSS': np.float64(257.7125159642401)}


In [7]:
# STEP 7 - Scale features
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [17]:
print(data.columns.tolist())


['Destination Port', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd URG Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'CWE Flag Count', 'ECE Flag Count', 'Down/Up Ratio', 'Average Packe

In [11]:
# STEP 8 — Create log_text for LLM / RAG
def row_to_log_text(row):
    protocol = row.get('Protocol', 'UNKNOWN')
    flow_duration_min = row.get('Flow Duration', 0) / 60
    fwd_pkt = row.get('Total Fwd Packets', 0)
    bwd_pkt = row.get('Total Backward Packets', 0)
    fwd_len = row.get('Fwd Packet Length Mean', 0)
    bwd_len = row.get('Bwd Packet Length Mean', 0)
    flow_pkts_s = row.get('Flow Packets/s', 0)
    flow_bytes_s = row.get('Flow Bytes/s', 0)
    syn = row.get('SYN Flag Count', 0)
    fin = row.get('FIN Flag Count', 0)
    ack = row.get('ACK Flag Count', 0)
    rst = row.get('RST Flag Count', 0)

    return (
        f"Flow Summary:\n"
        f"- Protocol: {protocol}\n"
        f"- Destination Port: {row.get('Destination Port','UNKNOWN')}\n"
        f"- Flow Duration: {flow_duration_min:.2f} minutes\n"
        f"- Packets (Fwd/Bwd): {fwd_pkt}/{bwd_pkt}\n"
        f"- Avg Packet Length (Fwd/Bwd): {fwd_len:.1f}/{bwd_len:.1f} bytes\n"
        f"- Flow Rate: {flow_pkts_s:.2f} pkts/s, {flow_bytes_s:.2f} bytes/s\n"
        f"- Key Flags: SYN={syn}, FIN={fin}, ACK={ack}, RST={rst}\n"
        f"- Label: {row.get('Label','UNKNOWN')}\n"
    )

train_logs = pd.concat([X_train.copy(), y_train.copy()], axis=1)
test_logs  = pd.concat([X_test.copy(), y_test.copy()], axis=1)

train_logs['log_text'] = train_logs.apply(row_to_log_text, axis=1)
test_logs['log_text']  = test_logs.apply(row_to_log_text, axis=1)


train_logs[['log_text','Label']].to_csv("train_logs.csv", index=False)
test_logs[['log_text','Label']].to_csv("test_logs.csv", index=False)

print("LLM log_text datasets saved:")
print(" - train_logs.csv")
print(" - test_logs.csv")

LLM log_text datasets saved:
 - train_logs.csv
 - test_logs.csv


In [12]:
# Print first rows of each
print("=== First row of train_logs.csv ===")
print(train_logs.head(1))

=== First row of train_logs.csv ===
         Destination Port  Flow Duration  Total Fwd Packets  \
1133629                80          14207                  2   

         Total Backward Packets  Total Length of Fwd Packets  \
1133629                       2                           12   

         Total Length of Bwd Packets  Fwd Packet Length Max  \
1133629                            0                      6   

         Fwd Packet Length Min  Fwd Packet Length Mean  Fwd Packet Length Std  \
1133629                      6                     6.0                    0.0   

         ...  Active Std  Active Max  Active Min  Idle Mean  Idle Std  \
1133629  ...         0.0           0           0        0.0       0.0   

         Idle Max  Idle Min  Label_encoded     Label  \
1133629         0         0              1  DoS Hulk   

                                                  log_text  
1133629  Flow Summary:\n- Protocol: UNKNOWN\n- Destinat...  

[1 rows x 73 columns]


In [13]:
print(train_logs['log_text'].iloc[0])

Flow Summary:
- Protocol: UNKNOWN
- Destination Port: 80
- Flow Duration: 236.78 minutes
- Packets (Fwd/Bwd): 2/2
- Avg Packet Length (Fwd/Bwd): 6.0/0.0 bytes
- Flow Rate: 281.55 pkts/s, 844.65 bytes/s
- Key Flags: SYN=0, FIN=1, ACK=0, RST=0
- Label: DoS Hulk



In [14]:
print("\n=== First row of test_logs.csv ===")
print(test_logs.head(1))


=== First row of test_logs.csv ===
         Destination Port  Flow Duration  Total Fwd Packets  \
2638407                80        5011280                  5   

         Total Backward Packets  Total Length of Fwd Packets  \
2638407                       0                           30   

         Total Length of Bwd Packets  Fwd Packet Length Max  \
2638407                            0                      6   

         Fwd Packet Length Min  Fwd Packet Length Mean  Fwd Packet Length Std  \
2638407                      6                     6.0                    0.0   

         ...  Active Std  Active Max  Active Min  Idle Mean  Idle Std  \
2638407  ...         0.0        8003        8003  5003277.0       0.0   

         Idle Max  Idle Min  Label_encoded  Label  \
2638407   5003277   5003277              1   DDoS   

                                                  log_text  
2638407  Flow Summary:\n- Protocol: UNKNOWN\n- Destinat...  

[1 rows x 73 columns]


In [15]:
print(train_logs['log_text'].iloc[0])

Flow Summary:
- Protocol: UNKNOWN
- Destination Port: 80
- Flow Duration: 236.78 minutes
- Packets (Fwd/Bwd): 2/2
- Avg Packet Length (Fwd/Bwd): 6.0/0.0 bytes
- Flow Rate: 281.55 pkts/s, 844.65 bytes/s
- Key Flags: SYN=0, FIN=1, ACK=0, RST=0
- Label: DoS Hulk



In [None]:
import pandas as pd

train_logs = pd.read_csv("train_logs.csv")
test_logs = pd.read_csv("test_logs.csv")

train_logs.to_parquet("train_logs.parquet", compression="gzip")
test_logs.to_parquet("test_logs.parquet", compression="gzip")

import os
print("Compressed sizes:")
print("train_logs.parquet:", round(os.path.getsize("train_logs.parquet") / 1024 / 1024, 2), "MB")
print("test_logs.parquet:", round(os.path.getsize("test_logs.parquet") / 1024 / 1024, 2), "MB")


In [16]:
# Download log files
from google.colab import files

# files.download('train_logs.parquet')
# files.download('test_logs.parquet')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [1]:
# Break down parquet to store in Github
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
import os

def split_parquet(file_path, chunk_size_mb=20):
    """
    Split a large Parquet file into chunks smaller than chunk_size_mb.
    """
    # Load the Parquet file
    df = pd.read_parquet(file_path)
    total_rows = len(df)
    
    # Estimate rows per chunk (approximate)
    sample = df.head(1000)
    approx_row_size = sample.memory_usage(index=True, deep=True).sum() / 1000
    rows_per_chunk = int((chunk_size_mb * 1024 * 1024) / approx_row_size)

    base_name = os.path.splitext(file_path)[0]
    os.makedirs(base_name, exist_ok=True)

    print(f"Splitting {file_path} into ~{chunk_size_mb} MB chunks...")
    for i in range(0, total_rows, rows_per_chunk):
        chunk = df.iloc[i:i + rows_per_chunk]
        out_file = f"{base_name}/part_{i//rows_per_chunk + 1}.parquet"
        chunk.to_parquet(out_file, index=False)
        print(f"→ Saved {out_file} ({len(chunk)} rows)")

# Example: split both files
split_parquet("train_logs.parquet")
split_parquet("test_logs.parquet")


Splitting train_logs.parquet into ~20 MB chunks...
→ Saved train_logs/part_1.parquet (54617 rows)
→ Saved train_logs/part_2.parquet (54617 rows)
→ Saved train_logs/part_3.parquet (54617 rows)
→ Saved train_logs/part_4.parquet (54617 rows)
→ Saved train_logs/part_5.parquet (54617 rows)
→ Saved train_logs/part_6.parquet (54617 rows)
→ Saved train_logs/part_7.parquet (54617 rows)
→ Saved train_logs/part_8.parquet (54617 rows)
→ Saved train_logs/part_9.parquet (54617 rows)
→ Saved train_logs/part_10.parquet (54617 rows)
→ Saved train_logs/part_11.parquet (54617 rows)
→ Saved train_logs/part_12.parquet (54617 rows)
→ Saved train_logs/part_13.parquet (54617 rows)
→ Saved train_logs/part_14.parquet (54617 rows)
→ Saved train_logs/part_15.parquet (54617 rows)
→ Saved train_logs/part_16.parquet (54617 rows)
→ Saved train_logs/part_17.parquet (54617 rows)
→ Saved train_logs/part_18.parquet (54617 rows)
→ Saved train_logs/part_19.parquet (54617 rows)
→ Saved train_logs/part_20.parquet (54617 rows