In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix

from xgboost import XGBClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical

# Load dataset
df = pd.read_csv("CICIDS2017_2018_Merged_Fuzzy.csv")
df.columns = df.columns.str.strip()

# Clean labels
df['Label'] = df['Label'].astype(str).str.strip().str.lower()
df['BinaryLabel'] = df['Label'].apply(lambda x: 'benign' if 'benign' in x else 'malicious')

# Drop non-feature columns and select numeric features
X = df.drop(columns=['Label', 'BinaryLabel'], errors='ignore')
X = X.select_dtypes(include=[np.number])

# Check if features are valid
if X.shape[1] == 0:
    raise ValueError("No numeric features found after dropping label columns.")

# Handle infinite and NaN values
X.replace([np.inf, -np.inf], np.nan, inplace=True)
X.fillna(X.mean(), inplace=True)

# Encode binary labels
le_bin = LabelEncoder()
y_binary = le_bin.fit_transform(df['BinaryLabel'])

# Stage 1: Random Forest for binary classification
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y_binary)

# Feature importance and top 30 feature selection
feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
top_30_features = feature_importances.sort_values(ascending=False).head(30).index.tolist()
X_selected = X[top_30_features]

# Plot top 30 features
plt.figure(figsize=(12, 6))
sns.barplot(x=feature_importances[top_30_features], y=top_30_features)
plt.title("Top 30 Features by Random Forest Importance")
plt.tight_layout()
plt.xticks(rotation=45)
plt.grid(True)
plt.show()

# Stage 1 Train/Test split
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X_selected, y_binary, test_size=0.3, random_state=42)

# Train and evaluate RF binary classifier
rf.fit(X_train_rf, y_train_rf)
y_pred_rf = rf.predict(X_test_rf)
print("Stage 1 (Binary Classification) Results:")
print(classification_report(y_test_rf, y_pred_rf))
print("Confusion Matrix:")
print(confusion_matrix(y_test_rf, y_pred_rf))

# Stage 2: LSTM + XGBoost (Multiclass classification on malicious samples only)
malicious_df = df[df['BinaryLabel'] == 'malicious'].copy()
malicious_X = X.loc[malicious_df.index, top_30_features]
malicious_y = malicious_df['Label']

# Encode multiclass labels
le_multi = LabelEncoder()
y_multi = le_multi.fit_transform(malicious_y)

# Scale features for LSTM
scaler = MinMaxScaler()
malicious_X_scaled = scaler.fit_transform(malicious_X)

# Reshape for LSTM (samples, time_steps=1, features)
X_lstm = malicious_X_scaled.reshape(-1, 1, len(top_30_features))

# Convert labels to categorical for LSTM
y_lstm_cat = to_categorical(y_multi)

# LSTM model
model = Sequential([
    LSTM(64, input_shape=(1, len(top_30_features)), return_sequences=False),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(y_lstm_cat.shape[1], activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train LSTM
model.fit(X_lstm, y_lstm_cat, epochs=3, batch_size=512, verbose=1)

# Extract LSTM output as features
intermediate_output = model.predict(X_lstm)

# Split and train XGBoost
X_train_xgb, X_test_xgb, y_train_xgb, y_test_xgb = train_test_split(intermediate_output, y_multi, test_size=0.3, random_state=42)

xgb = XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='mlogloss')
xgb.fit(X_train_xgb, y_train_xgb)
y_pred_xgb = xgb.predict(X_test_xgb)

# Stage 2 Evaluation
print("\nStage 2 (Multiclass on Malicious Only - LSTM + XGBoost):")
print(classification_report(y_test_xgb, y_pred_xgb, target_names=le_multi.classes_))
print("Confusion Matrix:")
print(confusion_matrix(y_test_xgb, y_pred_xgb))


  df = pd.read_csv("CICIDS2017_2018_Merged_Fuzzy.csv")


ValueError: No numeric features found after dropping label columns.

In [3]:
import pandas as pd

df = pd.read_csv("CICIDS2017_2018_Merged_Fuzzy.csv")
print("✅ Dataset loaded. Shape:", df.shape)
df.head()

  df = pd.read_csv("CICIDS2017_2018_Merged_Fuzzy.csv")


✅ Dataset loaded. Shape: (19002666, 52)


Unnamed: 0,Flow Duration,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,BinaryLabel
0,3,4000000.0,666666.6667,3.0,0.0,3,3,3,3.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0,0,benign,Benign
1,109,110091.7431,18348.62385,109.0,0.0,109,109,0,0.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0,0,benign,Benign
2,52,230769.2308,38461.53846,52.0,0.0,52,52,0,0.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0,0,benign,Benign
3,34,352941.1765,58823.52941,34.0,0.0,34,34,0,0.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0,0,benign,Benign
4,3,4000000.0,666666.6667,3.0,0.0,3,3,3,3.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0,0,benign,Benign


In [4]:
import pandas as pd

df = pd.read_csv("C:/Users/GPU RTX 5000/Desktop/Major Project Dataset/major/major_merge/New folder/cicids2017_merged.csv")
print("✅ Dataset loaded. Shape:", df.shape)
df.head()

✅ Dataset loaded. Shape: (2830743, 79)


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,54865,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,55054,109,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,55055,52,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,46236,34,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,54863,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [5]:
import pandas as pd

df = pd.read_csv("C:/Users/GPU RTX 5000/Desktop/Major Project Dataset/major/major_merge/New folder/cicids2018_merged.csv")
print("✅ Dataset loaded. Shape:", df.shape)
df.head()

  df = pd.read_csv("C:/Users/GPU RTX 5000/Desktop/Major Project Dataset/major/major_merge/New folder/cicids2018_merged.csv")


✅ Dataset loaded. Shape: (16233002, 84)


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Flow ID,Src IP,Src Port,Dst IP
0,0,0,14/02/2018 08:31:01,112641719,3,0,0,0,0,0,...,0,56320859.5,139.300036,56320958,56320761,Benign,,,,
1,0,0,14/02/2018 08:33:50,112641466,3,0,0,0,0,0,...,0,56320733.0,114.551299,56320814,56320652,Benign,,,,
2,0,0,14/02/2018 08:36:39,112638623,3,0,0,0,0,0,...,0,56319311.5,301.934596,56319525,56319098,Benign,,,,
3,22,6,14/02/2018 08:40:13,6453966,15,10,1239,2273,744,0,...,0,0.0,0.0,0,0,Benign,,,,
4,22,6,14/02/2018 08:40:23,8804066,14,11,1143,2209,744,0,...,0,0.0,0.0,0,0,Benign,,,,


In [2]:
import pandas as pd
import numpy as np
from rapidfuzz import process, fuzz

# Load both datasets
df_2017 = pd.read_csv("C:/Users/GPU RTX 5000/Desktop/Major Project Dataset/major/major_merge/New folder/cicids2017_merged.csv")
df_2018 = pd.read_csv("C:/Users/GPU RTX 5000/Desktop/Major Project Dataset/major/major_merge/New folder/cicids2018_merged.csv")

# Normalize column names
df_2017.columns = df_2017.columns.str.strip().str.lower()
df_2018.columns = df_2018.columns.str.strip().str.lower()

# Get column lists
cols_2017 = df_2017.columns.tolist()
cols_2018 = df_2018.columns.tolist()

# Threshold for fuzzy matching
threshold = 90

# Fuzzy match 2017 columns to 2018 columns
matched_cols = {}
used_2018_cols = set()

for col_2017 in cols_2017:
    match, score, _ = process.extractOne(col_2017, cols_2018, scorer=fuzz.token_sort_ratio)
    if score >= threshold and match not in used_2018_cols:
        matched_cols[col_2017] = match
        used_2018_cols.add(match)

# Merge columns with similar meaning
df_2017_renamed = df_2017.rename(columns=matched_cols)

# Create full union of columns
all_columns = sorted(set(df_2017_renamed.columns).union(set(df_2018.columns)))

# Reindex both DataFrames to have all columns (NaN where missing)
df_2017_final = df_2017_renamed.reindex(columns=all_columns)
df_2018_final = df_2018.reindex(columns=all_columns)

# Concatenate the merged DataFrames
df_merged = pd.concat([df_2017_final, df_2018_final], ignore_index=True)

# Save the final merged dataset
df_merged.to_csv("CICIDS2017_2018_Merged_Fuzzy.csv", index=False)
print("Merged dataset saved as CICIDS2017_2018_Merged_Fuzzy.csv")


  df_2018 = pd.read_csv("C:/Users/GPU RTX 5000/Desktop/Major Project Dataset/major/major_merge/New folder/cicids2018_merged.csv")


Merged dataset saved as CICIDS2017_2018_Merged_Fuzzy.csv


In [1]:
import pandas as pd

df = pd.read_csv("CICIDS2017_2018_Merged_Fuzzy.csv")
print("✅ Dataset loaded. Shape:", df.shape)
df.head()

  df = pd.read_csv("CICIDS2017_2018_Merged_Fuzzy.csv")


✅ Dataset loaded. Shape: (19063745, 118)


Unnamed: 0,ack flag cnt,act_data_pkt_fwd,active max,active mean,active min,active std,average packet size,avg bwd segment size,avg fwd segment size,bwd avg bytes/bulk,...,timestamp,tot bwd pkts,tot fwd pkts,total backward packets,total fwd packets,total length of bwd packets,total length of fwd packets,totlen bwd pkts,totlen fwd pkts,urg flag cnt
0,1,1.0,0,0.0,0,0.0,9.0,0.0,6.0,0.0,...,,,,0.0,2.0,0.0,12.0,,,0
1,1,0.0,0,0.0,0,0.0,9.0,6.0,6.0,0.0,...,,,,1.0,1.0,6.0,6.0,,,1
2,1,0.0,0,0.0,0,0.0,9.0,6.0,6.0,0.0,...,,,,1.0,1.0,6.0,6.0,,,1
3,1,0.0,0,0.0,0,0.0,9.0,6.0,6.0,0.0,...,,,,1.0,1.0,6.0,6.0,,,1
4,1,1.0,0,0.0,0,0.0,9.0,0.0,6.0,0.0,...,,,,0.0,2.0,0.0,12.0,,,0
