In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Input, Dropout, BatchNormalization
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from sklearn.utils import class_weight
from tensorflow.keras import layers, models

In [2]:
data = pd.read_csv('Trojan_Detection.csv')

In [3]:
from sklearn.preprocessing import LabelEncoder
categorical_columns = ['Flow ID', ' Source IP', ' Destination IP', ' Timestamp'] 
label_encoder = LabelEncoder()
for column in categorical_columns:
    data[column] = label_encoder.fit_transform(data[column])

In [4]:
data['Class'] = data['Class'].map({'Benign': 0, 'Trojan': 1})

In [5]:
data.head()

Unnamed: 0.1,Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Class
0,73217,46111,7,49975,352,80,6,36269,10743584,4,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,72089,74905,7,49169,895,443,17,39241,254217,6,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,96676,9217,7,37749,7,53,17,42069,1023244,1,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,42891,10418,7,41352,7,53,17,29885,286483,1,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,169326,20763,5,44353,220,443,6,16589,65633087,12,...,32,322594.0,0.0,322594.0,322594.0,60306983.0,0.0,60306983.0,60306983.0,0


In [6]:
# Drop columns not used for training
X = data.drop(columns=['Class'])
y = data['Class']

In [7]:
# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [8]:
# Remove constant features
constant_filter = VarianceThreshold(threshold=0)
X_filtered = constant_filter.fit_transform(X_scaled)

In [9]:
k_best_features = 16  # Example: 16 is a perfect square (4x4)

# Feature selection using SelectKBest
selector = SelectKBest(score_func=f_classif, k=k_best_features)
X_selected = selector.fit_transform(X_filtered, y)

In [10]:
# Get the selected feature indices
selected_feature_indices = selector.get_support(indices=True)
selected_features = X.columns[selected_feature_indices]

In [11]:
# Print selected features
print("Selected features:", selected_features)

Selected features: Index(['Unnamed: 0', 'Flow ID', ' Destination IP', ' Timestamp',
       ' Flow Duration', ' Fwd Packet Length Max', 'Fwd IAT Total',
       ' Fwd IAT Max', ' Fwd Header Length', ' Bwd Header Length',
       ' Fwd Avg Bulk Rate', ' Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate',
       'Subflow Fwd Packets', ' Subflow Bwd Packets', ' Subflow Bwd Bytes'],
      dtype='object')


In [12]:
# Reshape the data for CNN
image_size = int(np.sqrt(k_best_features))  # Assuming we want a square "image"
X_reshaped = X_selected.reshape(X_selected.shape[0], image_size, image_size, 1)

In [13]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y, test_size=0.2, random_state=42)

In [14]:
# Convert labels to categorical
y_train_cat = to_categorical(y_train)
y_test_cat = to_categorical(y_test)

In [25]:
model = Sequential([
    Input(shape=(image_size, image_size, 1)),
    Conv2D(16, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax')
])


In [26]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [27]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Fit the model
model.fit(X_train, y_train_cat, epochs=10, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

Epoch 1/10
[1m3550/3550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - accuracy: 0.8926 - loss: 0.2584 - val_accuracy: 0.9748 - val_loss: 0.0769
Epoch 2/10
[1m3550/3550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.9725 - loss: 0.0807 - val_accuracy: 0.9832 - val_loss: 0.0499
Epoch 3/10
[1m3550/3550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.9803 - loss: 0.0594 - val_accuracy: 0.9870 - val_loss: 0.0394
Epoch 4/10
[1m3550/3550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.9838 - loss: 0.0509 - val_accuracy: 0.9883 - val_loss: 0.0346
Epoch 5/10
[1m3550/3550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - accuracy: 0.9850 - loss: 0.0449 - val_accuracy: 0.9892 - val_loss: 0.0325
Epoch 6/10
[1m3550/3550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.9870 - loss: 0.0408 - val_accuracy: 0.9907 - val_loss: 0.0253
Epoch 7/10
[1m

<keras.src.callbacks.history.History at 0x22f61c33b50>

In [169]:
test_loss, test_accuracy = model.evaluate(X_test, y_test_cat)
print("Test Accuracy:", test_accuracy)

[1m1110/1110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9898 - loss: 0.0277
Test Accuracy: 0.9898582696914673
