In [7]:
import pandas as pd
import numpy as np

In [8]:
# Load the flow records from a CSV addressed relative to the notebook's working directory.
data = pd.read_csv("TrojanDetection.csv")

In [9]:
# prompt: drop columns which have more than 80% data as zero
# NOTE: the cutoff actually applied below is 60%, not the 80% named in the prompt.

# Share of exact-zero entries in every column, expressed as a percentage.
zeros_percent = data.eq(0).mean(axis=0) * 100

# Names of the columns whose zero share is above the 60% cutoff.
drop_columns = list(zeros_percent.index[zeros_percent > 60])

# Build a new frame without those columns; `data` itself is not modified.
data1 = data.drop(columns=drop_columns)

# Preview the reduced frame.
data1.head()

Unnamed: 0.1,Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,...,Avg Bwd Segment Size,Fwd Header Length.1,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,min_seg_size_forward,Class
0,73217,10.42.0.42-121.14.255.84-49975-80-6,10.42.0.42,49975,121.14.255.84,80,6,17-07-2017 01:18,10743584,4,...,168.0,100,4,372,4,672,65535,511,20,Trojan
1,72089,172.217.6.226-10.42.0.42-443-49169-17,10.42.0.42,49169,172.217.6.226,443,17,17-07-2017 10:25,254217,6,...,749.428571,204,6,3191,7,5246,-1,-1,20,Trojan
2,96676,10.42.0.1-10.42.0.42-53-37749-17,10.42.0.42,37749,10.42.0.1,53,17,30-06-2017 07:16,1023244,1,...,179.0,32,1,30,1,179,-1,-1,32,Benign
3,42891,10.42.0.1-10.42.0.42-53-41352-17,10.42.0.42,41352,10.42.0.1,53,17,13-07-2017 03:48,286483,1,...,106.0,20,1,40,1,106,-1,-1,20,Trojan
4,169326,10.42.0.151-107.22.241.77-44353-443-6,10.42.0.151,44353,107.22.241.77,443,6,05-07-2017 10:47,65633087,12,...,562.2,392,12,767,10,5622,65535,79,32,Benign


In [10]:
from sklearn.preprocessing import LabelEncoder

# String-valued identifier columns to turn into integer codes.
# The leading spaces are part of the column names as they appear in the CSV header.
categorical_columns = ['Flow ID', ' Source IP', ' Destination IP', ' Timestamp']
label_encoder = LabelEncoder()

for column in categorical_columns:
    # Encode from data1 itself: `data2` does not exist yet when this cell runs
    # top-to-bottom on a fresh kernel, so reading it here raised NameError.
    # The encoder is re-fitted per column, so codes are independent between columns.
    # Plain column assignment replaces the column, giving it the encoder's integer dtype.
    data1[column] = label_encoder.fit_transform(data1[column])

In [13]:
# Normalise the label text to lower-case strings (this rewrites data1['Class'] in place).
data1['Class'] = data1['Class'].astype(str).str.lower()

# Continue on an independent copy so later edits do not touch data1.
data2 = data1.copy()

# Binary target: 1 where the label is 'trojan', 0 for every other value.
data2['Class'] = data2['Class'].eq('trojan').astype('int64')
data2.head()

Unnamed: 0.1,Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,...,Avg Bwd Segment Size,Fwd Header Length.1,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,min_seg_size_forward,Class
0,73217,46111,7,49975,352,80,6,1967,10743584,4,...,168.0,100,4,372,4,672,65535,511,20,1
1,72089,74905,7,49169,895,443,17,2114,254217,6,...,749.428571,204,6,3191,7,5246,-1,-1,20,1
2,96676,9217,7,37749,7,53,17,2222,1023244,1,...,179.0,32,1,30,1,179,-1,-1,32,0
3,42891,10418,7,41352,7,53,17,1641,286483,1,...,106.0,20,1,40,1,106,-1,-1,20,1
4,169326,20763,5,44353,220,443,6,1011,65633087,12,...,562.2,392,12,767,10,5622,65535,79,32,0


In [14]:
# Target vector: the binary Class column.
y = data2['Class']

# Feature matrix: every column except the target.
# NOTE(review): this keeps 'Unnamed: 0' and the label-encoded identifiers
# (Flow ID, IPs, Timestamp) as features — possible label leakage; confirm they
# should be model inputs.
X = data2.drop(columns='Class')

In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

In [16]:
# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled,columns = scaler.get_feature_names_out())

X_scaled.head()

Unnamed: 0.1,Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,...,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Header Length.1,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,min_seg_size_forward
0,-0.302988,-0.254681,-0.301851,0.615201,-0.551042,-0.37499,-0.505654,1.255342,-0.038628,-0.092527,...,0.166762,-0.041883,-0.110092,-0.092527,-0.072645,-0.055528,-0.063037,1.403291,-0.153687,-1.172879
1,-0.325005,0.846472,-0.301851,0.571481,0.048341,-0.351039,1.916993,1.489244,-0.494134,-0.025086,...,3.218552,1.776236,0.007339,-0.025086,0.392864,-0.027333,-0.032898,-0.743407,-0.221689,-1.172879
2,0.154885,-1.665597,-0.301851,-0.047966,-0.931866,-0.376772,1.916993,1.66109,-0.460739,-0.193689,...,-0.27136,-0.007486,-0.186874,-0.193689,-0.129121,-0.083723,-0.066285,-0.743407,-0.221689,0.631082
3,-0.894893,-1.619668,-0.301851,0.147469,-0.931866,-0.376772,1.916993,0.73662,-0.492733,-0.193689,...,-0.201817,-0.235756,-0.200423,-0.193689,-0.127469,-0.083723,-0.066766,-0.743407,-0.221689,-1.172879
4,1.572871,-1.22405,-0.307995,0.31025,-0.696749,-0.351039,-0.505654,-0.265817,2.34498,0.177238,...,-0.035493,1.190775,0.219619,0.177238,-0.007418,0.000862,-0.030421,1.403291,-0.211064,0.631082


In [17]:
# Drop features whose variance is exactly zero (i.e. constant columns).
constant_filter = VarianceThreshold(threshold=0)
X_filtered = pd.DataFrame(
    constant_filter.fit_transform(X_scaled),
    columns=constant_filter.get_feature_names_out(),
)

X_filtered.head()

Unnamed: 0.1,Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,...,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Header Length.1,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,min_seg_size_forward
0,-0.302988,-0.254681,-0.301851,0.615201,-0.551042,-0.37499,-0.505654,1.255342,-0.038628,-0.092527,...,0.166762,-0.041883,-0.110092,-0.092527,-0.072645,-0.055528,-0.063037,1.403291,-0.153687,-1.172879
1,-0.325005,0.846472,-0.301851,0.571481,0.048341,-0.351039,1.916993,1.489244,-0.494134,-0.025086,...,3.218552,1.776236,0.007339,-0.025086,0.392864,-0.027333,-0.032898,-0.743407,-0.221689,-1.172879
2,0.154885,-1.665597,-0.301851,-0.047966,-0.931866,-0.376772,1.916993,1.66109,-0.460739,-0.193689,...,-0.27136,-0.007486,-0.186874,-0.193689,-0.129121,-0.083723,-0.066285,-0.743407,-0.221689,0.631082
3,-0.894893,-1.619668,-0.301851,0.147469,-0.931866,-0.376772,1.916993,0.73662,-0.492733,-0.193689,...,-0.201817,-0.235756,-0.200423,-0.193689,-0.127469,-0.083723,-0.066766,-0.743407,-0.221689,-1.172879
4,1.572871,-1.22405,-0.307995,0.31025,-0.696749,-0.351039,-0.505654,-0.265817,2.34498,0.177238,...,-0.035493,1.190775,0.219619,0.177238,-0.007418,0.000862,-0.030421,1.403291,-0.211064,0.631082


In [18]:
from sklearn.feature_selection import SelectKBest,f_classif

# Number of top-scoring features to keep.
k_best_features = 20

# Score each feature against the target with the ANOVA F-test (f_classif)
# and keep the k best ones.
selector = SelectKBest(score_func=f_classif, k=k_best_features).fit(X_filtered, y)
X_selected = selector.transform(X_filtered)

# Integer positions of the kept features within X_filtered's columns.
selected_indices = selector.get_support(indices=True)


In [19]:
# Integer positions of the features SelectKBest kept.
selected_indices = selector.get_support(indices=True)

# Get the names of the selected features.
# The selector was fitted on X_filtered, so its indices are positions in
# X_filtered's columns. Looking them up in X.columns would give wrong names
# whenever the variance filter removed a column ahead of a selected one.
selected_features = X_filtered.columns[selected_indices]
print("Selected Features:", selected_features)

Selected Features: Index(['Unnamed: 0', 'Flow ID', ' Source IP', ' Destination IP', ' Timestamp',
       ' Flow Duration', ' Fwd Packet Length Max', ' Fwd Packet Length Mean',
       'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Std', ' Flow IAT Max',
       'Fwd IAT Total', ' Fwd IAT Std', ' Fwd IAT Max', ' Bwd Packets/s',
       ' Down/Up Ratio', ' Avg Fwd Segment Size', 'Init_Win_bytes_forward',
       ' min_seg_size_forward'],
      dtype='object')


In [20]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 50 trees and a fixed seed, fitted on the training split.
rf_classifier = RandomForestClassifier(
    n_estimators=50,
    random_state=42,
)
rf_classifier.fit(X_train, y_train)

In [19]:
# Predict class labels for the held-out test rows with the fitted forest.
y_pred = rf_classifier.predict(X_test)


In [20]:
# Evaluate the model on the held-out test split.
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(y_test, y_pred)

# The target is encoded as 0 / 1 (1 == 'trojan'), so name the classes explicitly;
# otherwise the report rows are printed as bare "0" and "1".
# `labels` pins the row order so the names cannot be attached to the wrong class.
report = classification_report(
    y_test,
    y_pred,
    labels=[0, 1],
    target_names=["Benign", "Trojan"],
)

# NOTE(review): a perfect score here would be suspicious — check for leakage from
# row-index / identifier features before trusting the numbers.
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

      Benign       1.00      1.00      1.00     26074
      Trojan       1.00      1.00      1.00     27171

    accuracy                           1.00     53245
   macro avg       1.00      1.00      1.00     53245
weighted avg       1.00      1.00      1.00     53245

