In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [22]:
# One flow CSV per capture day, keyed by a short day label.
# The dict keeps insertion order, so iterating it goes Sat -> Thu.
file_paths = dict(
    Sat='TestbedSatJun12Flows.csv',
    Sun='TestbedSunJun13Flows.csv',
    Mon='TestbedMonJun14Flows.csv',
    Tue='TestbedTueJun15Flows.csv',
    Wed='TestbedWedJun16Flows.csv',
    Thu='TestbedThuJun17Flows.csv',
)

In [23]:
# Read every day's CSV and tag its rows with the day label, so the
# day-based split later on can recover which file a row came from.
frames = [
    pd.read_csv(csv_path).assign(day=day_label)
    for day_label, csv_path in file_paths.items()
]

# Stack the per-day frames into one table with a fresh 0..n-1 index.
all_data2012 = pd.concat(frames, ignore_index=True)
print(f"Loaded total rows: {len(all_data2012)}")

Loaded total rows: 2071657


In [24]:
# Row count per day, printed in the same order as file_paths.
rows_per_day = all_data2012['day'].value_counts()
for day in file_paths:
    # Days with no rows are reported as 0 rather than raising.
    print(day, rows_per_day.get(day, 0))

Sat 133193
Sun 275528
Mon 171380
Tue 571698
Wed 522263
Thu 397595


In [25]:
# Categorical (string-valued) flow attributes that get encoded later.
cat_cols_2012 = [
    'direction',
    'sourceTCPFlagsDescription',
    'destinationTCPFlagsDescription',
    'protocolName',
]

# Overall size and per-day row counts before any rows are dropped.
print("Rows before dropna:", len(all_data2012))
print(all_data2012['day'].value_counts())

# Number of missing values in each categorical column and in the target.
columns_to_check = cat_cols_2012 + ['Label']
missing_per_column = all_data2012[columns_to_check].isna().sum()
print("\nMissing values before drop:")
print(missing_per_column)

Rows before dropna: 2071657
day
Tue    571698
Wed    522263
Thu    397595
Sun    275528
Mon    171380
Sat    133193
Name: count, dtype: int64

Missing values before drop:
direction                              0
sourceTCPFlagsDescription         430943
destinationTCPFlagsDescription    493422
protocolName                           0
Label                                  0
dtype: int64


In [26]:
# Per-flow columns kept for modelling; the two timestamps are only
# needed to derive the flow duration.
session_2012 = [
    'totalSourceBytes',
    'totalDestinationBytes',
    'totalSourcePackets',
    'totalDestinationPackets',
    'direction',
    'sourceTCPFlagsDescription',
    'destinationTCPFlagsDescription',
    'protocolName',
    'sourcePort',
    'destinationPort',
    'startDateTime',
    'stopDateTime',
]

# Keep only those columns plus the target and the day tag, as an
# independent copy so later column assignments do not touch a view.
all_data2012 = all_data2012.loc[:, session_2012 + ['Label', 'day']].copy()

In [27]:
def _parse_timestamp_column(frame, column):
    """Parse ``frame[column]`` to datetimes and report values that fail to parse.

    Unparseable values become NaT (``errors='coerce'``). Because the duration
    feature is later filled with 0 for such rows, a format mismatch would
    otherwise go completely unnoticed, so the number of failures is printed.
    """
    raw = frame[column]
    parsed = pd.to_datetime(raw, errors='coerce')
    # Count only values that were present but could not be parsed.
    n_unparsed = int((parsed.isna() & raw.notna()).sum())
    if n_unparsed:
        print(f"Warning: {n_unparsed} values in '{column}' could not be parsed; "
              "their duration will be set to 0")
    return parsed


def _capped_ratio(numerator, denominator, cap):
    """Return ``numerator / denominator`` element-wise, capped at ``cap``.

    Zero denominators are replaced by NaN before dividing so the result never
    contains inf; every NaN result is then replaced by 0.
    NOTE(review): a row with a zero denominator therefore gets 0, not ``cap``
    -- confirm this is the intended encoding for one-directional flows.
    """
    ratio = numerator / denominator.replace(0, np.nan)
    return ratio.fillna(0).clip(upper=cap)


# Flow duration in seconds; missing/unparseable timestamps give 0 and
# negative durations (stop before start) are clipped to 0.
start_time = _parse_timestamp_column(all_data2012, 'startDateTime')
stop_time = _parse_timestamp_column(all_data2012, 'stopDateTime')
all_data2012['duration'] = (
    (stop_time - start_time).dt.total_seconds().fillna(0).clip(lower=0)
)
# The raw timestamps are not used as features once the duration is derived.
all_data2012 = all_data2012.drop(columns=['startDateTime', 'stopDateTime'])

# Source-to-destination packet and byte ratios, capped at 10.
all_data2012['packet_ratio'] = _capped_ratio(
    all_data2012['totalSourcePackets'], all_data2012['totalDestinationPackets'], cap=10)
all_data2012['byte_ratio'] = _capped_ratio(
    all_data2012['totalSourceBytes'], all_data2012['totalDestinationBytes'], cap=10)

# Average bytes per packet in each direction, capped at 1500.
all_data2012['avg_packet_size_src'] = _capped_ratio(
    all_data2012['totalSourceBytes'], all_data2012['totalSourcePackets'], cap=1500)
all_data2012['avg_packet_size_dst'] = _capped_ratio(
    all_data2012['totalDestinationBytes'], all_data2012['totalDestinationPackets'], cap=1500)

In [28]:
# Preview the first five rows to sanity-check the engineered columns.
all_data2012.head(n=5)

Unnamed: 0,totalSourceBytes,totalDestinationBytes,totalSourcePackets,totalDestinationPackets,direction,sourceTCPFlagsDescription,destinationTCPFlagsDescription,protocolName,sourcePort,destinationPort,Label,day,duration,packet_ratio,byte_ratio,avg_packet_size_src,avg_packet_size_dst
0,128,64,2,1,L2R,"F,A","F,A",tcp_ip,22441,80,Normal,Sat,0.0,2.0,2.0,64.0,64.0
1,128,64,2,1,L2R,"F,A","F,A",tcp_ip,22445,80,Normal,Sat,0.0,2.0,2.0,64.0,64.0
2,128,64,2,1,L2R,"F,A","F,A",tcp_ip,22444,80,Normal,Sat,0.0,2.0,2.0,64.0,64.0
3,2938,49570,27,76,L2L,"S,R,P,A","S,P,A",tcp_ip,3248,22,Normal,Sat,0.0,0.355263,0.05927,108.814815,652.236842
4,644,2315,7,5,L2R,"S,P,A","S,P,A",tcp_ip,1867,80,Normal,Sat,0.0,1.4,0.278186,92.0,463.0


In [29]:
# Show the dtype of every column, to see which ones are still
# non-numeric (object) and need encoding before model fitting.
all_data2012.dtypes

totalSourceBytes                    int64
totalDestinationBytes               int64
totalSourcePackets                  int64
totalDestinationPackets             int64
direction                          object
sourceTCPFlagsDescription          object
destinationTCPFlagsDescription     object
protocolName                       object
sourcePort                          int64
destinationPort                     int64
Label                              object
day                                object
duration                          float64
packet_ratio                      float64
byte_ratio                        float64
avg_packet_size_src               float64
avg_packet_size_dst               float64
dtype: object

In [30]:
# Categorical columns whose missing entries get an explicit placeholder.
cat_cols_2012 = [
    'direction',
    'sourceTCPFlagsDescription',
    'destinationTCPFlagsDescription',
    'protocolName',
]

# Rows without a target label are discarded; missing categorical values
# are kept as their own "UNKNOWN" category instead of being dropped.
unknown_fill = {column: "UNKNOWN" for column in cat_cols_2012}
all_data2012 = all_data2012.dropna(subset=['Label']).fillna(unknown_fill)

In [31]:
# One encoder per categorical column, kept so the integer codes can be
# mapped back to the original strings later.
# NOTE(review): the encoders are fitted on all days, including the days
# later held out for testing/evaluation -- confirm this is acceptable.
label_encoders = {column: LabelEncoder() for column in cat_cols_2012}

# Replace each categorical column with its integer codes in place.
for column, encoder in label_encoders.items():
    all_data2012[column] = encoder.fit_transform(all_data2012[column])

In [32]:
# Split by capture day: four days for training, one for testing and
# one for the final evaluation.
train_days = ['Sat', 'Sun', 'Mon', 'Tue']
test_day = 'Wed'
eval_day = 'Thu'

day_of_row = all_data2012['day']
train_df = all_data2012.loc[day_of_row.isin(train_days)]
test_df = all_data2012.loc[day_of_row == test_day]
eval_df = all_data2012.loc[day_of_row == eval_day]

# The target and the day tag are not model inputs.
non_feature_columns = ['Label', 'day']

X_train, y_train = train_df.drop(columns=non_feature_columns), train_df['Label']
X_test, y_test = test_df.drop(columns=non_feature_columns), test_df['Label']
X_eval, y_eval = eval_df.drop(columns=non_feature_columns), eval_df['Label']

In [33]:
# Random forest with a fixed seed for repeatable fits; depth is limited
# to 15 and class weighting is set to 'balanced'.
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
)
clf.fit(X_train, y_train)

In [34]:
# Score the test day (Wed) with the fitted forest.
y_pred = clf.predict(X_test)

# Pin the class order to the classes the model was trained on instead of
# relying on hard-coded names. This keeps the confusion-matrix rows/columns
# and the report rows aligned with their labels even when a class is absent
# from this day's true labels and predictions.
class_labels = list(clf.classes_)
class_names = [str(label) for label in class_labels]

print("Test Set Performance (Wed):")
print(confusion_matrix(y_test, y_pred, labels=class_labels))
print(classification_report(y_test, y_pred, labels=class_labels, target_names=class_names))

Test Set Performance (Wed):
[[    11      0]
 [    46 522206]]
              precision    recall  f1-score   support

      Attack       0.19      1.00      0.32        11
      Normal       1.00      1.00      1.00    522252

    accuracy                           1.00    522263
   macro avg       0.60      1.00      0.66    522263
weighted avg       1.00      1.00      1.00    522263



In [35]:
# Score the final evaluation day (Thu) with the fitted forest.
y_eval_pred = clf.predict(X_eval)

# Pin the class order to the classes the model was trained on instead of
# relying on hard-coded names. This keeps the confusion-matrix rows/columns
# and the report rows aligned with their labels even when a class is absent
# from this day's true labels and predictions.
class_labels = list(clf.classes_)
class_names = [str(label) for label in class_labels]

print("\nFinal Evaluation (Thu):")
print(confusion_matrix(y_eval, y_eval_pred, labels=class_labels))
print(classification_report(y_eval, y_eval_pred, labels=class_labels, target_names=class_names))


Final Evaluation (Thu):
[[   546   4673]
 [    46 392330]]
              precision    recall  f1-score   support

      Attack       0.92      0.10      0.19      5219
      Normal       0.99      1.00      0.99    392376

    accuracy                           0.99    397595
   macro avg       0.96      0.55      0.59    397595
weighted avg       0.99      0.99      0.98    397595

