In [7]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [8]:
data_path = 'C:/Users/vipul/Downloads/intrudtion'

#List of parquet files
parquet_files = [
    "Benign-Monday-no-metadata.parquet",
    "Botnet-Friday-no-metadata.parquet",
    "Bruteforce-Tuesday-no-metadata.parquet",
    "DDoS-Friday-no-metadata.parquet",
    "DoS-Wednesday-no-metadata.parquet",
    "Infiltration-Thursday-no-metadata.parquet",
    "Portscan-Friday-no-metadata.parquet",
    "WebAttacks-Thursday-no-metadata.parquet"
]

dataframes = []
for file in parquet_files:
    file_path = os.path.join(data_path,file)
    df = pd.read_parquet(file_path)
    dataframes.append(df)

In [31]:
full_df = pd.concat(dataframes , ignore_index=True)
full_df.shape

(2313810, 78)

In [32]:
full_df.head()

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,6,4,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
1,6,1,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
2,6,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
3,6,1,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
4,6,609,7,4,484,414,233,0,69.14286,111.967896,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign


In [33]:
null_counts = full_df.isna().sum()
null_counts

Protocol                    0
Flow Duration               0
Total Fwd Packets           0
Total Backward Packets      0
Fwd Packets Length Total    0
                           ..
Idle Mean                   0
Idle Std                    0
Idle Max                    0
Idle Min                    0
Label                       0
Length: 78, dtype: int64

In [34]:
null_counts[null_counts != 0]

Series([], dtype: int64)

In [35]:
#drop duplicate values - 
full_df = full_df.drop_duplicates()

#Drop null values
full_df = full_df.dropna()

In [36]:
full_df.shape

(2231806, 78)

In [37]:
full_df.head()

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,6,4,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
1,6,1,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
2,6,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
3,6,1,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
4,6,609,7,4,484,414,233,0,69.14286,111.967896,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign


In [38]:
full_df.columns

Index(['Protocol', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Fwd Packets Length Total',
       'Bwd Packets Length Total', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags',
       'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Packet Length Min', 'Packet Length Max', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
       'SYN Flag Count', 'RST Fla

In [39]:
full_df['Label'].value_counts()

Label
Benign                        1895314
DoS Hulk                       172846
DDoS                           128014
DoS GoldenEye                   10286
FTP-Patator                      5931
DoS slowloris                    5385
DoS Slowhttptest                 5228
SSH-Patator                      3219
PortScan                         1956
Web Attack � Brute Force         1470
Bot                              1437
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
Name: count, dtype: int64

In [40]:
full_df['Label'].value_counts()/len(full_df)

Label
Benign                        0.849229
DoS Hulk                      0.077447
DDoS                          0.057359
DoS GoldenEye                 0.004609
FTP-Patator                   0.002657
DoS slowloris                 0.002413
DoS Slowhttptest              0.002342
SSH-Patator                   0.001442
PortScan                      0.000876
Web Attack � Brute Force      0.000659
Bot                           0.000644
Web Attack � XSS              0.000292
Infiltration                  0.000016
Web Attack � Sql Injection    0.000009
Heartbleed                    0.000005
Name: count, dtype: float64

In [41]:
full_df['Label'] = full_df['Label'].str.lower()
full_df['Label'] = full_df['Label'].str.strip()

In [42]:
full_df['Label'].value_counts()

Label
benign                        1895314
dos hulk                       172846
ddos                           128014
dos goldeneye                   10286
ftp-patator                      5931
dos slowloris                    5385
dos slowhttptest                 5228
ssh-patator                      3219
portscan                         1956
web attack � brute force         1470
bot                              1437
web attack � xss                  652
infiltration                       36
web attack � sql injection         21
heartbleed                         11
Name: count, dtype: int64

In [43]:
full_df['Label'] = full_df['Label'].apply(lambda x : x if x in ['benign','dos hulk','ddos','dos goldeneye'] else 'other attack')

In [44]:
full_df['Label'].value_counts()

Label
benign           1895314
dos hulk          172846
ddos              128014
other attack       25346
dos goldeneye      10286
Name: count, dtype: int64

In [45]:
full_df.head(3)

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,6,4,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,benign
1,6,1,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,benign
2,6,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,benign


In [46]:
X = full_df.drop(columns = 'Label')
y = full_df['Label']

In [47]:
xtrain, xtest, ytrain, ytest = train_test_split(X,y, train_size=0.8)

scaler = StandardScaler()

scaler.fit(xtrain)

xtrainscaled = scaler.transform(xtrain)
xtestscaled = scaler.transform(xtest)

In [48]:
model_log = LogisticRegression(max_iter=500)
model_log.fit(xtrainscaled , ytrain)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [49]:
ytrainPred = model_log.predict(xtrain)
ytestPred = model_log.predict(xtest)

In [50]:
print("Accuracy on training data", accuracy_score(ytrain, ytrainPred))
print("Accuracy on test data", accuracy_score(ytest, ytestPred))

Accuracy on training data 0.9383934752364118
Accuracy on test data 0.9385319538849634


In [51]:
print("Classification report training data-")
print(classification_report(ytrain, ytrainPred))

Classification report training data-
               precision    recall  f1-score   support

       benign       0.96      0.97      0.97   1516514
         ddos       0.73      0.78      0.75    102248
dos goldeneye       0.75      0.69      0.72      8141
     dos hulk       0.93      0.80      0.86    138346
 other attack       0.22      0.23      0.22     20195

     accuracy                           0.94   1785444
    macro avg       0.72      0.69      0.70   1785444
 weighted avg       0.94      0.94      0.94   1785444



In [52]:
print("Classification report test data-")
print(classification_report(ytest, ytestPred))

Classification report test data-
               precision    recall  f1-score   support

       benign       0.96      0.97      0.97    378800
         ddos       0.73      0.78      0.76     25766
dos goldeneye       0.75      0.69      0.72      2145
     dos hulk       0.93      0.81      0.86     34500
 other attack       0.22      0.22      0.22      5151

     accuracy                           0.94    446362
    macro avg       0.72      0.69      0.71    446362
 weighted avg       0.94      0.94      0.94    446362



In [56]:
xtest.values[100]

array([6.00000000e+00, 6.19218830e+07, 1.40000000e+01, 1.40000000e+01,
       1.03200000e+03, 5.98400000e+03, 4.37000000e+02, 0.00000000e+00,
       7.37142868e+01, 1.43973511e+02, 1.46000000e+03, 0.00000000e+00,
       4.27428558e+02, 6.04990723e+02, 1.13304048e+02, 4.52182631e-01,
       2.29340300e+06, 4.17294075e+06, 1.00000000e+07, 1.00000000e+00,
       6.19000000e+07, 4.76322200e+06, 5.15501450e+06, 1.02000000e+07,
       3.84000000e+02, 6.17000000e+07, 4.74863900e+06, 5.20646950e+06,
       1.02000000e+07, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 2.92000000e+02, 3.52000000e+02,
       2.26091310e-01, 2.26091310e-01, 0.00000000e+00, 1.46000000e+03,
       2.41931030e+02, 4.61521790e+02, 2.13002359e+05, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
       2.50571426e+02, 7.37142868e+01, 4.27428558e+02, 0.00000000e+00,
      

In [57]:
print(model_log.predict([xtest.values[100]]))

['benign']


