In [1]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

### **1Ô∏è‚É£ Load Data**

In [2]:
df = pd.read_csv("cleaned_network_data.csv")

### **2Ô∏è‚É£ Explore Data**

In [3]:
df.head()

Unnamed: 0,Dst Port,Protocol,PacketSize,PacketCount,PayloadSize,FlowDirection,TcpFlags,IsMalicious
0,443,6,975533800.0,97,254.470588,7812.5,0,0
1,49684,6,21380.0,21,19.0,7812.5,110,0
2,443,6,1115109000000.0,1115,430.111111,7812.5,0,0
3,443,6,2000.0,20,0.0,7812.5,10,0
4,443,6,91312860000.0,913,322.869565,7812.5,0,0


In [4]:
def full_report(df):
    dtypes = df.dtypes
    n_unique = df.nunique()
    u_ratio = ((n_unique / len(df)) * 100).round(2).astype(str) + '%'
    nulls = df.isnull().sum()
    n_ratio = ((nulls / len(df)) * 100).round(2).astype(str) + '%'
    mod_or_mean = []
    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            try:
                mean_val = df[col].mean()
                mod_or_mean.append(f"mean = {round(mean_val, 2)}")
            except:
                mod_or_mean.append("mean = NaN")
        else:
            try:
                mode_val = df[col].mode().iloc[0]
                mod_or_mean.append(f"mode = {mode_val}")
            except:
                mod_or_mean.append("mode = NaN")
    full_report_df = pd.DataFrame({'DTypes': dtypes,'N_Uniq': n_unique,'U Ratio': u_ratio,'Nulls': nulls,'N_Ratio': n_ratio,'Mode or Mean': mod_or_mean})
    print(f"{'*'*50}")
    print(f'Empty Rows: {df.isnull().all(axis=1).sum()}')
    print(f'Empty Columns: {df.isnull().all(axis=0).sum()}')
    print(f'Duplicate Rows: {df.duplicated().sum()}')
    print(f'Number of Rows: {df.shape[0]}')
    print(f'Number of Columns: {df.shape[1]}')
    print(f"{'*'*50}")
    return full_report_df

In [5]:
full_report(df)

**************************************************
Empty Rows: 0
Empty Columns: 0
Duplicate Rows: 2
Number of Rows: 1264714
Number of Columns: 8
**************************************************


Unnamed: 0,DTypes,N_Uniq,U Ratio,Nulls,N_Ratio,Mode or Mean
Dst Port,int64,64150,5.07%,0,0.0%,mean = 34461.13
Protocol,int64,3,0.0%,0,0.0%,mean = 6.17
PacketSize,float64,314422,24.86%,0,0.0%,mean = 1.0595860500036146e+20
PacketCount,int64,23690,1.87%,0,0.0%,mean = 281742.26
PayloadSize,float64,245340,19.4%,0,0.0%,mean = 173.59
FlowDirection,float64,23465,1.86%,0,0.0%,mean = 214350738642.73
TcpFlags,int64,6,0.0%,0,0.0%,mean = 9.51
IsMalicious,int64,2,0.0%,0,0.0%,mean = 0.04


### **3Ô∏è‚É£ Clean the Data**

- Drop Nulls ROWs

In [6]:
df = df.dropna()

In [7]:
cols_numeric = ["Dst Port", "Protocol", "PacketSize", "PacketCount", "PayloadSize", "FlowDirection", "TcpFlags"]

for col in cols_numeric:
    df[col] = pd.to_numeric(df[col], errors="coerce")

df["Dst Port"].fillna(df["Dst Port"].mode()[0], inplace=True)
df["Protocol"].fillna(df["Protocol"].mode()[0], inplace=True)
df["PacketCount"].fillna(df["PacketCount"].median(), inplace=True)
df["TcpFlags"].fillna(df["TcpFlags"].mode()[0], inplace=True)
df["FlowDirection"].fillna(df["FlowDirection"].median(), inplace=True)
df["PacketSize"].fillna(df["PacketSize"].median(), inplace=True)
df["PayloadSize"].fillna(df["PayloadSize"].median(), inplace=True)

df["Dst Port"] = df["Dst Port"].astype("int64")
df["Protocol"] = df["Protocol"].astype("int64")
df["PacketCount"] = df["PacketCount"].astype("int64")
df["TcpFlags"] = df["TcpFlags"].astype("int64")

In [8]:
full_report(df)

**************************************************
Empty Rows: 0
Empty Columns: 0
Duplicate Rows: 2
Number of Rows: 1264714
Number of Columns: 8
**************************************************


Unnamed: 0,DTypes,N_Uniq,U Ratio,Nulls,N_Ratio,Mode or Mean
Dst Port,int64,64150,5.07%,0,0.0%,mean = 34461.13
Protocol,int64,3,0.0%,0,0.0%,mean = 6.17
PacketSize,float64,314422,24.86%,0,0.0%,mean = 1.0595860500036146e+20
PacketCount,int64,23690,1.87%,0,0.0%,mean = 281742.26
PayloadSize,float64,245340,19.4%,0,0.0%,mean = 173.59
FlowDirection,float64,23465,1.86%,0,0.0%,mean = 214350738642.73
TcpFlags,int64,6,0.0%,0,0.0%,mean = 9.51
IsMalicious,int64,2,0.0%,0,0.0%,mean = 0.04


- Drop Duplicate Columns

In [9]:
#Drop Duplicate Columns
df.drop_duplicates(inplace=True)

In [10]:
df["IsMalicious"].value_counts()

Unnamed: 0_level_0,count
IsMalicious,Unnamed: 1_level_1
0,1219929
1,44783


### **8Ô∏è‚É£ Split Data**

In [11]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [12]:
features = ["Dst Port", "Protocol", "PacketSize", "PacketCount",
            "PayloadSize", "FlowDirection", "TcpFlags"]

X = df[features]
y = df["IsMalicious"]

### **9Ô∏è‚É£ Scalling**

In [13]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [14]:
X_normal = X_scaled[y == 0]

### **üîü Model Training**

In [15]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam

In [16]:
input_dim = X_normal.shape[1]

input_layer = Input(shape=(input_dim,))
encoded = Dense(16, activation="relu")(input_layer)
encoded = Dense(8, activation="relu")(encoded)
decoded = Dense(16, activation="relu")(encoded)
output_layer = Dense(input_dim, activation="linear")(decoded)

autoencoder = Model(input_layer, output_layer)
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss="mse")

In [17]:
autoencoder.fit(
    X_normal, X_normal,
    epochs=15,
    batch_size=256,
    shuffle=True,
    validation_split=0.1,
    verbose=1
)

Epoch 1/15
[1m4289/4289[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m15s[0m 3ms/step - loss: 0.1889 - val_loss: 1.4754
Epoch 2/15
[1m4289/4289[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m17s[0m 3ms/step - loss: 0.0796 - val_loss: 1.4832
Epoch 3/15
[1m4289/4289[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m11s[0m 3ms/step - loss: 0.0070 - val_loss: 1.4797
Epoch 4/15
[1m4289/4289[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m20s[0m 2ms/step - loss: 0.0324 - val_loss: 1.5263
Epoch 5/15
[1m4289/4289[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m11s[0m 2ms/step - loss: 0.0097 - val_loss: 1.6577
Epoch 6/15
[1m4289/4289[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m11s[0m 3ms/step - loss: 0.0087 - val_loss: 1.5566
Epoch 7/15
[1m4289/42

<keras.src.callbacks.history.History at 0x7c0432173e30>

In [18]:
X_pred = autoencoder.predict(X_scaled)
mse = np.mean(np.square(X_scaled - X_pred), axis=1)

[1m39523/39523[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m56s[0m 1ms/step


In [19]:
threshold = np.percentile(mse[y == 0], 95)

In [20]:
df["AE_Prediction"] = (mse > threshold).astype(int)

In [21]:
from sklearn.metrics import classification_report

print(classification_report(y, df["AE_Prediction"]))

              precision    recall  f1-score   support

           0       0.97      0.95      0.96   1219929
           1       0.12      0.19      0.15     44783

    accuracy                           0.92   1264712
   macro avg       0.55      0.57      0.55   1264712
weighted avg       0.94      0.92      0.93   1264712



In [22]:
autoencoder.save('autoencoder_model.h5')



In [23]:
import joblib
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']