In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sbn

In [4]:
# Location of the session-level dataset, relative to the notebook's working directory.
session_path = "dataset/session_based_dataset.csv"

# low_memory=False makes pandas parse the file in one pass instead of chunk by chunk.
df_session = pd.read_csv(
    filepath_or_buffer=session_path,
    low_memory=False,
)

In [None]:
# Number of distinct `unique_link_mark` values.
# Counting on the single column avoids computing nunique for every column of the frame.
df_session["unique_link_mark"].nunique()

In [None]:
# Column inventory, followed by the (rows, columns) dimensions of the loaded frame.
print(df_session.columns.tolist())
df_session.shape

In [None]:
# Lift the column cap so wide frames are rendered without truncation.
pd.options.display.max_columns = None

In [None]:
# Class balance: number of rows per value of the `label` column.
sbn.countplot(x='label', data=df_session)

In [None]:
# Optional EDA: one histogram (50 bins, KDE overlay) per numeric column of df_session.
# Off by default because it renders a separate figure for every numeric column;
# set PLOT_NUMERIC_DISTRIBUTIONS = True to enable it.
PLOT_NUMERIC_DISTRIBUTIONS = False

if PLOT_NUMERIC_DISTRIBUTIONS:
    for col in df_session.select_dtypes(include='number').columns:
        plt.figure(figsize=(6, 3))
        sbn.histplot(df_session[col], bins=50, kde=True)
        plt.title(f'Distribution of {col}')
        plt.xlabel(col)
        plt.ylabel('Frequency')
        plt.tight_layout()
        plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Split configuration: 70% train / 15% validation / 15% test, stratified on `label`.
SPLIT_SEED = 42
TEST_FRACTION = 0.15
VAL_FRACTION = 0.15
# The validation split is carved out of what remains after the test split, so its
# share must be expressed relative to that remainder (0.15 / 0.85 ≈ 0.1765).
# Computing it, rather than hand-rounding to 0.176, keeps the target at 15% of the full data.
val_fraction_of_remainder = VAL_FRACTION / (1.0 - TEST_FRACTION)

# First, split off the test set.
df_train_val, df_test = train_test_split(
    df_session,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
    stratify=df_session["label"],
)

# Then divide the remaining rows into train and validation.
df_train, df_val = train_test_split(
    df_train_val,
    test_size=val_fraction_of_remainder,
    random_state=SPLIT_SEED,
    stratify=df_train_val["label"],
)

# Sanity check: the three splits together account for every row.
assert len(df_train) + len(df_val) + len(df_test) == len(df_session)

print(f"Train shape: {df_train.shape}")
print(f"Validation shape: {df_val.shape}")
print(f"Test shape: {df_test.shape}")

# Columns excluded from the feature matrix: the target and the `unique_link_mark` column.
drop_cols = ["label", "unique_link_mark"]


def _features_and_labels(frame, excluded):
    """Return ``(features, labels)`` NumPy arrays for one split.

    ``features`` holds every column of ``frame`` except those named in
    ``excluded`` (names missing from the frame are ignored); ``labels`` holds
    the ``label`` column.
    """
    features = frame.drop(columns=excluded, errors='ignore').values
    labels = frame["label"].values
    return features, labels


x_train, y_train = _features_and_labels(df_train, drop_cols)
x_val, y_val = _features_and_labels(df_val, drop_cols)
x_test, y_test = _features_and_labels(df_test, drop_cols)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max on the training split only, then apply that same
# mapping to validation and test so no statistics from those splits are used.
# NOTE(review): x_train / x_val / x_test are overwritten, so re-running this cell
# without re-running the split cell would scale already-scaled arrays.
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_val, x_test = scaler.transform(x_val), scaler.transform(x_test)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Shapes of the scaled train and test feature matrices, one per line.
print(x_train.shape, x_test.shape, sep="\n")

In [None]:
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout masks
# and batch shuffling are repeatable after Restart & Run All.
# (Some GPU kernels may still be non-deterministic.)
tf.keras.utils.set_random_seed(42)

# Fully connected binary classifier: 128 -> 64 -> 32 ReLU units with dropout after
# each hidden layer, ending in a single sigmoid unit.
# The input width is inferred from the data on the first call to fit/predict.
model = Sequential([
    Dense(units=128, activation="relu"),
    Dropout(0.20),
    Dense(units=64, activation="relu"),
    Dropout(0.20),
    Dense(units=32, activation="relu"),
    Dropout(0.10),
    Dense(units=1, activation="sigmoid"),
])

# Binary cross-entropy pairs with the sigmoid output.
model.compile(loss="binary_crossentropy", optimizer="adam")


In [None]:
# Stop once validation loss has failed to improve for 5 consecutive epochs and
# roll the model back to the weights of its best epoch.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Train for at most 100 epochs; the callback above normally ends training earlier.
model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_val, y_val),
    epochs=100,
    callbacks=[early_stop],
    verbose=1,
)

In [None]:
# Per-epoch values recorded during the most recent fit, one column per tracked quantity.
model_loss = pd.DataFrame(data=model.history.history)
model_loss.plot(kind="line")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Predicted probabilities for the validation split, thresholded at 0.5 into 0/1 classes.
# NOTE(review): this is the same validation data that drives early stopping, so these
# figures may be optimistic; the held-out test split is not scored in this cell.
y_pred_prob = model.predict(x_val)
y_pred = (y_pred_prob > 0.5).astype("int32")

# Per-class precision / recall / F1.
print(classification_report(y_val, y_pred))

# Confusion matrix rendered as a heatmap.
cm = confusion_matrix(y_val, y_pred)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=["Legitimate", "Malicious"],
)
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.show()

In [None]:
from tensorflow.keras.models import load_model

# Write the trained network to disk in the native `.keras` format.
model.save(filepath="model.keras")

In [None]:
import numpy as np

# First row of the scaled test matrix: a 1-D feature vector.
# (The original note gives its shape as (18,); that depends on the dataset's
# feature count — TODO confirm.)
sample = x_test[0]

# Prepend a batch axis so the vector becomes a one-row batch.
sample_batch = sample[np.newaxis, ...]

In [None]:
# Round-trip check: reload the saved network from disk and score the one-row batch.
loaded_model = load_model("model.keras")

pred = loaded_model.predict(x=sample_batch)
print(pred)  # one row in, so a single prediction comes out