In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import joblib
import warnings
warnings.filterwarnings("ignore")

# === 1. Load the dataset ===
df = pd.read_csv("logs.csv")

# === 2. Drop rows with missing values (optional) ===
df.dropna(inplace=True)

# === 3. Encode categorical features ===
cat_features = ['source_ip', 'destination_ip', 'protocol', 'user_agent', 'location', 'port']
encoders = {}

for col in cat_features:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

# === 4. Convert numeric fields ===
numeric_cols = ['bytes_sent', 'bytes_received']
df[numeric_cols] = df[numeric_cols].astype(float)

# === 5. Encode target ===
target_encoder = LabelEncoder()
df['threat'] = target_encoder.fit_transform(df['threat'])

# === 6. Feature / Label split ===
X = df.drop(columns=['timestamp', 'threat'])
y = df['threat']

# === 7. Train/test split ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# === 8. Feature scaling ===
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# === 9. Define models ===
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Naive Bayes": GaussianNB()
}

# === 10. Train, evaluate, select best ===
best_model = None
best_f1 = 0

print("\nðŸ“Š Model Evaluation Results:\n")
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted')
    rec = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    print(f"--- {name} ---")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print()

    if f1 > best_f1:
        best_f1 = f1
        best_model = model
        best_model_name = name

# === 11. Save the best model ===
joblib.dump(best_model, "best_model.joblib")
joblib.dump(scaler, "scaler.joblib")
joblib.dump(encoders, "encoders.joblib")
joblib.dump(target_encoder, "target_encoder.joblib")

print(f"\nâœ… Best model: {best_model_name} with F1 score: {best_f1:.4f}")
print("ðŸ”’ Model and preprocessing objects saved for future use.")


ðŸ“Š Model Evaluation Results:

--- Logistic Regression ---
Accuracy: 0.8440
Precision: 0.7123
Recall: 0.8440
F1 Score: 0.7726

--- Random Forest ---
Accuracy: 0.8430
Precision: 0.7122
Recall: 0.8430
F1 Score: 0.7721

--- Decision Tree ---
Accuracy: 0.7120
Precision: 0.7317
Recall: 0.7120
F1 Score: 0.7216

--- KNN ---
Accuracy: 0.8370
Precision: 0.7143
Recall: 0.8370
F1 Score: 0.7708

--- SVM ---
Accuracy: 0.8440
Precision: 0.7123
Recall: 0.8440
F1 Score: 0.7726

--- Gradient Boosting ---
Accuracy: 0.8300
Precision: 0.7231
Recall: 0.8300
F1 Score: 0.7674

--- Naive Bayes ---
Accuracy: 0.8440
Precision: 0.7123
Recall: 0.8440
F1 Score: 0.7726


âœ… Best model: Logistic Regression with F1 score: 0.7726
ðŸ”’ Model and preprocessing objects saved for future use.


In [8]:
import pandas as pd
import joblib

# Load the pretrained components
model = joblib.load("best_model.joblib")
scaler = joblib.load("scaler.joblib")
encoders = joblib.load("encoders.joblib")
target_encoder = joblib.load("target_encoder.joblib")

# Load your CSV
df = pd.read_csv("logs_2.csv")

# Get the original feature order from the scaler
feature_order = scaler.feature_names_in_  # This contains the original training feature order

# Drop rows with missing values in the feature columns
df.dropna(subset=feature_order, inplace=True)

# Encode categorical features using saved LabelEncoders
for col in ["source_ip", "destination_ip", "protocol", "user_agent", "location", "port"]:
    le = encoders[col]
    mapping = {cls: idx for idx, cls in enumerate(le.classes_)}
    df[col] = df[col].map(mapping).fillna(-1).astype(int)  # unseen values mapped to -1

# Convert numeric columns to float
df["bytes_sent"] = df["bytes_sent"].astype(float)
df["bytes_received"] = df["bytes_received"].astype(float)

# Prepare feature matrix in correct order (using scaler's original order)
X = df[feature_order]

# Scale features
X_scaled = scaler.transform(X)

# Predict
preds = model.predict(X_scaled)
df["predicted_threat"] = target_encoder.inverse_transform(preds)

# Show results
print(df[list(feature_order) + ["predicted_threat"]].head(20))

    source_ip  destination_ip  protocol  port  user_agent  location  \
0          -1              -1         2     6           1         9   
1          -1              -1         0     6           4         4   
2          -1              -1         2     6           0         7   
3          -1              -1         1     0           2         6   
4          -1              -1         2     4           2         9   
5          -1              -1         2     1           3         6   
6          -1              -1         1     4           1         1   
7          -1              -1         0     1           1         2   
8          -1              -1         1     5           4         6   
9          -1              -1         2     1           1         3   
10         -1              -1         0     2           2         9   
11         -1              -1         1     2           1         4   
12         -1              -1         2     2           1         0   
13    