## Imports

In [175]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

## Data Pre-processing
Data can be found [here](https://archive.ics.uci.edu/dataset/542/internet+firewall+data)

In [176]:
# Load Data
df = pd.read_csv('log2.csv')

# Map Ports to Services
port_to_service = {53: "DNS", 80: "HTTP", 443: "HTTPS", 3389: "RDP", 22: "SSH", 123: "NTP"}
df["TrafficType"] = df["Destination Port"].map(port_to_service).fillna("Unknown")

# Encode TrafficType
le = LabelEncoder()
df["TrafficType"] = le.fit_transform(df["TrafficType"])

# Drop Irrelevant Columns
df = df.drop(["Source Port", "Destination Port", "NAT Source Port", "NAT Destination Port", "Action"], axis=1)

## Model Fitting

In [177]:
# Features and Target
X = df.drop(["TrafficType"], axis=1)
y = df["TrafficType"]

# Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3)

# Apply SMOTE
smote = SMOTE(sampling_strategy="auto")  # Balance all classes
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Scale Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Train KNN
knn = KNeighborsClassifier(n_neighbors=3, weights="distance")
knn.fit(X_train_scaled, y_train_resampled)
y_pred = knn.predict(X_test_scaled)

## Prediction

In [178]:
# Decode Predictions
y_pred_labels = le.inverse_transform(y_pred)
y_test_labels = le.inverse_transform(y_test)

# Evaluate
print(classification_report(y_test_labels, y_pred_labels))
print("Accuracy Score:", accuracy_score(y_test_labels, y_pred_labels))

              precision    recall  f1-score   support

         DNS       0.98      0.98      0.98     10869
        HTTP       0.63      0.75      0.69      2787
       HTTPS       0.88      0.84      0.86      8179
         NTP       0.67      0.05      0.09       118
         RDP       0.44      0.72      0.54       134
         SSH       0.16      0.20      0.18        76
     Unknown       0.98      0.97      0.98     23710

    accuracy                           0.93     45873
   macro avg       0.68      0.64      0.62     45873
weighted avg       0.94      0.93      0.93     45873

Accuracy Score: 0.9342968630784994
