In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
df = pd.read_csv(r'C:\Users\arell\Documents\1_ALF\data\malicious_2021.csv', low_memory=False)

# Select features and target columns
features = ['pathDomainRatio', 'domainUrlRatio', 'tld', 
            'Entropy_Domain', 'domain_token_count', 'pathurlRatio', 'delimeter_Domain', 
            'SymbolCount_Domain', 'ldl_domain', 'avgdomaintokenlen', 'longdomaintokenlen', 
            'Domain_LongestWordLength', 'host_letter_count',
              'subDirLen', 'sub-Directory_LongestWordLength','avgpathtokenlen',
            'spcharUrl','SymbolCount_URL','Directory_LetterCount','ldl_path']

# Clean the dataset by removing NaNs and infinities in numeric columns only
df_cleaned = df.copy()
df_cleaned['tld'] = df_cleaned['tld'].astype(str)
df_cleaned['url_type'] = df_cleaned['url_type'].astype(str)

numeric_features = [f for f in features if f not in ['tld', 'url_type']]
df_cleaned = df_cleaned[np.isfinite(df_cleaned[numeric_features]).all(axis=1)]

label_encoder_tld = LabelEncoder()
label_encoder_url_type = LabelEncoder()
df_cleaned['tld_encoded'] = label_encoder_tld.fit_transform(df_cleaned['tld'])
df_cleaned['url_type_encoded'] = label_encoder_url_type.fit_transform(df_cleaned['url_type'])

X = df_cleaned[numeric_features + ['tld_encoded']]
y = df_cleaned['binary_label'] = df_cleaned['url_type'].apply(lambda x: 0 if x == 'benign' else 1)

# Scaling the features for KNN
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=df_cleaned['binary_label']
)

# Initialize and fit the KNN classifier
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train, y_train)

# Predictions for binary classification
y_train_pred = knn_classifier.predict(X_train)
y_test_pred = knn_classifier.predict(X_test)

# Classification reports
print("Binary Classification Report - Training Data:")
print(classification_report(y_train, y_train_pred))
print("Training Accuracy:", accuracy_score(y_train, y_train_pred))

print("\nBinary Classification Report - Test Data:")
print(classification_report(y_test, y_test_pred))
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))

# Multiclass Classification
malicious_df = df_cleaned[df_cleaned['binary_label'] == 1].copy()
X_multi = malicious_df[numeric_features + ['tld_encoded']]
y_multi = malicious_df['url_type_encoded'] - malicious_df['url_type_encoded'].min()
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    scaler.fit_transform(X_multi), y_multi, test_size=0.3, random_state=42, stratify=y_multi
)
knn_multi_classifier = KNeighborsClassifier(n_neighbors=5)
knn_multi_classifier.fit(X_train_multi, y_train_multi)

# Predictions and Evaluations for Multiclass
y_train_pred_multi = knn_multi_classifier.predict(X_train_multi)
y_test_pred_multi = knn_multi_classifier.predict(X_test_multi)
print("Multiclass Classification Report (Training):")
print(classification_report(y_train_multi, y_train_pred_multi))
print("Multiclass Classification Report (Test):")
print(classification_report(y_test_multi, y_test_pred_multi))

# Accuracy Summary
train_accuracy_bin = accuracy_score(y_train, y_train_pred)
test_accuracy_bin = accuracy_score(y_test, y_test_pred)
train_accuracy_multi = accuracy_score(y_train_multi, y_train_pred_multi)
test_accuracy_multi = accuracy_score(y_test_multi, y_test_pred_multi)

print(f"Binary Classification - Train Accuracy: {train_accuracy_bin:.4f}")
print(f"Binary Classification - Test Accuracy: {test_accuracy_bin:.4f}")
print(f"Multiclass Classification - Train Accuracy: {train_accuracy_multi:.4f}")
print(f"Multiclass Classification - Test Accuracy: {test_accuracy_multi:.4f}")


Binary Classification Report - Training Data:
              precision    recall  f1-score   support

           0       0.95      0.98      0.97    299672
           1       0.95      0.91      0.93    156161

    accuracy                           0.95    455833
   macro avg       0.95      0.94      0.95    455833
weighted avg       0.95      0.95      0.95    455833

Training Accuracy: 0.9539765659792073

Binary Classification Report - Test Data:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96    128431
           1       0.94      0.89      0.91     66927

    accuracy                           0.94    195358
   macro avg       0.94      0.93      0.94    195358
weighted avg       0.94      0.94      0.94    195358

Test Accuracy: 0.9425567419813881
Multiclass Classification Report (Training):
              precision    recall  f1-score   support

           0       0.95      0.99      0.97     67520
           1       0.98      0.