In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Load dataset
# The location is machine-specific; it is named here so it is easy to spot
# and change. low_memory=False asks pandas to parse the file in one pass
# rather than in chunks.
csv_path = r'C:\Users\arell\Documents\1_ALF\data\malicious_2021.csv'
df = pd.read_csv(csv_path, low_memory=False)

# Columns of interest in the CSV.
# NOTE: despite its name, this list also contains the categorical 'tld'
# column and the 'url_type' target label; `numeric_features` below is the
# subset that excludes those two.
features = [
    'Querylength', 'domain_token_count', 'path_token_count',
    'avgdomaintokenlen', 'longdomaintokenlen', 'avgpathtokenlen', 'tld',
    'charcompvowels', 'charcompace', 'ldl_url', 'ldl_domain', 'ldl_path',
    'ldl_filename', 'ldl_getArg', 'subDirLen', 'this.fileExtLen', 'ArgLen',
    'pathurlRatio', 'ArgUrlRatio', 'argDomanRatio', 'domainUrlRatio',
    'pathDomainRatio', 'argPathRatio', 'executable', 'isPortEighty',
    'NumberofDotsinURL', 'ISIpAddressInDomainName', 'CharacterContinuityRate',
    'LongestVariableValue', 'URL_DigitCount', 'host_DigitCount',
    'Directory_DigitCount', 'File_name_DigitCount', 'Extension_DigitCount',
    'Query_DigitCount', 'URL_Letter_Count', 'host_letter_count',
    'Directory_LetterCount', 'Filename_LetterCount', 'Extension_LetterCount',
    'Query_LetterCount', 'LongestPathTokenLength', 'Domain_LongestWordLength',
    'Path_LongestWordLength', 'sub-Directory_LongestWordLength',
    'Arguments_LongestWordLength', 'URL_sensitiveWord', 'URLQueries_variable',
    'spcharUrl', 'delimeter_Domain', 'delimeter_path', 'delimeter_Count',
    'NumberRate_URL', 'NumberRate_Domain', 'NumberRate_DirectoryName',
    'NumberRate_FileName', 'NumberRate_Extension', 'NumberRate_AfterPath',
    'SymbolCount_URL', 'SymbolCount_Domain', 'SymbolCount_Directoryname',
    'SymbolCount_FileName', 'SymbolCount_Extension', 'SymbolCount_Afterpath',
    'Entropy_URL', 'Entropy_Domain', 'Entropy_Filename', 'Entropy_Extension',
    'Entropy_Afterpath', 'url_type',
]

# 'tld' and 'url_type' are the two columns that are not used as numeric
# model inputs; every other listed column is.
_NON_NUMERIC_COLUMNS = {'tld', 'url_type'}
numeric_features = [name for name in features if name not in _NON_NUMERIC_COLUMNS]

# Clean dataset
# Rows without a label are dropped before the string cast: astype(str) would
# otherwise turn a missing url_type into the literal 'nan', and the binary
# rule further down would silently count such rows as malicious.
df_cleaned = df.dropna(subset=['url_type']).copy()
df_cleaned['tld'] = df_cleaned['tld'].astype(str)
df_cleaned['url_type'] = df_cleaned['url_type'].astype(str)

# Keep only rows whose numeric features are all finite (np.isfinite is False
# for NaN and +/-inf). The explicit copy makes the later column assignments
# operate on an independent frame.
finite_rows = np.isfinite(df_cleaned[numeric_features]).all(axis=1)
df_cleaned = df_cleaned[finite_rows].copy()

# Integer-encode the two categorical columns; the fitted encoders are kept
# so codes can be mapped back to their original strings.
label_encoder_tld = LabelEncoder()
label_encoder_url_type = LabelEncoder()
df_cleaned['tld_encoded'] = label_encoder_tld.fit_transform(df_cleaned['tld'])
df_cleaned['url_type_encoded'] = label_encoder_url_type.fit_transform(df_cleaned['url_type'])

# Model input matrix: all numeric features plus the encoded TLD.
X = df_cleaned[numeric_features + ['tld_encoded']]

# Binary target: 0 for 'benign', 1 for every other url_type. Vectorised
# comparison instead of a per-row Python lambda.
df_cleaned['binary_label'] = (df_cleaned['url_type'] != 'benign').astype('int64')

# Train-Test Split (Binary Classification)
# 70/30 split with a fixed seed, stratified on the binary label so both
# partitions keep the same benign/malicious ratio.
y_bin = df_cleaned['binary_label']
X_train_bin, X_test_bin, y_train_bin, y_test_bin = train_test_split(
    X,
    y_bin,
    test_size=0.3,
    random_state=42,
    stratify=y_bin,
)

# Train-Test Split (Multiclass Classification)
# Only the non-benign rows take part in the multiclass task.
malicious_df = df_cleaned[df_cleaned['binary_label'] == 1].copy()
X_multi = malicious_df[numeric_features + ['tld_encoded']]

# Encode the malicious url_type values with a dedicated encoder so the class
# ids are always contiguous 0..k-1. Subtracting 1 from the global encoding is
# only correct when 'benign' happens to sort before every other label.
label_encoder_multi = LabelEncoder()
y_multi = pd.Series(
    label_encoder_multi.fit_transform(malicious_df['url_type']),
    index=malicious_df.index,
    name='multi_label',
)

# 70/30 split with a fixed seed, stratified on the malicious class.
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X_multi, y_multi, test_size=0.3, random_state=42, stratify=y_multi
)

# Models to compare
# KNN and SVM are sensitive to feature scale, and the inputs here mix ratios,
# counts and entropies. Both are therefore wrapped in a pipeline that
# standardises the features; the scaler is fitted on the training data only,
# inside each fit() call.
# The tree-based models get the same fixed seed as the splits and XGBoost so
# repeated runs report the same numbers.
# NOTE(review): kernel SVC training cost grows faster than linearly with the
# number of rows; on several hundred thousand samples it may still be very
# slow even after scaling -- consider LinearSVC or subsampling if it stalls.
models = {
    'KNN': make_pipeline(StandardScaler(), KNeighborsClassifier()),
    'SVM': make_pipeline(StandardScaler(), SVC()),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
}

# Per-task accuracy stores, keyed by model name and filled in by the
# training loops; the plots read them afterwards.
train_acc_bin = {}
test_acc_bin = {}
train_acc_multi = {}
test_acc_multi = {}

# Function to train and evaluate models
def evaluate_model(model, X_train, y_train, X_test, y_test, classification_type):
    """Fit ``model`` and report its performance on both data partitions.

    The model is fitted on the training partition, then used to predict the
    training and the test partition. A classification report is printed for
    each, and the two accuracy scores are returned.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.
        classification_type: ``'binary'`` selects the "Binary" report
            heading; any other value selects "Multiclass".

    Returns:
        Tuple ``(train_accuracy, test_accuracy)``.
    """
    model.fit(X_train, y_train)

    # Predict both partitions up front, before anything is scored or printed.
    results = {
        'Training': (y_train, model.predict(X_train)),
        'Test': (y_test, model.predict(X_test)),
    }

    train_acc, test_acc = (
        accuracy_score(truth, predicted) for truth, predicted in results.values()
    )

    # The heading is the only thing that differs between the two task types.
    heading = 'Binary' if classification_type == 'binary' else 'Multiclass'
    for split_name, (truth, predicted) in results.items():
        print(f"{heading} Classification Report - {split_name} Data:\n{classification_report(truth, predicted)}")

    return train_acc, test_acc

# Train and evaluate every model, first on the binary task and then on the
# multiclass task. Each row describes one task: heading text, the
# classification_type flag, the (X_train, y_train, X_test, y_test) data, and
# the two dictionaries that receive the train/test accuracies.
evaluation_tasks = (
    ('Binary', 'binary',
     (X_train_bin, y_train_bin, X_test_bin, y_test_bin),
     train_acc_bin, test_acc_bin),
    ('Multiclass', 'multiclass',
     (X_train_multi, y_train_multi, X_test_multi, y_test_multi),
     train_acc_multi, test_acc_multi),
)

for task_title, task_flag, task_data, train_store, test_store in evaluation_tasks:
    for model_name, model in models.items():
        print(f"\n--- {model_name} {task_title} Classification ---")
        # The same estimator object is refitted for each task.
        train_store[model_name], test_store[model_name] = evaluate_model(
            model, *task_data, classification_type=task_flag
        )

# Plot train vs. test accuracy for both tasks side by side.
def _plot_accuracy_bars(ax, train_acc, test_acc, title):
    """Draw grouped train/test accuracy bars, one group per model.

    Args:
        ax: Matplotlib axes to draw on.
        train_acc: Mapping of model name to training accuracy.
        test_acc: Mapping of model name to test accuracy, with the same keys
            as ``train_acc``.
        title: Title for the axes.
    """
    model_names = list(train_acc)
    positions = np.arange(len(model_names))
    bar_width = 0.4

    # Offset the two series around each tick so they sit next to each other.
    # Drawing both at the same x position makes one series cover the other.
    ax.bar(positions - bar_width / 2, [train_acc[name] for name in model_names],
           bar_width, color='blue', alpha=0.7, label='Train Accuracy')
    ax.bar(positions + bar_width / 2, [test_acc[name] for name in model_names],
           bar_width, color='green', alpha=0.7, label='Test Accuracy')

    ax.set_xticks(positions)
    ax.set_xticklabels(model_names)
    ax.set_title(title)
    ax.set_ylabel('Accuracy')
    # Keep the legend low, away from the bar tops near 1.0.
    ax.legend(loc='lower right')


fig, (ax_bin, ax_multi) = plt.subplots(1, 2, figsize=(12, 6))

# Plot Binary Classification Results
_plot_accuracy_bars(ax_bin, train_acc_bin, test_acc_bin, 'Binary Classification Accuracy')

# Plot Multiclass Classification Results
_plot_accuracy_bars(ax_multi, train_acc_multi, test_acc_multi, 'Multiclass Classification Accuracy')

fig.tight_layout()
plt.show()



--- KNN Binary Classification ---
Binary Classification Report - Training Data:
              precision    recall  f1-score   support

           0       0.97      0.98      0.97    299672
           1       0.96      0.94      0.95    156161

    accuracy                           0.97    455833
   macro avg       0.97      0.96      0.96    455833
weighted avg       0.97      0.97      0.97    455833

Binary Classification Report - Test Data:
              precision    recall  f1-score   support

           0       0.96      0.97      0.97    128431
           1       0.94      0.92      0.93     66927

    accuracy                           0.95    195358
   macro avg       0.95      0.95      0.95    195358
weighted avg       0.95      0.95      0.95    195358


--- SVM Binary Classification ---
