<h3>SVM, Threshold</h3>

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from scipy.stats import uniform

# Load the dataset
df = pd.read_csv(r'C:\Users\arell\Documents\1_ALF\data\malicious_2021.csv', low_memory=False)

# Select features and target columns
features = ['Querylength', 'path_token_count', 'avgdomaintokenlen',
       'longdomaintokenlen', 'avgpathtokenlen', 'charcompvowels',
       'ldl_filename', 'dld_url', 'dld_getArg', 'domainlength', 'fileNameLen',
       'this.fileExtLen', 'NumberofDotsinURL', 'URL_DigitCount',
       'host_DigitCount', 'Directory_DigitCount', 'File_name_DigitCount',
       'Extension_DigitCount', 'host_letter_count', 'Directory_LetterCount',
       'Filename_LetterCount', 'Domain_LongestWordLength',
       'Path_LongestWordLength', 'sub-Directory_LongestWordLength',
       'Arguments_LongestWordLength', 'URLQueries_variable', 'spcharUrl',
       'delimeter_path', 'SymbolCount_URL', 'SymbolCount_Directoryname',
       'SymbolCount_FileName', 'SymbolCount_Afterpath', 'Entropy_Afterpath',
       'url_type']

# Clean the dataset by removing NaNs and infinities in numeric columns only
df_cleaned = df.copy()

# Convert 'tld' and 'url_type' to string since they're categorical
df_cleaned['url_type'] = df_cleaned['url_type'].astype(str)

# Select only numeric features for checking infinite values
numeric_features = [f for f in features if f not in ['url_type']]

# Apply np.isfinite only to numeric features and filter rows
df_cleaned = df_cleaned[np.isfinite(df_cleaned[numeric_features]).all(axis=1)]

# Label encoding for TLD and url_type (since they're categorical)
label_encoder_url_type = LabelEncoder()

df_cleaned['url_type_encoded'] = label_encoder_url_type.fit_transform(df_cleaned['url_type'])

# Combine all features (numeric + 'url_type_encoded') for the model
X = df_cleaned[['url_type_encoded']]  # Ensure X is a DataFrame (2D array)

# Standardize features (important for SVM)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # Now X_scaled is a 2D array

# Binary classification: Benign vs Malicious
df_cleaned['binary_label'] = df_cleaned['url_type'].apply(lambda x: 0 if x == 'benign' else 1)

# Train-Test Split for Binary Classification
X_train_bin, X_test_bin, y_train_bin, y_test_bin = train_test_split(
    X_scaled, df_cleaned['binary_label'], test_size=0.3, random_state=42, stratify=df_cleaned['binary_label']
)


# Initialize the SVM classifier and RandomizedSearchCV for binary classification
svm_binary_classifier = SVC(random_state=42, probability=True)

# Fit the model with RandomizedSearchCV on the training data
svm_binary_classifier.fit(X_train_bin, y_train_bin)

# Make predictions using decision function to set a threshold
threshold = 0.85
y_train_score_bin = svm_binary_classifier.decision_function(X_train_bin)
y_train_pred_bin = (y_train_score_bin > threshold).astype(int)

y_test_score_bin = svm_binary_classifier.decision_function(X_test_bin)
y_test_pred_bin = (y_test_score_bin > threshold).astype(int)

# Evaluate the binary model
print("Binary Classification Report (Benign vs Malicious) - Test Data:")
print(classification_report(y_test_bin, y_test_pred_bin))
print("Binary Classification Accuracy (Test):", accuracy_score(y_test_bin, y_test_pred_bin))

# Multiclass Classification for types of malicious URLs
# Only consider the non-benign entries
malicious_df = df_cleaned[df_cleaned['binary_label'] == 1].copy()

# Encode URL types into categories: spam, phishing, malware, defacement
label_encoder = LabelEncoder()
malicious_df['url_type_encoded'] = label_encoder.fit_transform(malicious_df['url_type'])

# Select features and target for multiclass classification
X_multi = malicious_df[numeric_features]
X_multi_scaled = scaler.transform(X_multi)  # Scale features for multiclass classification
y_multi = malicious_df['url_type_encoded']

# Train-Test Split for Multiclass Classification
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X_multi_scaled, y_multi, test_size=0.3, random_state=42, stratify=y_multi
)

# Initialize the SVM classifier and RandomizedSearchCV for multiclass classification
svm_multiclass_classifier = SVC(random_state=42, probability=True)

# Make predictions on the test data for multiclass classification
y_test_pred_multi = svm_multiclass_classifier.predict(X_test_multi)

# Evaluate the multiclass model
print("\nMulticlass Classification Report (Malicious Type) - Test Data:")
print(classification_report(y_test_multi, y_test_pred_multi))
print("Multiclass Classification Accuracy (Test):", accuracy_score(y_test_multi, y_test_pred_multi))

# Summary of overall results
print("\nOverall Results Summary:")
print(f"Binary Classification - Test Accuracy: {accuracy_score(y_test_bin, y_test_pred_bin):.4f}")
print(f"Multiclass Classification - Test Accuracy: {accuracy_score(y_test_multi, y_test_pred_multi):.4f}")



Binary Classification Report (Benign vs Malicious) - Test Data:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    128431
           1       1.00      1.00      1.00     66927

    accuracy                           1.00    195358
   macro avg       1.00      1.00      1.00    195358
weighted avg       1.00      1.00      1.00    195358

Binary Classification Accuracy (Test): 1.0


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Arguments_LongestWordLength
- Directory_DigitCount
- Directory_LetterCount
- Domain_LongestWordLength
- Entropy_Afterpath
- ...
Feature names seen at fit time, yet now missing:
- url_type_encoded
