<h3>SVM</h3>

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset.
# The CSV location lives in one named constant so it can be changed in a single
# place; the value below is machine-specific and must be adapted when the
# notebook is run on another machine.
DATA_PATH = r'C:\Users\arell\Documents\1_ALF\data\malicious_2021.csv'
# low_memory=False parses the file in one pass, avoiding mixed-dtype columns
# that chunked type inference can produce.
df = pd.read_csv(DATA_PATH, low_memory=False)

# Columns used as model inputs. 'tld' is the only entry handled as categorical
# further down; every other entry is treated as numeric.
# NOTE(review): 'URL_Type_obf_Type' looks like a label-style column - confirm it
# is a genuine numeric feature and not a proxy for the `url_type` target.
features = ['Querylength', 'domain_token_count', 'path_token_count',
            'avgdomaintokenlen', 'longdomaintokenlen', 'avgpathtokenlen', 'tld',
            'charcompvowels', 'charcompace', 'ldl_url', 'ldl_domain', 'ldl_path',
            'ldl_filename', 'ldl_getArg', 'dld_url', 'dld_domain', 'dld_path',
            'dld_filename', 'dld_getArg', 'urlLen', 'domainlength', 'pathLength',
            'subDirLen', 'fileNameLen', 'this.fileExtLen', 'ArgLen', 'pathurlRatio',
            'ArgUrlRatio', 'argDomanRatio', 'domainUrlRatio', 'pathDomainRatio',
            'argPathRatio', 'executable', 'isPortEighty', 'NumberofDotsinURL',
            'ISIpAddressInDomainName', 'CharacterContinuityRate',
            'LongestVariableValue', 'URL_DigitCount', 'host_DigitCount',
            'Directory_DigitCount', 'File_name_DigitCount', 'Extension_DigitCount',
            'Query_DigitCount', 'URL_Letter_Count', 'host_letter_count',
            'Directory_LetterCount', 'Filename_LetterCount',
            'Extension_LetterCount', 'Query_LetterCount', 'LongestPathTokenLength',
            'Domain_LongestWordLength', 'Path_LongestWordLength',
            'sub-Directory_LongestWordLength', 'Arguments_LongestWordLength',
            'URL_sensitiveWord', 'URLQueries_variable', 'spcharUrl',
            'delimeter_Domain', 'delimeter_path', 'delimeter_Count',
            'NumberRate_URL', 'NumberRate_Domain', 'NumberRate_DirectoryName',
            'NumberRate_FileName', 'NumberRate_Extension', 'NumberRate_AfterPath',
            'SymbolCount_URL', 'SymbolCount_Domain', 'SymbolCount_Directoryname',
            'SymbolCount_FileName', 'SymbolCount_Extension',
            'SymbolCount_Afterpath', 'Entropy_URL', 'Entropy_Domain',
            'Entropy_DirectoryName', 'Entropy_Filename', 'Entropy_Extension',
            'Entropy_Afterpath', 'URL_Type_obf_Type']

# Clean the dataset: keep only rows whose numeric feature values are all finite
# (np.isfinite is False for NaN and for +/-inf).
df_cleaned = df.copy()
# astype(str) also turns a missing TLD into the literal string 'nan', which is
# then label-encoded as a category of its own.
df_cleaned['tld'] = df_cleaned['tld'].astype(str)
numeric_features = [f for f in features if f != 'tld']
finite_rows = np.isfinite(df_cleaned[numeric_features]).all(axis=1)
# .copy() yields an independent frame, so the column assignments below do not
# raise pandas' SettingWithCopyWarning.
df_cleaned = df_cleaned.loc[finite_rows].copy()

# Label encoding for TLD (since it's categorical).
# The encoder sees every row so that no TLD in the test split is "unseen";
# the resulting integer codes carry no meaningful order.
df_cleaned['tld_encoded'] = LabelEncoder().fit_transform(df_cleaned['tld'])

# Binary classification target: 0 = benign, 1 = anything else (malicious)
df_cleaned['binary_label'] = df_cleaned['url_type'].ne('benign').astype(int)

# Combine all features (without the original 'tld' column)
X = df_cleaned[numeric_features + ['tld_encoded']]

# Train-Test Split for Binary Classification (Benign vs Malicious).
# The split is done on the unscaled features so the scaler can be fitted on the
# training rows only.
X_train_bin_raw, X_test_bin_raw, y_train_bin, y_test_bin = train_test_split(
    X, df_cleaned['binary_label'], test_size=0.3, random_state=42,
    stratify=df_cleaned['binary_label']
)

# Standardize features (important for SVM). Fitting on the training split only
# keeps test rows from influencing the mean/variance used for scaling.
scaler = StandardScaler()
X_train_bin = scaler.fit_transform(X_train_bin_raw)
X_test_bin = scaler.transform(X_test_bin_raw)
# X_scaled keeps its original name for any later code that reads it; it is now
# scaled with the training-split statistics.
X_scaled = scaler.transform(X)

# Initialize the SVM classifier for binary classification
svm_binary_classifier = SVC(random_state=42)

# Fit the model on the training data for binary classification
svm_binary_classifier.fit(X_train_bin, y_train_bin)

# Make predictions on the training and test data for binary classification
y_train_pred_bin = svm_binary_classifier.predict(X_train_bin)
y_test_pred_bin = svm_binary_classifier.predict(X_test_bin)

# Evaluate the model on the training data for binary classification
print("Binary Classification Report (Benign vs Malicious) - Training Data:")
print(classification_report(y_train_bin, y_train_pred_bin))
print("Binary Classification Accuracy (Training):", accuracy_score(y_train_bin, y_train_pred_bin))

# Evaluate the model on the test data for binary classification
print("\nBinary Classification Report (Benign vs Malicious) - Test Data:")
print(classification_report(y_test_bin, y_test_pred_bin))
print("Binary Classification Accuracy (Test):", accuracy_score(y_test_bin, y_test_pred_bin))

# Multiclass Classification for types of malicious URLs
# Only consider the non-benign entries
malicious_df = df_cleaned[df_cleaned['binary_label'] == 1].copy()

# Encode the url_type strings of the malicious rows as integer class ids
label_encoder = LabelEncoder()
malicious_df['url_type_encoded'] = label_encoder.fit_transform(malicious_df['url_type'])
# Human-readable names for the classification reports, aligned with the ids.
class_names = [str(c) for c in label_encoder.classes_]
class_ids = list(range(len(class_names)))

# Select features and target for multiclass classification
X_multi = malicious_df[numeric_features + ['tld_encoded']]
y_multi = malicious_df['url_type_encoded']

# Train-Test Split for Multiclass Classification (on unscaled features, so the
# scaler below can be fitted on the training rows only)
X_train_multi_raw, X_test_multi_raw, y_train_multi, y_test_multi = train_test_split(
    X_multi, y_multi, test_size=0.3, random_state=42, stratify=y_multi
)

# A dedicated scaler is fitted on the malicious-only training split, so neither
# benign rows nor held-out rows shape the scaling statistics.
multi_scaler = StandardScaler()
X_train_multi = multi_scaler.fit_transform(X_train_multi_raw)
X_test_multi = multi_scaler.transform(X_test_multi_raw)
# Kept under its original name for any later code that reads it.
X_multi_scaled = multi_scaler.transform(X_multi)

# Initialize the SVM classifier for multiclass classification
svm_multiclass_classifier = SVC(random_state=42)

# Fit the model on the training data for multiclass classification
svm_multiclass_classifier.fit(X_train_multi, y_train_multi)

# Make predictions on the training and test data for multiclass classification
y_train_pred_multi = svm_multiclass_classifier.predict(X_train_multi)
y_test_pred_multi = svm_multiclass_classifier.predict(X_test_multi)

# Evaluate the model on the training data for multiclass classification
print("\nMulticlass Classification Report (Malicious Type) - Training Data:")
print(classification_report(y_train_multi, y_train_pred_multi,
                            labels=class_ids, target_names=class_names))
print("Multiclass Classification Accuracy (Training):", accuracy_score(y_train_multi, y_train_pred_multi))

# Evaluate the model on the test data for multiclass classification
print("\nMulticlass Classification Report (Malicious Type) - Test Data:")
print(classification_report(y_test_multi, y_test_pred_multi,
                            labels=class_ids, target_names=class_names))
print("Multiclass Classification Accuracy (Test):", accuracy_score(y_test_multi, y_test_pred_multi))

# Summary of overall results (binary figures come from the binary section above)
print("\nOverall Results Summary:")
print(f"Binary Classification - Training Accuracy: {accuracy_score(y_train_bin, y_train_pred_bin):.4f}")
print(f"Binary Classification - Test Accuracy: {accuracy_score(y_test_bin, y_test_pred_bin):.4f}")
print(f"Multiclass Classification - Training Accuracy: {accuracy_score(y_train_multi, y_train_pred_multi):.4f}")
print(f"Multiclass Classification - Test Accuracy: {accuracy_score(y_test_multi, y_test_pred_multi):.4f}")

# Show which integer id each url_type was encoded to
print("\nEncoded URL Types:")
print(malicious_df[['url_type', 'url_type_encoded']].drop_duplicates().reset_index(drop=True))

<h2>SVM + SMOTE</h2>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

# Load the dataset.
# The CSV location lives in one named constant so it can be changed in a single
# place; the value below is machine-specific and must be adapted when the
# notebook is run on another machine.
DATA_PATH = r'C:\Users\arell\Documents\1_ALF\data\malicious_2021.csv'
# low_memory=False parses the file in one pass, avoiding mixed-dtype columns
# that chunked type inference can produce.
df = pd.read_csv(DATA_PATH, low_memory=False)

# Columns used as model inputs. 'tld' is the only entry handled as categorical
# further down; every other entry is treated as numeric.
# NOTE(review): 'URL_Type_obf_Type' looks like a label-style column - confirm it
# is a genuine numeric feature and not a proxy for the `url_type` target.
features = ['Querylength', 'domain_token_count', 'path_token_count',
            'avgdomaintokenlen', 'longdomaintokenlen', 'avgpathtokenlen', 'tld',
            'charcompvowels', 'charcompace', 'ldl_url', 'ldl_domain', 'ldl_path',
            'ldl_filename', 'ldl_getArg', 'dld_url', 'dld_domain', 'dld_path',
            'dld_filename', 'dld_getArg', 'urlLen', 'domainlength', 'pathLength',
            'subDirLen', 'fileNameLen', 'this.fileExtLen', 'ArgLen', 'pathurlRatio',
            'ArgUrlRatio', 'argDomanRatio', 'domainUrlRatio', 'pathDomainRatio',
            'argPathRatio', 'executable', 'isPortEighty', 'NumberofDotsinURL',
            'ISIpAddressInDomainName', 'CharacterContinuityRate',
            'LongestVariableValue', 'URL_DigitCount', 'host_DigitCount',
            'Directory_DigitCount', 'File_name_DigitCount', 'Extension_DigitCount',
            'Query_DigitCount', 'URL_Letter_Count', 'host_letter_count',
            'Directory_LetterCount', 'Filename_LetterCount',
            'Extension_LetterCount', 'Query_LetterCount', 'LongestPathTokenLength',
            'Domain_LongestWordLength', 'Path_LongestWordLength',
            'sub-Directory_LongestWordLength', 'Arguments_LongestWordLength',
            'URL_sensitiveWord', 'URLQueries_variable', 'spcharUrl',
            'delimeter_Domain', 'delimeter_path', 'delimeter_Count',
            'NumberRate_URL', 'NumberRate_Domain', 'NumberRate_DirectoryName',
            'NumberRate_FileName', 'NumberRate_Extension', 'NumberRate_AfterPath',
            'SymbolCount_URL', 'SymbolCount_Domain', 'SymbolCount_Directoryname',
            'SymbolCount_FileName', 'SymbolCount_Extension',
            'SymbolCount_Afterpath', 'Entropy_URL', 'Entropy_Domain',
            'Entropy_DirectoryName', 'Entropy_Filename', 'Entropy_Extension',
            'Entropy_Afterpath', 'URL_Type_obf_Type']

# Clean the dataset: keep only rows whose numeric feature values are all finite
# (np.isfinite is False for NaN and for +/-inf).
df_cleaned = df.copy()
# astype(str) also turns a missing TLD into the literal string 'nan', which is
# then label-encoded as a category of its own.
df_cleaned['tld'] = df_cleaned['tld'].astype(str)
numeric_features = [f for f in features if f != 'tld']
finite_rows = np.isfinite(df_cleaned[numeric_features]).all(axis=1)
# .copy() yields an independent frame, so the column assignments below do not
# raise pandas' SettingWithCopyWarning.
df_cleaned = df_cleaned.loc[finite_rows].copy()

# Label encoding for TLD (since it's categorical).
# The encoder sees every row so that no TLD in the test split is "unseen";
# the resulting integer codes carry no meaningful order.
df_cleaned['tld_encoded'] = LabelEncoder().fit_transform(df_cleaned['tld'])

# Binary classification target: 0 = benign, 1 = anything else (malicious)
df_cleaned['binary_label'] = df_cleaned['url_type'].ne('benign').astype(int)

# Combine all features (without the original 'tld' column)
X = df_cleaned[numeric_features + ['tld_encoded']]

# Train-Test Split for Binary Classification (Benign vs Malicious).
# The split is done on the unscaled features so the scaler can be fitted on the
# training rows only.
X_train_bin_raw, X_test_bin_raw, y_train_bin, y_test_bin = train_test_split(
    X, df_cleaned['binary_label'], test_size=0.3, random_state=42,
    stratify=df_cleaned['binary_label']
)

# Standardize features (important for SVM). Fitting on the training split only
# keeps test rows from influencing the mean/variance used for scaling.
scaler = StandardScaler()
X_train_bin = scaler.fit_transform(X_train_bin_raw)
X_test_bin = scaler.transform(X_test_bin_raw)
# X_scaled keeps its original name for any later code that reads it; it is now
# scaled with the training-split statistics.
X_scaled = scaler.transform(X)

# Apply SMOTE to the training set only, so the test set keeps its real class
# distribution.
# NOTE(review): SMOTE interpolates every column, including the ordinal
# 'tld_encoded' code - consider a categorical-aware sampler if that matters.
smote = SMOTE(random_state=42)
X_train_bin_smote, y_train_bin_smote = smote.fit_resample(X_train_bin, y_train_bin)

# Initialize the SVM classifier for binary classification
svm_binary_classifier = SVC(random_state=42)

# Fit the model on the SMOTE-augmented training data for binary classification
svm_binary_classifier.fit(X_train_bin_smote, y_train_bin_smote)

# Make predictions on the (resampled) training data and on the test data
y_train_pred_bin = svm_binary_classifier.predict(X_train_bin_smote)
y_test_pred_bin = svm_binary_classifier.predict(X_test_bin)

# Evaluate the model on the training data for binary classification.
# These figures are computed on the SMOTE-resampled training set, i.e. they
# include the synthetic samples.
print("Binary Classification Report (Benign vs Malicious) - Training Data:")
print(classification_report(y_train_bin_smote, y_train_pred_bin))
print("Binary Classification Accuracy (Training):", accuracy_score(y_train_bin_smote, y_train_pred_bin))

# Evaluate the model on the test data for binary classification
print("\nBinary Classification Report (Benign vs Malicious) - Test Data:")
print(classification_report(y_test_bin, y_test_pred_bin))
print("Binary Classification Accuracy (Test):", accuracy_score(y_test_bin, y_test_pred_bin))

# Multiclass Classification for types of malicious URLs
# Only consider the non-benign entries
malicious_df = df_cleaned[df_cleaned['binary_label'] == 1].copy()

# Encode the url_type strings of the malicious rows as integer class ids
label_encoder = LabelEncoder()
malicious_df['url_type_encoded'] = label_encoder.fit_transform(malicious_df['url_type'])
# Human-readable names for the classification reports, aligned with the ids.
class_names = [str(c) for c in label_encoder.classes_]
class_ids = list(range(len(class_names)))

# Select features and target for multiclass classification
X_multi = malicious_df[numeric_features + ['tld_encoded']]
y_multi = malicious_df['url_type_encoded']

# Train-Test Split for Multiclass Classification (on unscaled features, so the
# scaler below can be fitted on the training rows only)
X_train_multi_raw, X_test_multi_raw, y_train_multi, y_test_multi = train_test_split(
    X_multi, y_multi, test_size=0.3, random_state=42, stratify=y_multi
)

# A dedicated scaler is fitted on the malicious-only training split, so neither
# benign rows nor held-out rows shape the scaling statistics.
multi_scaler = StandardScaler()
X_train_multi = multi_scaler.fit_transform(X_train_multi_raw)
X_test_multi = multi_scaler.transform(X_test_multi_raw)
# Kept under its original name for any later code that reads it.
X_multi_scaled = multi_scaler.transform(X_multi)

# Apply SMOTE to the multiclass training set only (the sampler created in the
# binary section is re-fitted here by fit_resample).
X_train_multi_smote, y_train_multi_smote = smote.fit_resample(X_train_multi, y_train_multi)

# Initialize the SVM classifier for multiclass classification
svm_multiclass_classifier = SVC(random_state=42)

# Fit the model on the SMOTE-augmented training data for multiclass classification
svm_multiclass_classifier.fit(X_train_multi_smote, y_train_multi_smote)

# Make predictions on the (resampled) training data and on the test data
y_train_pred_multi = svm_multiclass_classifier.predict(X_train_multi_smote)
y_test_pred_multi = svm_multiclass_classifier.predict(X_test_multi)

# Evaluate the model on the training data for multiclass classification.
# These figures are computed on the SMOTE-resampled training set, i.e. they
# include the synthetic samples.
print("\nMulticlass Classification Report (Malicious Type) - Training Data:")
print(classification_report(y_train_multi_smote, y_train_pred_multi,
                            labels=class_ids, target_names=class_names))
print("Multiclass Classification Accuracy (Training):", accuracy_score(y_train_multi_smote, y_train_pred_multi))

# Evaluate the model on the test data for multiclass classification
print("\nMulticlass Classification Report (Malicious Type) - Test Data:")
print(classification_report(y_test_multi, y_test_pred_multi,
                            labels=class_ids, target_names=class_names))
print("Multiclass Classification Accuracy (Test):", accuracy_score(y_test_multi, y_test_pred_multi))

# Summary of overall results (binary figures come from the binary section above)
print("\nOverall Results Summary:")
print(f"Binary Classification - Training Accuracy: {accuracy_score(y_train_bin_smote, y_train_pred_bin):.4f}")
print(f"Binary Classification - Test Accuracy: {accuracy_score(y_test_bin, y_test_pred_bin):.4f}")
print(f"Multiclass Classification - Training Accuracy: {accuracy_score(y_train_multi_smote, y_train_pred_multi):.4f}")
print(f"Multiclass Classification - Test Accuracy: {accuracy_score(y_test_multi, y_test_pred_multi):.4f}")

# Show which integer id each url_type was encoded to
print("\nEncoded URL Types:")
print(malicious_df[['url_type', 'url_type_encoded']].drop_duplicates().reset_index(drop=True))