<h3>Simple RF</h3>

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

# Load the dataset
df = pd.read_csv(r'C:\Users\arell\Documents\1_ALF\data\malicious_2021.csv', low_memory=False)

# Select features and target columns
features = ['Querylength', 'domain_token_count', 'path_token_count',
            'avgdomaintokenlen', 'longdomaintokenlen', 'avgpathtokenlen', 'tld',
            'charcompvowels', 'charcompace', 'ldl_url', 'ldl_domain', 'ldl_path',
            'ldl_filename', 'ldl_getArg', 'dld_url', 'dld_domain', 'dld_path',
            'dld_filename', 'dld_getArg', 'urlLen', 'domainlength', 'pathLength',
            'subDirLen', 'fileNameLen', 'this.fileExtLen', 'ArgLen', 'pathurlRatio',
            'ArgUrlRatio', 'argDomanRatio', 'domainUrlRatio', 'pathDomainRatio',
            'argPathRatio', 'executable', 'isPortEighty', 'NumberofDotsinURL',
            'ISIpAddressInDomainName', 'CharacterContinuityRate',
            'LongestVariableValue', 'URL_DigitCount', 'host_DigitCount',
            'Directory_DigitCount', 'File_name_DigitCount', 'Extension_DigitCount',
            'Query_DigitCount', 'URL_Letter_Count', 'host_letter_count',
            'Directory_LetterCount', 'Filename_LetterCount',
            'Extension_LetterCount', 'Query_LetterCount', 'LongestPathTokenLength',
            'Domain_LongestWordLength', 'Path_LongestWordLength',
            'sub-Directory_LongestWordLength', 'Arguments_LongestWordLength',
            'URL_sensitiveWord', 'URLQueries_variable', 'spcharUrl',
            'delimeter_Domain', 'delimeter_path', 'delimeter_Count',
            'NumberRate_URL', 'NumberRate_Domain', 'NumberRate_DirectoryName',
            'NumberRate_FileName', 'NumberRate_Extension', 'NumberRate_AfterPath',
            'SymbolCount_URL', 'SymbolCount_Domain', 'SymbolCount_Directoryname',
            'SymbolCount_FileName', 'SymbolCount_Extension',
            'SymbolCount_Afterpath', 'Entropy_URL', 'Entropy_Domain',
            'Entropy_DirectoryName', 'Entropy_Filename', 'Entropy_Extension',
            'Entropy_Afterpath', 'url_type']

# Clean the dataset by removing NaNs and infinities in numeric columns only
df_cleaned = df.copy()

# Convert 'tld' and 'url_type' to string since they're categorical
df_cleaned['tld'] = df_cleaned['tld'].astype(str)
df_cleaned['url_type'] = df_cleaned['url_type'].astype(str)

# Select only numeric features for checking infinite values
numeric_features = [f for f in features if f not in ['tld', 'url_type']]

# Apply np.isfinite only to numeric features and filter rows
df_cleaned = df_cleaned[np.isfinite(df_cleaned[numeric_features]).all(axis=1)]

# Label encoding for TLD and url_type (since they're categorical)
label_encoder_tld = LabelEncoder()
label_encoder_url_type = LabelEncoder()

df_cleaned['tld_encoded'] = label_encoder_tld.fit_transform(df_cleaned['tld'])
df_cleaned['url_type_encoded'] = label_encoder_url_type.fit_transform(df_cleaned['url_type'])

# Combine all features (numeric + 'tld_encoded') for the model
X = df_cleaned[numeric_features + ['tld_encoded']]

# Binary classification: Benign vs Malicious
df_cleaned['binary_label'] = df_cleaned['url_type'].apply(lambda x: 0 if x == 'benign' else 1)

# Train-Test Split for Binary Classification (Benign vs Malicious)
X_train_bin, X_test_bin, y_train_bin, y_test_bin = train_test_split(
    X, df_cleaned['binary_label'], test_size=0.3, random_state=42, stratify=df_cleaned['binary_label']
)

# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_train_bin, y_train_bin= smote.fit_resample(X_train_bin, y_train_bin)

# Initialize the Random Forest classifier for binary classification
rf_binary_classifier = RandomForestClassifier(random_state=42)

# Fit the model on the training data for binary classification
rf_binary_classifier.fit(X_train_bin, y_train_bin)

# Make predictions on the training and test data for binary classification
y_train_pred_bin = rf_binary_classifier.predict(X_train_bin)
y_test_pred_bin = rf_binary_classifier.predict(X_test_bin)

# Evaluate the model on the training data for binary classification
print("Binary Classification Report (Benign vs Malicious) - Training Data:")
print(classification_report(y_train_bin, y_train_pred_bin))
print("Binary Classification Accuracy (Training):", accuracy_score(y_train_bin, y_train_pred_bin))

# Evaluate the model on the test data for binary classification
print("\nBinary Classification Report (Benign vs Malicious) - Test Data:")
print(classification_report(y_test_bin, y_test_pred_bin))
print("Binary Classification Accuracy (Test):", accuracy_score(y_test_bin, y_test_pred_bin))

# Multiclass Classification for types of malicious URLs
# Only consider the non-benign entries
malicious_df = df_cleaned[df_cleaned['binary_label'] == 1].copy()

# Use the 'url_type_encoded' column for the target
X_multi = malicious_df[numeric_features + ['tld_encoded']]
y_multi = malicious_df['url_type_encoded']

# Train-Test Split for Multiclass Classification
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X_multi, y_multi, test_size=0.3, random_state=42, stratify=y_multi
)

# Adjust the multiclass labels so that they start from 0
y_train_multi = y_train_multi - 1
y_test_multi = y_test_multi - 1

# Initialize the Random Forest classifier for multiclass classification
rf_multiclass_classifier = RandomForestClassifier(random_state=42)

# Fit the model on the training data for multiclass classification
rf_multiclass_classifier.fit(X_train_multi, y_train_multi)

# Make predictions on the training and test data for multiclass classification
y_train_pred_multi = rf_multiclass_classifier.predict(X_train_multi)
y_test_pred_multi = rf_multiclass_classifier.predict(X_test_multi)

# Evaluate the model on the training data for multiclass classification
print("\nMulticlass Classification Report (Malicious Type) - Training Data:")
print(classification_report(y_train_multi, y_train_pred_multi))
print("Multiclass Classification Accuracy (Training):", accuracy_score(y_train_multi, y_train_pred_multi))

# Evaluate the model on the test data for multiclass classification
print("\nMulticlass Classification Report (Malicious Type) - Test Data:")
print(classification_report(y_test_multi, y_test_pred_multi))
print("Multiclass Classification Accuracy (Test):", accuracy_score(y_test_multi, y_test_pred_multi))

# Summary of overall results
print("\nOverall Results Summary:")
print(f"Binary Classification - Training Accuracy: {accuracy_score(y_train_bin, y_train_pred_bin):.4f}")
print(f"Binary Classification - Test Accuracy: {accuracy_score(y_test_bin, y_test_pred_bin):.4f}")
print(f"Multiclass Classification - Training Accuracy: {accuracy_score(y_train_multi, y_train_pred_multi):.4f}")
print(f"Multiclass Classification - Test Accuracy: {accuracy_score(y_test_multi, y_test_pred_multi):.4f}")

# Show encoded url_type
print("\nEncoded URL Types:")
print(malicious_df[['url_type', 'url_type_encoded']].drop_duplicates().reset_index(drop=True))


Binary Classification Report (Benign vs Malicious) - Training Data:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    299672
           1       1.00      1.00      1.00    299672

    accuracy                           1.00    599344
   macro avg       1.00      1.00      1.00    599344
weighted avg       1.00      1.00      1.00    599344

Binary Classification Accuracy (Training): 0.9995795402973918

Binary Classification Report (Benign vs Malicious) - Test Data:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98    128431
           1       0.97      0.96      0.96     66927

    accuracy                           0.97    195358
   macro avg       0.97      0.97      0.97    195358
weighted avg       0.97      0.97      0.97    195358

Binary Classification Accuracy (Test): 0.9749792688295335

Multiclass Classification Report (Malicious Type) - Training Data:
              precision  

<h3>RF, Hyperparameter, SMOTE </h3>

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from scipy.stats import randint

# Load the dataset
df = pd.read_csv(r'C:\Users\arell\Documents\1_ALF\data\malicious_2021.csv', low_memory=False)

# Select features and target columns
features = ['Querylength', 'domain_token_count', 'path_token_count',
            'avgdomaintokenlen', 'longdomaintokenlen', 'avgpathtokenlen', 'tld',
            'charcompvowels', 'charcompace', 'ldl_url', 'ldl_domain', 'ldl_path',
            'ldl_filename', 'ldl_getArg', 'dld_url', 'dld_domain', 'dld_path',
            'dld_filename', 'dld_getArg', 'urlLen', 'domainlength', 'pathLength',
            'subDirLen', 'fileNameLen', 'this.fileExtLen', 'ArgLen', 'pathurlRatio',
            'ArgUrlRatio', 'argDomanRatio', 'domainUrlRatio', 'pathDomainRatio',
            'argPathRatio', 'executable', 'isPortEighty', 'NumberofDotsinURL',
            'ISIpAddressInDomainName', 'CharacterContinuityRate',
            'LongestVariableValue', 'URL_DigitCount', 'host_DigitCount',
            'Directory_DigitCount', 'File_name_DigitCount', 'Extension_DigitCount',
            'Query_DigitCount', 'URL_Letter_Count', 'host_letter_count',
            'Directory_LetterCount', 'Filename_LetterCount',
            'Extension_LetterCount', 'Query_LetterCount', 'LongestPathTokenLength',
            'Domain_LongestWordLength', 'Path_LongestWordLength',
            'sub-Directory_LongestWordLength', 'Arguments_LongestWordLength',
            'URL_sensitiveWord', 'URLQueries_variable', 'spcharUrl',
            'delimeter_Domain', 'delimeter_path', 'delimeter_Count',
            'NumberRate_URL', 'NumberRate_Domain', 'NumberRate_DirectoryName',
            'NumberRate_FileName', 'NumberRate_Extension', 'NumberRate_AfterPath',
            'SymbolCount_URL', 'SymbolCount_Domain', 'SymbolCount_Directoryname',
            'SymbolCount_FileName', 'SymbolCount_Extension',
            'SymbolCount_Afterpath', 'Entropy_URL', 'Entropy_Domain',
            'Entropy_DirectoryName', 'Entropy_Filename', 'Entropy_Extension',
            'Entropy_Afterpath', 'url_type']

# Clean the dataset by removing NaNs and infinities in numeric columns only
df_cleaned = df.copy()

# Convert 'tld' and 'url_type' to string since they're categorical
df_cleaned['tld'] = df_cleaned['tld'].astype(str)
df_cleaned['url_type'] = df_cleaned['url_type'].astype(str)

# Select only numeric features for checking infinite values
numeric_features = [f for f in features if f not in ['tld', 'url_type']]

# Apply np.isfinite only to numeric features and filter rows
df_cleaned = df_cleaned[np.isfinite(df_cleaned[numeric_features]).all(axis=1)]

# Label encoding for TLD and url_type (since they're categorical)
label_encoder_tld = LabelEncoder()
label_encoder_url_type = LabelEncoder()

df_cleaned['tld_encoded'] = label_encoder_tld.fit_transform(df_cleaned['tld'])
df_cleaned['url_type_encoded'] = label_encoder_url_type.fit_transform(df_cleaned['url_type'])

# Combine all features (numeric + 'tld_encoded') for the model
X = df_cleaned[numeric_features + ['tld_encoded']]

# Binary classification: Benign vs Malicious
df_cleaned['binary_label'] = df_cleaned['url_type'].apply(lambda x: 0 if x == 'benign' else 1)

# Train-Test Split for Binary Classification (Benign vs Malicious)
X_train_bin, X_test_bin, y_train_bin, y_test_bin = train_test_split(
    X, df_cleaned['binary_label'], test_size=0.3, random_state=42, stratify=df_cleaned['binary_label']
)

# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_train_bin, y_train_bin = smote.fit_resample(X_train_bin, y_train_bin)

# Initialize the Random Forest classifier for binary classification
rf_binary_classifier = RandomForestClassifier(random_state=42)

# Hyperparameter tuning using GridSearchCV
grid_params = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=rf_binary_classifier, param_grid=grid_params, cv=3, n_jobs=-1)
grid_search.fit(X_train_bin, y_train_bin)

# Best parameters from GridSearch
print("Best parameters for binary classification:", grid_search.best_params_)

# Make predictions on the training and test data for binary classification
y_train_pred_bin = grid_search.predict(X_train_bin)
y_test_pred_bin = grid_search.predict(X_test_bin)

# Evaluate the model on the training data for binary classification
print("Binary Classification Report (Benign vs Malicious) - Training Data:")
print(classification_report(y_train_bin, y_train_pred_bin))
print("Binary Classification Accuracy (Training):", accuracy_score(y_train_bin, y_train_pred_bin))

# Evaluate the model on the test data for binary classification
print("\nBinary Classification Report (Benign vs Malicious) - Test Data:")
print(classification_report(y_test_bin, y_test_pred_bin))
print("Binary Classification Accuracy (Test):", accuracy_score(y_test_bin, y_test_pred_bin))

# Multiclass Classification for types of malicious URLs
# Only consider the non-benign entries
malicious_df = df_cleaned[df_cleaned['binary_label'] == 1].copy()

# Use the 'url_type_encoded' column for the target
X_multi = malicious_df[numeric_features + ['tld_encoded']]
y_multi = malicious_df['url_type_encoded']

# Train-Test Split for Multiclass Classification
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X_multi, y_multi, test_size=0.3, random_state=42, stratify=y_multi
)

# Adjust the multiclass labels so that they start from 0
y_train_multi = y_train_multi - 1
y_test_multi = y_test_multi - 1

# Initialize the Random Forest classifier for multiclass classification
rf_multiclass_classifier = RandomForestClassifier(random_state=42)

# Hyperparameter tuning using RandomizedSearchCV
random_params = {
    'n_estimators': randint(100, 200),
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5)
}

random_search = RandomizedSearchCV(estimator=rf_multiclass_classifier, param_distributions=random_params, n_iter=50, cv=3, n_jobs=-1, random_state=42)
random_search.fit(X_train_multi, y_train_multi)

# Best parameters from RandomizedSearch
print("Best parameters for multiclass classification:", random_search.best_params_)

# Make predictions on the training and test data for multiclass classification
y_train_pred_multi = random_search.predict(X_train_multi)
y_test_pred_multi = random_search.predict(X_test_multi)

# Evaluate the model on the training data for multiclass classification
print("\nMulticlass Classification Report (Malicious Type) - Training Data:")
print(classification_report(y_train_multi, y_train_pred_multi))
print("Multiclass Classification Accuracy (Training):", accuracy_score(y_train_multi, y_train_pred_multi))

# Evaluate the model on the test data for multiclass classification
print("\nMulticlass Classification Report (Malicious Type) - Test Data:")
print(classification_report(y_test_multi, y_test_pred_multi))
print("Multiclass Classification Accuracy (Test):", accuracy_score(y_test_multi, y_test_pred_multi))

# Summary of overall results
print("\nOverall Results Summary:")
print(f"Binary Classification - Training Accuracy: {accuracy_score(y_train_bin, y_train_pred_bin):.4f}")
print(f"Binary Classification - Test Accuracy: {accuracy_score(y_test_bin, y_test_pred_bin):.4f}")
print(f"Multiclass Classification - Training Accuracy: {accuracy_score(y_train_multi, y_train_pred_multi):.4f}")
print(f"Multiclass Classification - Test Accuracy: {accuracy_score(y_test_multi, y_test_pred_multi):.4f}")

# Show encoded url_type
print("\nEncoded URL Types:")
print(malicious_df[['url_type', 'url_type_encoded']].drop_duplicates().reset_index(drop=True))


Best parameters for binary classification: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Binary Classification Report (Benign vs Malicious) - Training Data:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    299672
           1       1.00      1.00      1.00    299672

    accuracy                           1.00    599344
   macro avg       1.00      1.00      1.00    599344
weighted avg       1.00      1.00      1.00    599344

Binary Classification Accuracy (Training): 0.9995812087882752

Binary Classification Report (Benign vs Malicious) - Test Data:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98    128431
           1       0.97      0.96      0.96     66927

    accuracy                           0.98    195358
   macro avg       0.97      0.97      0.97    195358
weighted avg       0.98      0.98      0.98    195358

Binary Classificatio

<h3>RF, Hyperparameter</h3>

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from scipy.stats import randint

# Load the dataset
df = pd.read_csv(r'C:\Users\arell\Documents\1_ALF\data\malicious_2021.csv', low_memory=False)

# Select features and target columns
features = ['Querylength', 'domain_token_count', 'path_token_count',
            'avgdomaintokenlen', 'longdomaintokenlen', 'avgpathtokenlen', 'tld',
            'charcompvowels', 'charcompace', 'ldl_url', 'ldl_domain', 'ldl_path',
            'ldl_filename', 'ldl_getArg', 'dld_url', 'dld_domain', 'dld_path',
            'dld_filename', 'dld_getArg', 'urlLen', 'domainlength', 'pathLength',
            'subDirLen', 'fileNameLen', 'this.fileExtLen', 'ArgLen', 'pathurlRatio',
            'ArgUrlRatio', 'argDomanRatio', 'domainUrlRatio', 'pathDomainRatio',
            'argPathRatio', 'executable', 'isPortEighty', 'NumberofDotsinURL',
            'ISIpAddressInDomainName', 'CharacterContinuityRate',
            'LongestVariableValue', 'URL_DigitCount', 'host_DigitCount',
            'Directory_DigitCount', 'File_name_DigitCount', 'Extension_DigitCount',
            'Query_DigitCount', 'URL_Letter_Count', 'host_letter_count',
            'Directory_LetterCount', 'Filename_LetterCount',
            'Extension_LetterCount', 'Query_LetterCount', 'LongestPathTokenLength',
            'Domain_LongestWordLength', 'Path_LongestWordLength',
            'sub-Directory_LongestWordLength', 'Arguments_LongestWordLength',
            'URL_sensitiveWord', 'URLQueries_variable', 'spcharUrl',
            'delimeter_Domain', 'delimeter_path', 'delimeter_Count',
            'NumberRate_URL', 'NumberRate_Domain', 'NumberRate_DirectoryName',
            'NumberRate_FileName', 'NumberRate_Extension', 'NumberRate_AfterPath',
            'SymbolCount_URL', 'SymbolCount_Domain', 'SymbolCount_Directoryname',
            'SymbolCount_FileName', 'SymbolCount_Extension',
            'SymbolCount_Afterpath', 'Entropy_URL', 'Entropy_Domain',
            'Entropy_DirectoryName', 'Entropy_Filename', 'Entropy_Extension',
            'Entropy_Afterpath', 'url_type']

# Clean the dataset by removing NaNs and infinities in numeric columns only
df_cleaned = df.copy()

# Convert 'tld' and 'url_type' to string since they're categorical
df_cleaned['tld'] = df_cleaned['tld'].astype(str)
df_cleaned['url_type'] = df_cleaned['url_type'].astype(str)

# Select only numeric features for checking infinite values
numeric_features = [f for f in features if f not in ['tld', 'url_type']]

# Apply np.isfinite only to numeric features and filter rows
df_cleaned = df_cleaned[np.isfinite(df_cleaned[numeric_features]).all(axis=1)]

# Label encoding for TLD and url_type (since they're categorical)
label_encoder_tld = LabelEncoder()
label_encoder_url_type = LabelEncoder()

df_cleaned['tld_encoded'] = label_encoder_tld.fit_transform(df_cleaned['tld'])
df_cleaned['url_type_encoded'] = label_encoder_url_type.fit_transform(df_cleaned['url_type'])

# Combine all features (numeric + 'tld_encoded') for the model
X = df_cleaned[numeric_features + ['tld_encoded']]

# Binary classification: Benign vs Malicious
df_cleaned['binary_label'] = df_cleaned['url_type'].apply(lambda x: 0 if x == 'benign' else 1)

# Train-Test Split for Binary Classification (Benign vs Malicious)
X_train_bin, X_test_bin, y_train_bin, y_test_bin = train_test_split(
    X, df_cleaned['binary_label'], test_size=0.3, random_state=42, stratify=df_cleaned['binary_label']
)

# Initialize the Random Forest classifier for binary classification
rf_binary_classifier = RandomForestClassifier(random_state=42)

# Hyperparameter tuning using GridSearchCV
grid_params = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=rf_binary_classifier, param_grid=grid_params, cv=3, n_jobs=-1)
grid_search.fit(X_train_bin, y_train_bin)

# Best parameters from GridSearch
print("Best parameters for binary classification:", grid_search.best_params_)

# Make predictions on the training and test data for binary classification
y_train_pred_bin = grid_search.predict(X_train_bin)
y_test_pred_bin = grid_search.predict(X_test_bin)

# Evaluate the model on the training data for binary classification
print("Binary Classification Report (Benign vs Malicious) - Training Data:")
print(classification_report(y_train_bin, y_train_pred_bin))
print("Binary Classification Accuracy (Training):", accuracy_score(y_train_bin, y_train_pred_bin))

# Evaluate the model on the test data for binary classification
print("\nBinary Classification Report (Benign vs Malicious) - Test Data:")
print(classification_report(y_test_bin, y_test_pred_bin))
print("Binary Classification Accuracy (Test):", accuracy_score(y_test_bin, y_test_pred_bin))

# Multiclass Classification for types of malicious URLs
# Only consider the non-benign entries
malicious_df = df_cleaned[df_cleaned['binary_label'] == 1].copy()

# Use the 'url_type_encoded' column for the target
X_multi = malicious_df[numeric_features + ['tld_encoded']]
y_multi = malicious_df['url_type_encoded']

# Train-Test Split for Multiclass Classification
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X_multi, y_multi, test_size=0.3, random_state=42, stratify=y_multi
)

# Adjust the multiclass labels so that they start from 0
y_train_multi = y_train_multi - 1
y_test_multi = y_test_multi - 1

# Initialize the Random Forest classifier for multiclass classification
rf_multiclass_classifier = RandomForestClassifier(random_state=42)

# Hyperparameter tuning using RandomizedSearchCV
random_params = {
    'n_estimators': randint(100, 200),
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5)
}

random_search = RandomizedSearchCV(estimator=rf_multiclass_classifier, param_distributions=random_params, n_iter=50, cv=3, n_jobs=-1, random_state=42)
random_search.fit(X_train_multi, y_train_multi)

# Best parameters from RandomizedSearch
print("Best parameters for multiclass classification:", random_search.best_params_)

# Make predictions on the training and test data for multiclass classification
y_train_pred_multi = random_search.predict(X_train_multi)
y_test_pred_multi = random_search.predict(X_test_multi)

# Evaluate the model on the training data for multiclass classification
print("\nMulticlass Classification Report (Malicious Type) - Training Data:")
print(classification_report(y_train_multi, y_train_pred_multi))
print("Multiclass Classification Accuracy (Training):", accuracy_score(y_train_multi, y_train_pred_multi))

# Evaluate the model on the test data for multiclass classification
print("\nMulticlass Classification Report (Malicious Type) - Test Data:")
print(classification_report(y_test_multi, y_test_pred_multi))
print("Multiclass Classification Accuracy (Test):", accuracy_score(y_test_multi, y_test_pred_multi))

# Summary of overall results
print("\nOverall Results Summary:")
print(f"Binary Classification - Training Accuracy: {accuracy_score(y_train_bin, y_train_pred_bin):.4f}")
print(f"Binary Classification - Test Accuracy: {accuracy_score(y_test_bin, y_test_pred_bin):.4f}")
print(f"Multiclass Classification - Training Accuracy: {accuracy_score(y_train_multi, y_train_pred_multi):.4f}")
print(f"Multiclass Classification - Test Accuracy: {accuracy_score(y_test_multi, y_test_pred_multi):.4f}")

# Show encoded url_type
print("\nEncoded URL Types:")
print(malicious_df[['url_type', 'url_type_encoded']].drop_duplicates().reset_index(drop=True))


Best parameters for binary classification: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Binary Classification Report (Benign vs Malicious) - Training Data:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99    299672
           1       0.99      0.98      0.99    156161

    accuracy                           0.99    455833
   macro avg       0.99      0.99      0.99    455833
weighted avg       0.99      0.99      0.99    455833

Binary Classification Accuracy (Training): 0.99161096278681

Binary Classification Report (Benign vs Malicious) - Test Data:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98    128431
           1       0.98      0.95      0.96     66927

    accuracy                           0.98    195358
   macro avg       0.98      0.97      0.97    195358
weighted avg       0.98      0.98      0.98    195358

Binary Classification Ac