<h3>Random Forest</h3>

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score


# Load the dataset
df = pd.read_csv(r'C:\Users\arell\Documents\1_ALF\data\malicious_2021.csv', low_memory=False)

# Select features and target columns
features = ['domain_token_count', 'path_token_count', 'avgdomaintokenlen', 'longdomaintokenlen', 'ldl_domain', 'ldl_path', 'subDirLen', 'pathurlRatio', 'argDomanRatio', 'domainUrlRatio', 'NumberofDotsinURL', 'CharacterContinuityRate', 'host_DigitCount', 'host_letter_count', 'Directory_LetterCount', 'Domain_LongestWordLength', 'sub-Directory_LongestWordLength', 'URLQueries_variable', 'delimeter_Domain', 'delimeter_Count', 'NumberRate_Domain', 'SymbolCount_URL', 'SymbolCount_Domain', 'Entropy_Domain', 'tld']

# Clean the dataset by removing NaNs and infinities in numeric columns only
df_cleaned = df.copy()
df_cleaned['tld'] = df_cleaned['tld'].astype(str)
df_cleaned['url_type'] = df_cleaned['url_type'].astype(str)

numeric_features = [f for f in features if f not in ['tld', 'url_type']]
df_cleaned = df_cleaned[np.isfinite(df_cleaned[numeric_features]).all(axis=1)]

label_encoder_tld = LabelEncoder()
label_encoder_url_type = LabelEncoder()
df_cleaned['tld_encoded'] = label_encoder_tld.fit_transform(df_cleaned['tld'])
df_cleaned['url_type_encoded'] = label_encoder_url_type.fit_transform(df_cleaned['url_type'])
X = df_cleaned[numeric_features + ['tld_encoded']]
y = df_cleaned['binary_label'] = df_cleaned['url_type'].apply(lambda x: 0 if x == 'benign' else 1)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=df_cleaned['binary_label']
)

# Initialize and fit the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Predictions
y_train_pred = rf_classifier.predict(X_train)
y_test_pred = rf_classifier.predict(X_test)

# Classification reports
print("Binary Classification Report - Training Data:")
print(classification_report(y_train, y_train_pred))
print("Training Accuracy:", accuracy_score(y_train, y_train_pred))

print("\nBinary Classification Report - Test Data:")
print(classification_report(y_test, y_test_pred))
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))

# Multiclass Classification
malicious_df = df_cleaned[df_cleaned['binary_label'] == 1].copy()
X_multi = malicious_df[numeric_features + ['tld_encoded']]
y_multi = malicious_df['url_type_encoded']
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X_multi, y_multi, test_size=0.4, random_state=42, stratify=y_multi
)
rf_multiclass_classifier = RandomForestClassifier(random_state=42)
rf_multiclass_classifier.fit(X_train_multi, y_train_multi)

# Predictions and Evaluations for Multiclass
y_train_pred_multi = rf_multiclass_classifier.predict(X_train_multi)
y_test_pred_multi = rf_multiclass_classifier.predict(X_test_multi)
print("Multiclass Classification Report (Training):")
print(classification_report(y_train_multi, y_train_pred_multi))
print("Multiclass Classification Report (Test):")
print(classification_report(y_test_multi, y_test_pred_multi))

# Accuracy Summary
train_accuracy_bin = accuracy_score(y_train, y_train_pred)
test_accuracy_bin = accuracy_score(y_test, y_test_pred)
train_accuracy_multi = accuracy_score(y_train_multi, y_train_pred_multi)
test_accuracy_multi = accuracy_score(y_test_multi, y_test_pred_multi)

print(f"Binary Classification - Train Accuracy: {train_accuracy_bin:.4f}")
print(f"Binary Classification - Test Accuracy: {test_accuracy_bin:.4f}")
print(f"Multiclass Classification - Train Accuracy: {train_accuracy_multi:.4f}")
print(f"Multiclass Classification - Test Accuracy: {test_accuracy_multi:.4f}")


Binary Classification Report - Training Data:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99    299672
           1       0.99      0.99      0.99    156161

    accuracy                           0.99    455833
   macro avg       0.99      0.99      0.99    455833
weighted avg       0.99      0.99      0.99    455833

Training Accuracy: 0.992402919490252

Binary Classification Report - Test Data:
              precision    recall  f1-score   support

           0       0.97      0.98      0.98    128431
           1       0.97      0.95      0.96     66927

    accuracy                           0.97    195358
   macro avg       0.97      0.97      0.97    195358
weighted avg       0.97      0.97      0.97    195358

Test Accuracy: 0.9722458256124653
Multiclass Classification Report (Training):
              precision    recall  f1-score   support

           1       1.00      1.00      1.00     57874
           2       1.00      0.9

In [40]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy.stats import randint

# Load the dataset
df = pd.read_csv(r'C:\Users\arell\Documents\1_ALF\data\malicious_2021.csv', low_memory=False)

# Select features and target columns
features = ['pathDomainRatio', 'domainUrlRatio', 'tld', 'Entropy_Domain', 'domain_token_count', 
            'pathurlRatio', 'delimeter_Domain', 'SymbolCount_Domain', 'ldl_domain', 'avgdomaintokenlen', 
            'longdomaintokenlen', 'Domain_LongestWordLength', 'host_letter_count', 'subDirLen', 
            'sub-Directory_LongestWordLength','avgpathtokenlen','spcharUrl','SymbolCount_URL',
            'Directory_LetterCount','ldl_path','delimeter_Count','NumberofDotsinURL','Path_LongestWordLength',
            'LongestPathTokenLength','CharacterContinuityRate','argDomanRatio','argPathRatio',
            'Entropy_Filename','Entropy_URL','path_token_count']

# Clean the dataset by removing NaNs and infinities in numeric columns only
df_cleaned = df.copy()
df_cleaned['tld'] = df_cleaned['tld'].astype(str)
df_cleaned['url_type'] = df_cleaned['url_type'].astype(str)

numeric_features = [f for f in features if f not in ['tld', 'url_type']]
df_cleaned = df_cleaned[np.isfinite(df_cleaned[numeric_features]).all(axis=1)]

label_encoder_tld = LabelEncoder()
label_encoder_url_type = LabelEncoder()
df_cleaned['tld_encoded'] = label_encoder_tld.fit_transform(df_cleaned['tld'])
df_cleaned['url_type_encoded'] = label_encoder_url_type.fit_transform(df_cleaned['url_type'])

X = df_cleaned[numeric_features + ['tld_encoded']]
y = df_cleaned['binary_label'] = df_cleaned['url_type'].apply(lambda x: 0 if x == 'benign' else 1)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=df_cleaned['binary_label']
)

# Initialize and fit the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)
# Hyperparameter tuning using Grid Search
random_params = {
    'n_estimators': randint(100, 200),
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5)
}

rf_classifier = RandomizedSearchCV(estimator=rf_classifier, param_distributions=random_params, n_iter=50, cv=3, n_jobs=-1, random_state=42)
rf_classifier.fit(X_train, y_train)

# Predictions
y_train_pred = rf_classifier.predict(X_train)
y_test_pred = rf_classifier.predict(X_test)

# Classification reports
print("Binary Classification Report - Training Data:")
print(classification_report(y_train, y_train_pred))
print("Training Accuracy:", accuracy_score(y_train, y_train_pred))

print("\nBinary Classification Report - Test Data:")
print(classification_report(y_test, y_test_pred))
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))

# Multiclass Classification
malicious_df = df_cleaned[df_cleaned['binary_label'] == 1].copy()
X_multi = malicious_df[numeric_features + ['tld_encoded']]
y_multi = malicious_df['url_type_encoded']
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X_multi, y_multi, test_size=0.3, random_state=42, stratify=y_multi
)
rf_multiclass_classifier = RandomForestClassifier(random_state=42)
rf_multiclass_classifier = RandomizedSearchCV(estimator=rf_multiclass_classifier, param_distributions=random_params, n_iter=50, cv=3, n_jobs=-1, random_state=42)
rf_multiclass_classifier.fit(X_train_multi, y_train_multi)

# Predictions and Evaluations for Multiclass
y_train_pred_multi = rf_multiclass_classifier.predict(X_train_multi)
y_test_pred_multi = rf_multiclass_classifier.predict(X_test_multi)
print("Multiclass Classification Report (Training):")
print(classification_report(y_train_multi, y_train_pred_multi))
print("Multiclass Classification Report (Test):")
print(classification_report(y_test_multi, y_test_pred_multi))

# Accuracy Summary
train_accuracy_bin = accuracy_score(y_train, y_train_pred)
test_accuracy_bin = accuracy_score(y_test, y_test_pred)
train_accuracy_multi = accuracy_score(y_train_multi, y_train_pred_multi)
test_accuracy_multi = accuracy_score(y_test_multi, y_test_pred_multi)

print(f"Binary Classification - Train Accuracy: {train_accuracy_bin:.4f}")
print(f"Binary Classification - Test Accuracy: {test_accuracy_bin:.4f}")
print(f"Multiclass Classification - Train Accuracy: {train_accuracy_multi:.4f}")
print(f"Multiclass Classification - Test Accuracy: {test_accuracy_multi:.4f}")

Binary Classification Report - Training Data:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00    299672
           1       1.00      0.99      0.99    156161

    accuracy                           0.99    455833
   macro avg       0.99      0.99      0.99    455833
weighted avg       0.99      0.99      0.99    455833

Training Accuracy: 0.9944606906476714

Binary Classification Report - Test Data:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98    128431
           1       0.97      0.95      0.96     66927

    accuracy                           0.97    195358
   macro avg       0.97      0.97      0.97    195358
weighted avg       0.97      0.97      0.97    195358

Test Accuracy: 0.9728242508625191


KeyboardInterrupt: 

In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
def evaluate_model(y_true, y_pred):
    metrics = {
        "Precision": precision_score(y_true, y_pred),
        "Recall": recall_score(y_true, y_pred),
    }
    return metrics

metrics_model1 = evaluate_model(y_test, y_test_pred)
print("Model 1 Metrics:", metrics_model1)

Model 1 Metrics: {'Precision': np.float64(0.9807471470609345), 'Recall': np.float64(0.8166958028897157)}


In [38]:
##UNDERSAMPLING

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils import resample

# Load the dataset
df = pd.read_csv(r'C:\Users\arell\Documents\1_ALF\data\malicious_2021.csv', low_memory=False)

# Select features and target columns
features = ['pathDomainRatio', 'domainUrlRatio', 'tld', 
            'Entropy_Domain', 'domain_token_count', 'pathurlRatio', 'delimeter_Domain', 'SymbolCount_Domain', 
            'ldl_domain', 'avgdomaintokenlen', 'longdomaintokenlen', 'Domain_LongestWordLength', 'host_letter_count',
              'subDirLen', 'sub-Directory_LongestWordLength','avgpathtokenlen',
            'spcharUrl','SymbolCount_URL','Directory_LetterCount','ldl_path']

# Clean the dataset by removing NaNs and infinities in numeric columns only
df_cleaned = df.copy()
df_cleaned['tld'] = df_cleaned['tld'].astype(str)
df_cleaned['url_type'] = df_cleaned['url_type'].astype(str)

numeric_features = [f for f in features if f not in ['tld', 'url_type']]
df_cleaned = df_cleaned[np.isfinite(df_cleaned[numeric_features]).all(axis=1)]

# Encode categorical features
label_encoder_tld = LabelEncoder()
label_encoder_url_type = LabelEncoder()
df_cleaned['tld_encoded'] = label_encoder_tld.fit_transform(df_cleaned['tld'])
df_cleaned['url_type_encoded'] = label_encoder_url_type.fit_transform(df_cleaned['url_type'])

# Create binary labels for malicious and benign URLs
df_cleaned['binary_label'] = df_cleaned['url_type'].apply(lambda x: 0 if x == 'benign' else 1)

# Undersampling to balance classes
df_benign = df_cleaned[df_cleaned['binary_label'] == 0]
df_malicious = df_cleaned[df_cleaned['binary_label'] == 1]

# Downsample the benign samples to 50,000
df_benign_downsampled = resample(df_benign, 
                                 replace=False,     # sample without replacement
                                 n_samples=250000,   # number of samples to match
                                 random_state=42)   # for reproducibility

# Combine undersampled majority class with minority class
df_balanced = pd.concat([df_benign_downsampled, df_malicious])

# Shuffle the balanced dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Define features and target for binary classification
X = df_balanced[numeric_features + ['tld_encoded']]
y = df_balanced['binary_label']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Initialize and fit the Random Forest classifier for binary classification
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Predictions
y_train_pred = rf_classifier.predict(X_train)
y_test_pred = rf_classifier.predict(X_test)

# Binary Classification Report
print("Binary Classification Report - Training Data:")
print(classification_report(y_train, y_train_pred))
print("Training Accuracy:", accuracy_score(y_train, y_train_pred))

print("\nBinary Classification Report - Test Data:")
print(classification_report(y_test, y_test_pred))
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))

# Multiclass Classification on malicious samples
malicious_df = df_balanced[df_balanced['binary_label'] == 1].copy()
X_multi = malicious_df[numeric_features + ['tld_encoded']]
y_multi = malicious_df['url_type_encoded']
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X_multi, y_multi, test_size=0.3, random_state=42, stratify=y_multi
)

# Initialize and fit Random Forest classifier for multiclass classification
rf_multiclass_classifier = RandomForestClassifier(random_state=42)
rf_multiclass_classifier.fit(X_train_multi, y_train_multi)

# Predictions and Evaluations for Multiclass
y_train_pred_multi = rf_multiclass_classifier.predict(X_train_multi)
y_test_pred_multi = rf_multiclass_classifier.predict(X_test_multi)
print("Multiclass Classification Report (Training):")
print(classification_report(y_train_multi, y_train_pred_multi))
print("Multiclass Classification Report (Test):")
print(classification_report(y_test_multi, y_test_pred_multi))

# Accuracy Summary
train_accuracy_bin = accuracy_score(y_train, y_train_pred)
test_accuracy_bin = accuracy_score(y_test, y_test_pred)
train_accuracy_multi = accuracy_score(y_train_multi, y_train_pred_multi)
test_accuracy_multi = accuracy_score(y_test_multi, y_test_pred_multi)

print(f"Binary Classification - Train Accuracy: {train_accuracy_bin:.4f}")
print(f"Binary Classification - Test Accuracy: {test_accuracy_bin:.4f}")
print(f"Multiclass Classification - Train Accuracy: {train_accuracy_multi:.4f}")
print(f"Multiclass Classification - Test Accuracy: {test_accuracy_multi:.4f}")



Binary Classification Report - Training Data:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98    175000
           1       0.97      0.97      0.97    156161

    accuracy                           0.98    331161
   macro avg       0.98      0.98      0.98    331161
weighted avg       0.98      0.98      0.98    331161

Training Accuracy: 0.9752748663037012

Binary Classification Report - Test Data:
              precision    recall  f1-score   support

           0       0.93      0.95      0.94     75000
           1       0.95      0.92      0.93     66927

    accuracy                           0.94    141927
   macro avg       0.94      0.94      0.94    141927
weighted avg       0.94      0.94      0.94    141927

Test Accuracy: 0.9386867896876563
Multiclass Classification Report (Training):
              precision    recall  f1-score   support

           1       1.00      1.00      1.00     67520
           2       1.00      0.

In [20]:
##SMOTE

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

# Load the dataset
df = pd.read_csv(r'C:\Users\arell\Documents\1_ALF\data\malicious_2021.csv', low_memory=False)

# Select features and target columns
features = ['pathDomainRatio', 'domainUrlRatio', 'tld', 
            'Entropy_Domain', 'domain_token_count', 'pathurlRatio', 
            'delimeter_Domain', 'SymbolCount_Domain', 'ldl_domain', 
            'avgdomaintokenlen', 'longdomaintokenlen', 'Domain_LongestWordLength', 
            'host_letter_count', 'subDirLen', 'sub-Directory_LongestWordLength']

# Clean the dataset by removing NaNs and infinities in numeric columns only
df_cleaned = df.copy()
df_cleaned['tld'] = df_cleaned['tld'].astype(str)
df_cleaned['url_type'] = df_cleaned['url_type'].astype(str)

numeric_features = [f for f in features if f not in ['tld', 'url_type']]
df_cleaned = df_cleaned[np.isfinite(df_cleaned[numeric_features]).all(axis=1)]

# Encode categorical features
label_encoder_tld = LabelEncoder()
label_encoder_url_type = LabelEncoder()
df_cleaned['tld_encoded'] = label_encoder_tld.fit_transform(df_cleaned['tld'])
df_cleaned['url_type_encoded'] = label_encoder_url_type.fit_transform(df_cleaned['url_type'])

# Create binary labels for malicious and benign URLs
df_cleaned['binary_label'] = df_cleaned['url_type'].apply(lambda x: 0 if x == 'benign' else 1)

# Define features and target for binary classification
X_binary = df_cleaned[numeric_features + ['tld_encoded']]
y_binary = df_cleaned['binary_label']

# Apply SMOTE to the binary dataset
smote = SMOTE(random_state=42)
X_resampled_bin, y_resampled_bin = smote.fit_resample(X_binary, y_binary)

# Train-Test Split for Binary Classification
X_train_bin, X_test_bin, y_train_bin, y_test_bin = train_test_split(
    X_resampled_bin, y_resampled_bin, test_size=0.3, random_state=42, stratify=y_resampled_bin
)

# Initialize and fit the Random Forest classifier for binary classification
rf_classifier_bin = RandomForestClassifier(random_state=42)
rf_classifier_bin.fit(X_train_bin, y_train_bin)

# Predictions for Binary Classification
y_train_pred_bin = rf_classifier_bin.predict(X_train_bin)
y_test_pred_bin = rf_classifier_bin.predict(X_test_bin)

# Binary Classification Report
print("Binary Classification Report - Training Data:")
print(classification_report(y_train_bin, y_train_pred_bin))
print("Training Accuracy:", accuracy_score(y_train_bin, y_train_pred_bin))

print("\nBinary Classification Report - Test Data:")
print(classification_report(y_test_bin, y_test_pred_bin))
print("Test Accuracy:", accuracy_score(y_test_bin, y_test_pred_bin))

# Multiclass Classification using SMOTE
malicious_df = df_cleaned[df_cleaned['binary_label'] == 1].copy()
X_multi = malicious_df[numeric_features + ['tld_encoded']]
y_multi = malicious_df['url_type_encoded']

# Apply SMOTE for multiclass data
X_resampled_multi, y_resampled_multi = smote.fit_resample(X_multi, y_multi)

# Train-Test Split for Multiclass Classification
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X_resampled_multi, y_resampled_multi, test_size=0.3, random_state=42, stratify=y_resampled_multi
)

# Initialize and fit the Random Forest classifier for multiclass classification
rf_classifier_multi = RandomForestClassifier(random_state=42)
rf_classifier_multi.fit(X_train_multi, y_train_multi)

# Predictions for Multiclass Classification
y_train_pred_multi = rf_classifier_multi.predict(X_train_multi)
y_test_pred_multi = rf_classifier_multi.predict(X_test_multi)

print("Multiclass Classification Report - Training Data:")
print(classification_report(y_train_multi, y_train_pred_multi))
print("Training Accuracy:", accuracy_score(y_train_multi, y_train_pred_multi))

print("\nMulticlass Classification Report - Test Data:")
print(classification_report(y_test_multi, y_test_pred_multi))
print("Test Accuracy:", accuracy_score(y_test_multi, y_test_pred_multi))

# Summary of results
train_accuracy_bin = accuracy_score(y_train_bin, y_train_pred_bin)
test_accuracy_bin = accuracy_score(y_test_bin, y_test_pred_bin)
train_accuracy_multi = accuracy_score(y_train_multi, y_train_pred_multi)
test_accuracy_multi = accuracy_score(y_test_multi, y_test_pred_multi)

print(f"Binary Classification - Train Accuracy: {train_accuracy_bin:.4f}")
print(f"Binary Classification - Test Accuracy: {test_accuracy_bin:.4f}")
print(f"Multiclass Classification - Train Accuracy: {train_accuracy_multi:.4f}")
print(f"Multiclass Classification - Test Accuracy: {test_accuracy_multi:.4f}")


Binary Classification Report - Training Data:
              precision    recall  f1-score   support

           0       0.87      0.97      0.92    299672
           1       0.97      0.85      0.91    299672

    accuracy                           0.91    599344
   macro avg       0.92      0.91      0.91    599344
weighted avg       0.92      0.91      0.91    599344

Training Accuracy: 0.9115082490189274

Binary Classification Report - Test Data:
              precision    recall  f1-score   support

           0       0.86      0.97      0.91    128431
           1       0.97      0.85      0.90    128431

    accuracy                           0.91    256862
   macro avg       0.91      0.91      0.91    256862
weighted avg       0.91      0.91      0.91    256862

Test Accuracy: 0.9079933972327553
Multiclass Classification Report - Training Data:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00     67520
           2       1.00   