In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset (absolute, machine-specific path; adjust when running elsewhere)
df = pd.read_csv(r'C:\Users\arell\Documents\1_ALF\data\malicious_2021.csv', low_memory=False)

# Select features and target columns
features = ['pathDomainRatio', 'domainUrlRatio', 'tld', 
            'Entropy_Domain', 'domain_token_count', 'pathurlRatio', 'delimeter_Domain', 
            'SymbolCount_Domain', 'ldl_domain', 'avgdomaintokenlen', 'longdomaintokenlen', 
            'Domain_LongestWordLength', 'host_letter_count',
              'subDirLen', 'sub-Directory_LongestWordLength','avgpathtokenlen',
            'spcharUrl','SymbolCount_URL','Directory_LetterCount','ldl_path']

# Rows without a url_type cannot be labelled. Drop them BEFORE the string cast
# below: astype(str) turns NaN into the literal 'nan', which the
# "anything that is not 'benign'" rule would then count as malicious.
df_cleaned = df.dropna(subset=['url_type']).copy()
# A missing tld is kept on purpose: it becomes the string 'nan' and is encoded
# as its own category.
df_cleaned['tld'] = df_cleaned['tld'].astype(str)
df_cleaned['url_type'] = df_cleaned['url_type'].astype(str)

# Keep only rows whose numeric features are all finite (removes NaN and +/-inf).
numeric_features = [f for f in features if f not in ['tld', 'url_type']]
finite_rows = np.isfinite(df_cleaned[numeric_features]).all(axis=1)
# Explicit copy so the column assignments below write to an independent frame
# rather than to a filtered view (avoids SettingWithCopyWarning).
df_cleaned = df_cleaned.loc[finite_rows].copy()

# Integer-encode the categorical columns. url_type_encoded covers every class,
# 'benign' included.
label_encoder_tld = LabelEncoder()
label_encoder_url_type = LabelEncoder()
df_cleaned['tld_encoded'] = label_encoder_tld.fit_transform(df_cleaned['tld'])
df_cleaned['url_type_encoded'] = label_encoder_url_type.fit_transform(df_cleaned['url_type'])

# Binary target: 0 for 'benign', 1 for every other url_type (vectorized).
df_cleaned['binary_label'] = df_cleaned['url_type'].ne('benign').astype('int64')

X = df_cleaned[numeric_features + ['tld_encoded']]
y = df_cleaned['binary_label']

# Train-Test Split (70/30, stratified on the binary label, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Binary model: an XGBoost classifier with library-default hyperparameters,
# seeded for reproducibility, trained on the training split.
xgb_classifier = XGBClassifier(random_state=42)
xgb_classifier.fit(X_train, y_train)

# Predict on both splits so the train/test gap can be compared.
y_train_pred, y_test_pred = (
    xgb_classifier.predict(X_train),
    xgb_classifier.predict(X_test),
)

# Per-class report followed by overall accuracy, training split first.
for header, accuracy_label, y_true, y_pred in (
    ("Binary Classification Report - Training Data:", "Training Accuracy:", y_train, y_train_pred),
    ("\nBinary Classification Report - Test Data:", "Test Accuracy:", y_test, y_test_pred),
):
    print(header)
    print(classification_report(y_true, y_pred))
    print(accuracy_label, accuracy_score(y_true, y_pred))

# Multiclass Classification: predict the specific url_type of malicious URLs.
# Only rows with binary_label == 1 (everything except 'benign') take part.
malicious_df = df_cleaned[df_cleaned['binary_label'] == 1].copy()
X_multi = malicious_df[numeric_features + ['tld_encoded']]

# XGBoost expects class ids to be exactly 0..K-1. 'url_type_encoded' was fitted
# with 'benign' included, so shifting it by its minimum only gives contiguous
# ids when 'benign' happens to sort first or last among the labels. Re-encode
# on the malicious subset instead; in those cases the ids are unchanged.
label_encoder_multi = LabelEncoder()
y_multi = pd.Series(
    label_encoder_multi.fit_transform(malicious_df['url_type']),
    index=malicious_df.index,
    name='url_type_encoded',
)
multi_class_names = [str(name) for name in label_encoder_multi.classes_]
multi_class_ids = list(range(len(multi_class_names)))

X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X_multi, y_multi, test_size=0.3, random_state=42, stratify=y_multi
)
xgb_multi_classifier = XGBClassifier(random_state=42)
xgb_multi_classifier.fit(X_train_multi, y_train_multi)

# Predictions and Evaluations for Multiclass.
# labels/target_names make the report rows show the url_type names rather
# than opaque integer ids.
y_train_pred_multi = xgb_multi_classifier.predict(X_train_multi)
y_test_pred_multi = xgb_multi_classifier.predict(X_test_multi)
print("Multiclass Classification Report (Training):")
print(classification_report(y_train_multi, y_train_pred_multi,
                            labels=multi_class_ids, target_names=multi_class_names))
print("Multiclass Classification Report (Test):")
print(classification_report(y_test_multi, y_test_pred_multi,
                            labels=multi_class_ids, target_names=multi_class_names))

# Accuracy summary: one train/test pair per model, kept in named variables
# and printed to four decimal places.
train_accuracy_bin, test_accuracy_bin = (
    accuracy_score(y_train, y_train_pred),
    accuracy_score(y_test, y_test_pred),
)
train_accuracy_multi, test_accuracy_multi = (
    accuracy_score(y_train_multi, y_train_pred_multi),
    accuracy_score(y_test_multi, y_test_pred_multi),
)

# Binary model first, then the multiclass model.
print(f"Binary Classification - Train Accuracy: {train_accuracy_bin:.4f}")
print(f"Binary Classification - Test Accuracy: {test_accuracy_bin:.4f}")

print(f"Multiclass Classification - Train Accuracy: {train_accuracy_multi:.4f}")
print(f"Multiclass Classification - Test Accuracy: {test_accuracy_multi:.4f}")

#Metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
def evaluate_model(y_true, y_pred, average="binary"):
    """Compute precision and recall for a set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        average: Averaging mode forwarded to scikit-learn's
            ``precision_score`` and ``recall_score``. The default
            ``"binary"`` keeps the original behaviour (score of the positive
            class, two-class targets only). Pass e.g. ``"macro"`` or
            ``"weighted"`` to score multiclass predictions.

    Returns:
        dict with the keys ``"Precision"`` and ``"Recall"``.
    """
    return {
        "Precision": precision_score(y_true, y_pred, average=average),
        "Recall": recall_score(y_true, y_pred, average=average),
    }

# Precision/recall of the binary model on the held-out test split.
metrics_model1 = evaluate_model(y_test, y_test_pred)
print("Model 1 Metrics:", metrics_model1)

Binary Classification Report - Training Data:
              precision    recall  f1-score   support

           0       0.93      0.98      0.96    299672
           1       0.97      0.87      0.91    156161

    accuracy                           0.94    455833
   macro avg       0.95      0.93      0.94    455833
weighted avg       0.95      0.94      0.94    455833

Training Accuracy: 0.9438917322791461

Binary Classification Report - Test Data:
              precision    recall  f1-score   support

           0       0.93      0.98      0.96    128431
           1       0.96      0.87      0.91     66927

    accuracy                           0.94    195358
   macro avg       0.95      0.92      0.93    195358
weighted avg       0.94      0.94      0.94    195358

Test Accuracy: 0.942633524094227
Multiclass Classification Report (Training):
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     67520
           1       0.99      0.9

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from scipy.stats import randint
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# Load the dataset (absolute, machine-specific path; adjust when running elsewhere)
df = pd.read_csv(r'C:\Users\arell\Documents\1_ALF\data\malicious_2021.csv', low_memory=False)

# Select features and target columns
features = ['pathDomainRatio', 'domainUrlRatio', 'tld', 
            'Entropy_Domain', 'domain_token_count', 'pathurlRatio', 'delimeter_Domain', 
            'SymbolCount_Domain', 'ldl_domain', 'avgdomaintokenlen', 'longdomaintokenlen', 
            'Domain_LongestWordLength', 'host_letter_count',
              'subDirLen', 'sub-Directory_LongestWordLength','avgpathtokenlen',
            'spcharUrl','SymbolCount_URL','Directory_LetterCount','ldl_path']

# Rows without a url_type cannot be labelled. Drop them BEFORE the string cast
# below: astype(str) turns NaN into the literal 'nan', which the
# "anything that is not 'benign'" rule would then count as malicious.
df_cleaned = df.dropna(subset=['url_type']).copy()
# A missing tld is kept on purpose: it becomes the string 'nan' and is encoded
# as its own category.
df_cleaned['tld'] = df_cleaned['tld'].astype(str)
df_cleaned['url_type'] = df_cleaned['url_type'].astype(str)

# Keep only rows whose numeric features are all finite (removes NaN and +/-inf).
numeric_features = [f for f in features if f not in ['tld', 'url_type']]
finite_rows = np.isfinite(df_cleaned[numeric_features]).all(axis=1)
# Explicit copy so the column assignments below write to an independent frame
# rather than to a filtered view (avoids SettingWithCopyWarning).
df_cleaned = df_cleaned.loc[finite_rows].copy()

# Integer-encode the categorical columns. url_type_encoded covers every class,
# 'benign' included.
label_encoder_tld = LabelEncoder()
label_encoder_url_type = LabelEncoder()
df_cleaned['tld_encoded'] = label_encoder_tld.fit_transform(df_cleaned['tld'])
df_cleaned['url_type_encoded'] = label_encoder_url_type.fit_transform(df_cleaned['url_type'])

# Binary target: 0 for 'benign', 1 for every other url_type (vectorized).
df_cleaned['binary_label'] = df_cleaned['url_type'].ne('benign').astype('int64')

X = df_cleaned[numeric_features + ['tld_encoded']]
y = df_cleaned['binary_label']

# Train-Test Split (70/30, stratified on the binary label, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Search space for the randomized hyperparameter search.
# n_estimators is drawn as an integer from [100, 200); the other entries are
# sampled from the listed values.
random_params = {
    'n_estimators': randint(100, 200),
    'max_depth': [None, 10, 20, 30],
    'learning_rate': [0.01, 0.05, 0.1],
    'min_child_weight': [1, 5, 10],
}

# Binary model: an XGBoost classifier tuned with RandomizedSearchCV
# (50 sampled candidates, 3-fold cross-validation, all cores, fixed seed).
# The name xgb_classifier ends up bound to the fitted search object.
xgb_classifier = RandomizedSearchCV(
    estimator=XGBClassifier(random_state=42),
    param_distributions=random_params,
    n_iter=50,
    cv=3,
    n_jobs=-1,
    random_state=42,
)
xgb_classifier.fit(X_train, y_train)

# Predict on both splits so the train/test gap can be compared.
y_train_pred, y_test_pred = (
    xgb_classifier.predict(X_train),
    xgb_classifier.predict(X_test),
)

# Per-class report followed by overall accuracy, training split first.
for header, accuracy_label, y_true, y_pred in (
    ("Binary Classification Report - Training Data:", "Training Accuracy:", y_train, y_train_pred),
    ("\nBinary Classification Report - Test Data:", "Test Accuracy:", y_test, y_test_pred),
):
    print(header)
    print(classification_report(y_true, y_pred))
    print(accuracy_label, accuracy_score(y_true, y_pred))

# Multiclass Classification: predict the specific url_type of malicious URLs.
# Only rows with binary_label == 1 (everything except 'benign') take part.
malicious_df = df_cleaned[df_cleaned['binary_label'] == 1].copy()
X_multi = malicious_df[numeric_features + ['tld_encoded']]

# XGBoost expects class ids to be exactly 0..K-1. 'url_type_encoded' was fitted
# with 'benign' included, so shifting it by its minimum only gives contiguous
# ids when 'benign' happens to sort first or last among the labels. Re-encode
# on the malicious subset instead; in those cases the ids are unchanged.
label_encoder_multi = LabelEncoder()
y_multi = pd.Series(
    label_encoder_multi.fit_transform(malicious_df['url_type']),
    index=malicious_df.index,
    name='url_type_encoded',
)
multi_class_names = [str(name) for name in label_encoder_multi.classes_]
multi_class_ids = list(range(len(multi_class_names)))

X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X_multi, y_multi, test_size=0.3, random_state=42, stratify=y_multi
)

# Same randomized search as the binary model (reuses random_params):
# 50 sampled candidates, 3-fold cross-validation, fixed seed.
xgb_multi_classifier = RandomizedSearchCV(
    estimator=XGBClassifier(random_state=42),
    param_distributions=random_params,
    n_iter=50,
    cv=3,
    n_jobs=-1,
    random_state=42,
)
xgb_multi_classifier.fit(X_train_multi, y_train_multi)

# Predictions and Evaluations for Multiclass.
# labels/target_names make the report rows show the url_type names rather
# than opaque integer ids.
y_train_pred_multi = xgb_multi_classifier.predict(X_train_multi)
y_test_pred_multi = xgb_multi_classifier.predict(X_test_multi)
print("Multiclass Classification Report (Training):")
print(classification_report(y_train_multi, y_train_pred_multi,
                            labels=multi_class_ids, target_names=multi_class_names))
print("Multiclass Classification Report (Test):")
print(classification_report(y_test_multi, y_test_pred_multi,
                            labels=multi_class_ids, target_names=multi_class_names))

# Accuracy summary: one train/test pair per model, kept in named variables
# and printed to four decimal places.
train_accuracy_bin, test_accuracy_bin = (
    accuracy_score(y_train, y_train_pred),
    accuracy_score(y_test, y_test_pred),
)
train_accuracy_multi, test_accuracy_multi = (
    accuracy_score(y_train_multi, y_train_pred_multi),
    accuracy_score(y_test_multi, y_test_pred_multi),
)

# Binary model first, then the multiclass model.
print(f"Binary Classification - Train Accuracy: {train_accuracy_bin:.4f}")
print(f"Binary Classification - Test Accuracy: {test_accuracy_bin:.4f}")

print(f"Multiclass Classification - Train Accuracy: {train_accuracy_multi:.4f}")
print(f"Multiclass Classification - Test Accuracy: {test_accuracy_multi:.4f}")


Binary Classification Report - Training Data:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97    299672
           1       0.97      0.93      0.95    156161

    accuracy                           0.96    455833
   macro avg       0.96      0.95      0.96    455833
weighted avg       0.96      0.96      0.96    455833

Training Accuracy: 0.9637389131545983

Binary Classification Report - Test Data:
              precision    recall  f1-score   support

           0       0.95      0.98      0.96    128431
           1       0.95      0.90      0.93     66927

    accuracy                           0.95    195358
   macro avg       0.95      0.94      0.94    195358
weighted avg       0.95      0.95      0.95    195358

Test Accuracy: 0.9501888839975839
Multiclass Classification Report (Training):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     67520
           1       1.00      0.

In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
def evaluate_model(y_true, y_pred, average="binary"):
    """Compute precision and recall for a set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        average: Averaging mode forwarded to scikit-learn's
            ``precision_score`` and ``recall_score``. The default
            ``"binary"`` keeps the original behaviour (score of the positive
            class, two-class targets only). Pass e.g. ``"macro"`` or
            ``"weighted"`` to score multiclass predictions.

    Returns:
        dict with the keys ``"Precision"`` and ``"Recall"``.
    """
    return {
        "Precision": precision_score(y_true, y_pred, average=average),
        "Recall": recall_score(y_true, y_pred, average=average),
    }

# NOTE: y_test and y_test_pred are not defined in this cell. They come from
# whichever training cell was executed last, so re-run that cell first to keep
# these numbers in sync with its classification report.
metrics_model1 = evaluate_model(y_test, y_test_pred)
print("Model 1 Metrics:", metrics_model1)

Model 1 Metrics: {'Precision': np.float64(0.9828551875664091), 'Recall': np.float64(0.8154407040506821)}
