In [3]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset, handle mixed types by coercing invalid entries to NaN for column 6
converted_features_df = pd.read_csv('C:\\Users\\octliva\\NIDS\\converted_features.csv', low_memory=False)

# Coerce column 6 to numeric, converting invalid entries to NaN
converted_features_df.iloc[:, 6] = pd.to_numeric(converted_features_df.iloc[:, 6], errors='coerce')

# Separate features (X) and target (y)
X = converted_features_df.drop('url_type', axis=1)
y = converted_features_df['url_type']

# Identify categorical features
categorical_features = X.select_dtypes(include=['object']).columns

# Apply one-hot encoding to categorical features
X = pd.get_dummies(X, columns=categorical_features)

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Check the class distribution after SMOTE
print("Class distribution before SMOTE:", Counter(y))
print("Class distribution after SMOTE:", Counter(y_resampled))

# Define hyperparameter grid for Decision Tree
param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10],
    'criterion': ['gini', 'entropy']
}

# 1. Benign vs. Malicious Classification

# Create a binary target variable (1 for malicious, 0 for benign)
y_binary = [1 if label != 'benign' else 0 for label in y_resampled]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_binary, test_size=0.3, random_state=42)

# Initialize Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(random_state=42)

# Hyperparameter tuning using GridSearchCV
grid_search = GridSearchCV(estimator=dt_classifier, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best estimator after hyperparameter tuning
best_dt_classifier = grid_search.best_estimator_

# Make predictions
y_pred = best_dt_classifier.predict(X_test)

# Evaluate the model
print("Benign vs. Malicious Classification:")
print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# 2. Type of Malicious Classification

# Filter out benign URLs from the resampled data
X_malicious = X_resampled[[label != 'benign' for label in y_resampled]]
y_malicious = [label for label in y_resampled if label != 'benign']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_malicious, y_malicious, test_size=0.2, random_state=42)

# Hyperparameter tuning for type of malicious classification
grid_search.fit(X_train, y_train)

# Best estimator after hyperparameter tuning
best_dt_classifier = grid_search.best_estimator_

# Make predictions
y_pred = best_dt_classifier.predict(X_test)

# Evaluate the model
print("\nType of Malicious Classification:")
print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\octliva\\NIDS\\converted_features.csv'

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from scipy.stats import randint

# Load the dataset
df = pd.read_csv(r'C:\Users\arell\Documents\1_ALF\data\malicious_2021.csv', low_memory=False)

# Select features and target columns
features = ['Querylength', 'domain_token_count', 'path_token_count',
            'avgdomaintokenlen', 'longdomaintokenlen', 'avgpathtokenlen', 'tld',
            'charcompvowels', 'charcompace', 'ldl_url', 'ldl_domain', 'ldl_path',
            'ldl_filename', 'ldl_getArg', 'dld_url', 'dld_domain', 'dld_path',
            'dld_filename', 'dld_getArg', 'urlLen', 'domainlength', 'pathLength',
            'subDirLen', 'fileNameLen', 'this.fileExtLen', 'ArgLen', 'pathurlRatio',
            'ArgUrlRatio', 'argDomanRatio', 'domainUrlRatio', 'pathDomainRatio',
            'argPathRatio', 'executable', 'isPortEighty', 'NumberofDotsinURL',
            'ISIpAddressInDomainName', 'CharacterContinuityRate',
            'LongestVariableValue', 'URL_DigitCount', 'host_DigitCount',
            'Directory_DigitCount', 'File_name_DigitCount', 'Extension_DigitCount',
            'Query_DigitCount', 'URL_Letter_Count', 'host_letter_count',
            'Directory_LetterCount', 'Filename_LetterCount',
            'Extension_LetterCount', 'Query_LetterCount', 'LongestPathTokenLength',
            'Domain_LongestWordLength', 'Path_LongestWordLength',
            'sub-Directory_LongestWordLength', 'Arguments_LongestWordLength',
            'URL_sensitiveWord', 'URLQueries_variable', 'spcharUrl',
            'delimeter_Domain', 'delimeter_path', 'delimeter_Count',
            'NumberRate_URL', 'NumberRate_Domain', 'NumberRate_DirectoryName',
            'NumberRate_FileName', 'NumberRate_Extension', 'NumberRate_AfterPath',
            'SymbolCount_URL', 'SymbolCount_Domain', 'SymbolCount_Directoryname',
            'SymbolCount_FileName', 'SymbolCount_Extension',
            'SymbolCount_Afterpath', 'Entropy_URL', 'Entropy_Domain',
            'Entropy_DirectoryName', 'Entropy_Filename', 'Entropy_Extension',
            'Entropy_Afterpath', 'url_type']

# Clean the dataset by removing NaNs and infinities in numeric columns only
df_cleaned = df.copy()

# Convert 'tld' and 'url_type' to string since they're categorical
df_cleaned['tld'] = df_cleaned['tld'].astype(str)
df_cleaned['url_type'] = df_cleaned['url_type'].astype(str)

# Select only numeric features for checking infinite values
numeric_features = [f for f in features if f not in ['tld', 'url_type']]

# Apply np.isfinite only to numeric features and filter rows
df_cleaned = df_cleaned[np.isfinite(df_cleaned[numeric_features]).all(axis=1)]

# Label encoding for TLD and url_type (since they're categorical)
label_encoder_tld = LabelEncoder()
label_encoder_url_type = LabelEncoder()

df_cleaned['tld_encoded'] = label_encoder_tld.fit_transform(df_cleaned['tld'])
df_cleaned['url_type_encoded'] = label_encoder_url_type.fit_transform(df_cleaned['url_type'])

# Combine all features (numeric + 'tld_encoded') for the model
X = df_cleaned[numeric_features + ['tld_encoded']]

# Binary classification: Benign vs Malicious
df_cleaned['binary_label'] = df_cleaned['url_type'].apply(lambda x: 0 if x == 'benign' else 1)

# Train-Test Split for Binary Classification (Benign vs Malicious)
X_train_bin, X_test_bin, y_train_bin, y_test_bin = train_test_split(
    X, df_cleaned['binary_label'], test_size=0.3, random_state=42, stratify=df_cleaned['binary_label']
)

# Initialize the Random Forest classifier for binary classification
dt_classifier = DecisionTreeClassifier(random_state=42)

# Hyperparameter tuning using GridSearchCV
grid_params = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10],
    'criterion': ['gini', 'entropy']
}

grid_search = GridSearchCV(estimator=dt_classifier, param_grid=grid_params, cv=3, n_jobs=-1)
grid_search.fit(X_train_bin, y_train_bin)

# Best parameters from GridSearch
print("Best parameters for binary classification:", grid_search.best_params_)

# Make predictions on the training and test data for binary classification
y_train_pred_bin = grid_search.predict(X_train_bin)
y_test_pred_bin = grid_search.predict(X_test_bin)

# Evaluate the model on the training data for binary classification
print("Binary Classification Report (Benign vs Malicious) - Training Data:")
print(classification_report(y_train_bin, y_train_pred_bin))
print("Binary Classification Accuracy (Training):", accuracy_score(y_train_bin, y_train_pred_bin))

# Evaluate the model on the test data for binary classification
print("\nBinary Classification Report (Benign vs Malicious) - Test Data:")
print(classification_report(y_test_bin, y_test_pred_bin))
print("Binary Classification Accuracy (Test):", accuracy_score(y_test_bin, y_test_pred_bin))

# Multiclass Classification for types of malicious URLs
# Only consider the non-benign entries
malicious_df = df_cleaned[df_cleaned['binary_label'] == 1].copy()

# Use the 'url_type_encoded' column for the target
X_multi = malicious_df[numeric_features + ['tld_encoded']]
y_multi = malicious_df['url_type_encoded']

# Train-Test Split for Multiclass Classification
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X_multi, y_multi, test_size=0.3, random_state=42, stratify=y_multi
)

# Adjust the multiclass labels so that they start from 0
y_train_multi = y_train_multi - 1
y_test_multi = y_test_multi - 1

# Initialize the Random Forest classifier for multiclass classification
rf_multiclass_classifier = DecisionTreeClassifier(random_state=42)

# Hyperparameter tuning using RandomizedSearchCV
random_params = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10],
    'criterion': ['gini', 'entropy']
}

random_search = RandomizedSearchCV(estimator=rf_multiclass_classifier, param_distributions=random_params, n_iter=50, cv=3, n_jobs=-1, random_state=42)
random_search.fit(X_train_multi, y_train_multi)

# Best parameters from RandomizedSearch
print("Best parameters for multiclass classification:", random_search.best_params_)

# Make predictions on the training and test data for multiclass classification
y_train_pred_multi = random_search.predict(X_train_multi)
y_test_pred_multi = random_search.predict(X_test_multi)

# Evaluate the model on the training data for multiclass classification
print("\nMulticlass Classification Report (Malicious Type) - Training Data:")
print(classification_report(y_train_multi, y_train_pred_multi))
print("Multiclass Classification Accuracy (Training):", accuracy_score(y_train_multi, y_train_pred_multi))

# Evaluate the model on the test data for multiclass classification
print("\nMulticlass Classification Report (Malicious Type) - Test Data:")
print(classification_report(y_test_multi, y_test_pred_multi))
print("Multiclass Classification Accuracy (Test):", accuracy_score(y_test_multi, y_test_pred_multi))

# Summary of overall results
print("\nOverall Results Summary:")
print(f"Binary Classification - Training Accuracy: {accuracy_score(y_train_bin, y_train_pred_bin):.4f}")
print(f"Binary Classification - Test Accuracy: {accuracy_score(y_test_bin, y_test_pred_bin):.4f}")
print(f"Multiclass Classification - Training Accuracy: {accuracy_score(y_train_multi, y_train_pred_multi):.4f}")
print(f"Multiclass Classification - Test Accuracy: {accuracy_score(y_test_multi, y_test_pred_multi):.4f}")

# Show encoded url_type
print("\nEncoded URL Types:")
print(malicious_df[['url_type', 'url_type_encoded']].drop_duplicates().reset_index(drop=True))


Best parameters for binary classification: {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 10, 'min_samples_split': 2}
Binary Classification Report (Benign vs Malicious) - Training Data:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98    299672
           1       0.98      0.96      0.97    156161

    accuracy                           0.98    455833
   macro avg       0.98      0.97      0.98    455833
weighted avg       0.98      0.98      0.98    455833

Binary Classification Accuracy (Training): 0.9781323423271242

Binary Classification Report (Benign vs Malicious) - Test Data:
              precision    recall  f1-score   support

           0       0.97      0.98      0.98    128431
           1       0.96      0.94      0.95     66927

    accuracy                           0.97    195358
   macro avg       0.97      0.96      0.97    195358
weighted avg       0.97      0.97      0.97    195358

Binary Classific

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from scipy.stats import randint

# Load the dataset
df = pd.read_csv(r'C:\Users\arell\Documents\1_ALF\data\malicious_2021.csv', low_memory=False)

# Select features and target columns
features = ['Querylength', 'domain_token_count', 'path_token_count',
            'avgdomaintokenlen', 'longdomaintokenlen', 'avgpathtokenlen', 'tld',
            'charcompvowels', 'charcompace', 'ldl_url', 'ldl_domain', 'ldl_path',
            'ldl_filename', 'ldl_getArg', 'dld_url', 'dld_domain', 'dld_path',
            'dld_filename', 'dld_getArg', 'urlLen', 'domainlength', 'pathLength',
            'subDirLen', 'fileNameLen', 'this.fileExtLen', 'ArgLen', 'pathurlRatio',
            'ArgUrlRatio', 'argDomanRatio', 'domainUrlRatio', 'pathDomainRatio',
            'argPathRatio', 'executable', 'isPortEighty', 'NumberofDotsinURL',
            'ISIpAddressInDomainName', 'CharacterContinuityRate',
            'LongestVariableValue', 'URL_DigitCount', 'host_DigitCount',
            'Directory_DigitCount', 'File_name_DigitCount', 'Extension_DigitCount',
            'Query_DigitCount', 'URL_Letter_Count', 'host_letter_count',
            'Directory_LetterCount', 'Filename_LetterCount',
            'Extension_LetterCount', 'Query_LetterCount', 'LongestPathTokenLength',
            'Domain_LongestWordLength', 'Path_LongestWordLength',
            'sub-Directory_LongestWordLength', 'Arguments_LongestWordLength',
            'URL_sensitiveWord', 'URLQueries_variable', 'spcharUrl',
            'delimeter_Domain', 'delimeter_path', 'delimeter_Count',
            'NumberRate_URL', 'NumberRate_Domain', 'NumberRate_DirectoryName',
            'NumberRate_FileName', 'NumberRate_Extension', 'NumberRate_AfterPath',
            'SymbolCount_URL', 'SymbolCount_Domain', 'SymbolCount_Directoryname',
            'SymbolCount_FileName', 'SymbolCount_Extension',
            'SymbolCount_Afterpath', 'Entropy_URL', 'Entropy_Domain',
            'Entropy_DirectoryName', 'Entropy_Filename', 'Entropy_Extension',
            'Entropy_Afterpath', 'url_type']

# Clean the dataset by removing NaNs and infinities in numeric columns only
df_cleaned = df.copy()

# Convert 'tld' and 'url_type' to string since they're categorical
df_cleaned['tld'] = df_cleaned['tld'].astype(str)
df_cleaned['url_type'] = df_cleaned['url_type'].astype(str)

# Select only numeric features for checking infinite values
numeric_features = [f for f in features if f not in ['tld', 'url_type']]

# Apply np.isfinite only to numeric features and filter rows
df_cleaned = df_cleaned[np.isfinite(df_cleaned[numeric_features]).all(axis=1)]

# Label encoding for TLD and url_type (since they're categorical)
label_encoder_tld = LabelEncoder()
label_encoder_url_type = LabelEncoder()

df_cleaned['tld_encoded'] = label_encoder_tld.fit_transform(df_cleaned['tld'])
df_cleaned['url_type_encoded'] = label_encoder_url_type.fit_transform(df_cleaned['url_type'])

# Combine all features (numeric + 'tld_encoded') for the model
X = df_cleaned[numeric_features + ['tld_encoded']]

# Binary classification: Benign vs Malicious
df_cleaned['binary_label'] = df_cleaned['url_type'].apply(lambda x: 0 if x == 'benign' else 1)

# Train-Test Split for Binary Classification (Benign vs Malicious)
X_train_bin, X_test_bin, y_train_bin, y_test_bin = train_test_split(
    X, df_cleaned['binary_label'], test_size=0.3, random_state=42, stratify=df_cleaned['binary_label']
)

#SMOTE
smote = SMOTE(random_state=42)
X_train_bin, y_train_bin = smote.fit_resample(X_train_bin, y_train_bin)

# Initialize the Random Forest classifier for binary classification
dt_classifier = DecisionTreeClassifier(random_state=42)

# Hyperparameter tuning using GridSearchCV
grid_params = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10],
    'criterion': ['gini', 'entropy']
}

grid_search = GridSearchCV(estimator=dt_classifier, param_grid=grid_params, cv=3, n_jobs=-1)
grid_search.fit(X_train_bin, y_train_bin)

# Best parameters from GridSearch
print("Best parameters for binary classification:", grid_search.best_params_)

# Make predictions on the training and test data for binary classification
y_train_pred_bin = grid_search.predict(X_train_bin)
y_test_pred_bin = grid_search.predict(X_test_bin)

# Evaluate the model on the training data for binary classification
print("Binary Classification Report (Benign vs Malicious) - Training Data:")
print(classification_report(y_train_bin, y_train_pred_bin))
print("Binary Classification Accuracy (Training):", accuracy_score(y_train_bin, y_train_pred_bin))

# Evaluate the model on the test data for binary classification
print("\nBinary Classification Report (Benign vs Malicious) - Test Data:")
print(classification_report(y_test_bin, y_test_pred_bin))
print("Binary Classification Accuracy (Test):", accuracy_score(y_test_bin, y_test_pred_bin))

# Multiclass Classification for types of malicious URLs
# Only consider the non-benign entries
malicious_df = df_cleaned[df_cleaned['binary_label'] == 1].copy()

# Use the 'url_type_encoded' column for the target
X_multi = malicious_df[numeric_features + ['tld_encoded']]
y_multi = malicious_df['url_type_encoded']

# Train-Test Split for Multiclass Classification
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X_multi, y_multi, test_size=0.3, random_state=42, stratify=y_multi
)

# Adjust the multiclass labels so that they start from 0
y_train_multi = y_train_multi - 1
y_test_multi = y_test_multi - 1

# Initialize the Random Forest classifier for multiclass classification
rf_multiclass_classifier = DecisionTreeClassifier(random_state=42)

# Hyperparameter tuning using RandomizedSearchCV
random_params = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10],
    'criterion': ['gini', 'entropy']
}

random_search = RandomizedSearchCV(estimator=rf_multiclass_classifier, param_distributions=random_params, n_iter=50, cv=3, n_jobs=-1, random_state=42)
random_search.fit(X_train_multi, y_train_multi)

# Best parameters from RandomizedSearch
print("Best parameters for multiclass classification:", random_search.best_params_)

# Make predictions on the training and test data for multiclass classification
y_train_pred_multi = random_search.predict(X_train_multi)
y_test_pred_multi = random_search.predict(X_test_multi)

# Evaluate the model on the training data for multiclass classification
print("\nMulticlass Classification Report (Malicious Type) - Training Data:")
print(classification_report(y_train_multi, y_train_pred_multi))
print("Multiclass Classification Accuracy (Training):", accuracy_score(y_train_multi, y_train_pred_multi))

# Evaluate the model on the test data for multiclass classification
print("\nMulticlass Classification Report (Malicious Type) - Test Data:")
print(classification_report(y_test_multi, y_test_pred_multi))
print("Multiclass Classification Accuracy (Test):", accuracy_score(y_test_multi, y_test_pred_multi))

# Summary of overall results
print("\nOverall Results Summary:")
print(f"Binary Classification - Training Accuracy: {accuracy_score(y_train_bin, y_train_pred_bin):.4f}")
print(f"Binary Classification - Test Accuracy: {accuracy_score(y_test_bin, y_test_pred_bin):.4f}")
print(f"Multiclass Classification - Training Accuracy: {accuracy_score(y_train_multi, y_train_pred_multi):.4f}")
print(f"Multiclass Classification - Test Accuracy: {accuracy_score(y_test_multi, y_test_pred_multi):.4f}")

# Show encoded url_type
print("\nEncoded URL Types:")
print(malicious_df[['url_type', 'url_type_encoded']].drop_duplicates().reset_index(drop=True))


Best parameters for binary classification: {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 5, 'min_samples_split': 20}
Binary Classification Report (Benign vs Malicious) - Training Data:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98    299672
           1       0.99      0.98      0.98    299672

    accuracy                           0.98    599344
   macro avg       0.98      0.98      0.98    599344
weighted avg       0.98      0.98      0.98    599344

Binary Classification Accuracy (Training): 0.981765063135695

Binary Classification Report (Benign vs Malicious) - Test Data:
              precision    recall  f1-score   support

           0       0.98      0.97      0.97    128431
           1       0.95      0.95      0.95     66927

    accuracy                           0.97    195358
   macro avg       0.96      0.96      0.96    195358
weighted avg       0.97      0.97      0.97    195358

Binary Classifica