<h3>Machine Learning Model</h3>

1. Binary Classification
   To check if benign (0) or malicious (1)
2. Multiclass Classification
   To Diferentiate different malicious URL

<h3>RF</h3>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
df = pd.read_csv(r'C:\Users\arell\Documents\1_ALF\data\malicious_2021.csv')

# Select features and target columns
features = ['path_token_count', 'avgpathtokenlen', 'SymbolCount_URL', 'SymbolCount_FileName', 'tld', 'urlLen']

# Clean the dataset by removing NaNs and infs
df_cleaned = df[np.isfinite(df[features]).all(1)].copy()  # Create an explicit copy

# Label encoding for TLD (since it's categorical)
df_cleaned['tld_encoded'] = LabelEncoder().fit_transform(df_cleaned['tld'])

# Combine all features
X = df_cleaned[features + ['tld_encoded']]

# Binary classification: Benign vs Malicious
df_cleaned['binary_label'] = df_cleaned['URL_Type_obf_Type'].apply(lambda x: 0 if x == 'benign' else 1)

# Step 1: Train-Test Split for Binary Classification (Benign vs Malicious)
X_train_bin, X_test_bin, y_train_bin, y_test_bin = train_test_split(X, df_cleaned['binary_label'], test_size=0.3, random_state=42)

# Binary Classifier: RandomForest
rf_binary_classifier = RandomForestClassifier(random_state=42)
rf_binary_classifier.fit(X_train_bin, y_train_bin)

# Binary Classification Evaluation (Training Data)
y_train_pred_bin = rf_binary_classifier.predict(X_train_bin)
print("Binary Classification Report (Benign vs Malicious) - Training Data:")
print(classification_report(y_train_bin, y_train_pred_bin))
print("Binary Classification Accuracy (Training):", accuracy_score(y_train_bin, y_train_pred_bin))

# Binary Classification Evaluation (Test Data)
y_test_pred_bin = rf_binary_classifier.predict(X_test_bin)
print("\nBinary Classification Report (Benign vs Malicious) - Test Data:")
print(classification_report(y_test_bin, y_test_pred_bin))
print("Binary Classification Accuracy (Test):", accuracy_score(y_test_bin, y_test_pred_bin))

# Step 2: Multiclass Classification (Only on Malicious URLs)
# Filter only malicious URLs for multiclass classification
malicious_df = df_cleaned[df_cleaned['binary_label'] == 1].copy()  # Create an explicit copy
X_malicious = X[df_cleaned['binary_label'] == 1]

# Label encoding for the type of malicious URL
le_malicious_type = LabelEncoder()
malicious_df['malicious_type_encoded'] = le_malicious_type.fit_transform(malicious_df['URL_Type_obf_Type'])

# Train-Test Split for Multiclass Classification
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(X_malicious, malicious_df['malicious_type_encoded'], test_size=0.3, random_state=42)

# Multiclass Classifier: RandomForest
rf_multiclass_classifier = RandomForestClassifier(random_state=42)
rf_multiclass_classifier.fit(X_train_multi, y_train_multi)

# Multiclass Classification Evaluation (Training Data)
y_train_pred_multi = rf_multiclass_classifier.predict(X_train_multi)
print("\nMulticlass Classification Report (Malicious Type) - Training Data:")
print(classification_report(y_train_multi, y_train_pred_multi))
print("Multiclass Classification Accuracy (Training):", accuracy_score(y_train_multi, y_train_pred_multi))

# Multiclass Classification Evaluation (Test Data)
y_test_pred_multi = rf_multiclass_classifier.predict(X_test_multi)
print("\nMulticlass Classification Report (Malicious Type) - Test Data:")
print(classification_report(y_test_multi, y_test_pred_multi))
print("Multiclass Classification Accuracy (Test):", accuracy_score(y_test_multi, y_test_pred_multi))

# Print overall summary
print("\nOverall Accuracy Summary:")
print(f"Binary Classification - Training Accuracy: {accuracy_score(y_train_bin, y_train_pred_bin):.4f}")
print(f"Binary Classification - Test Accuracy: {accuracy_score(y_test_bin, y_test_pred_bin):.4f}")
print(f"Multiclass Classification - Training Accuracy: {accuracy_score(y_train_multi, y_train_pred_multi):.4f}")
print(f"Multiclass Classification - Test Accuracy: {accuracy_score(y_test_multi, y_test_pred_multi):.4f}")

  df = pd.read_csv(r'C:\Users\arell\Documents\1_ALF\data\malicious_2021.csv')


TypeError: ufunc 'isfinite' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

<h3>KNN</h3>

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
df = pd.read_csv(r'C:\Users\arell\Documents\1_ALF\data\All.csv')

# Select features and target columns
features = ['path_token_count', 'avgpathtokenlen', 'SymbolCount_URL', 'SymbolCount_FileName', 'tld', 'urlLen']

# Clean the dataset by removing NaNs and infs
df_cleaned = df[np.isfinite(df[features]).all(1)].copy()  # Create an explicit copy

# Label encoding for TLD (since it's categorical)
df_cleaned['tld_encoded'] = LabelEncoder().fit_transform(df_cleaned['tld'])

# Combine all features
X = df_cleaned[features + ['tld_encoded']]

# Binary classification: Benign vs Malicious
df_cleaned['binary_label'] = df_cleaned['URL_Type_obf_Type'].apply(lambda x: 0 if x == 'benign' else 1)

# Step 1: Train-Test Split for Binary Classification (Benign vs Malicious)
X_train_bin, X_test_bin, y_train_bin, y_test_bin = train_test_split(X, df_cleaned['binary_label'], test_size=0.3, random_state=42)

# Binary Classifier: KNN
knn_binary_classifier = KNeighborsClassifier(n_neighbors=5)  # You can change n_neighbors based on your needs
knn_binary_classifier.fit(X_train_bin, y_train_bin)

# Binary Classification Evaluation (Training Data)
y_train_pred_bin = knn_binary_classifier.predict(X_train_bin)
print("Binary Classification Report (Benign vs Malicious) - Training Data:")
print(classification_report(y_train_bin, y_train_pred_bin))
print("Binary Classification Accuracy (Training):", accuracy_score(y_train_bin, y_train_pred_bin))

# Binary Classification Evaluation (Test Data)
y_test_pred_bin = knn_binary_classifier.predict(X_test_bin)
print("\nBinary Classification Report (Benign vs Malicious) - Test Data:")
print(classification_report(y_test_bin, y_test_pred_bin))
print("Binary Classification Accuracy (Test):", accuracy_score(y_test_bin, y_test_pred_bin))

# Step 2: Multiclass Classification (Only on Malicious URLs)
# Filter only malicious URLs for multiclass classification
malicious_df = df_cleaned[df_cleaned['binary_label'] == 1].copy()  # Create an explicit copy
X_malicious = X[df_cleaned['binary_label'] == 1]

# Label encoding for the type of malicious URL
le_malicious_type = LabelEncoder()
malicious_df['malicious_type_encoded'] = le_malicious_type.fit_transform(malicious_df['URL_Type_obf_Type'])

# Train-Test Split for Multiclass Classification
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(X_malicious, malicious_df['malicious_type_encoded'], test_size=0.3, random_state=42)

# Multiclass Classifier: KNN
knn_multiclass_classifier = KNeighborsClassifier(n_neighbors=5)  # You can change n_neighbors based on your needs
knn_multiclass_classifier.fit(X_train_multi, y_train_multi)

# Multiclass Classification Evaluation (Training Data)
y_train_pred_multi = knn_multiclass_classifier.predict(X_train_multi)
print("\nMulticlass Classification Report (Malicious Type) - Training Data:")
print(classification_report(y_train_multi, y_train_pred_multi))
print("Multiclass Classification Accuracy (Training):", accuracy_score(y_train_multi, y_train_pred_multi))

# Multiclass Classification Evaluation (Test Data)
y_test_pred_multi = knn_multiclass_classifier.predict(X_test_multi)
print("\nMulticlass Classification Report (Malicious Type) - Test Data:")
print(classification_report(y_test_multi, y_test_pred_multi))
print("Multiclass Classification Accuracy (Test):", accuracy_score(y_test_multi, y_test_pred_multi))

# Print overall summary
print("\nOverall Accuracy Summary:")
print(f"Binary Classification - Training Accuracy: {accuracy_score(y_train_bin, y_train_pred_bin):.4f}")
print(f"Binary Classification - Test Accuracy: {accuracy_score(y_test_bin, y_test_pred_bin):.4f}")
print(f"Multiclass Classification - Training Accuracy: {accuracy_score(y_train_multi, y_train_pred_multi):.4f}")
print(f"Multiclass Classification - Test Accuracy: {accuracy_score(y_test_multi, y_test_pred_multi):.4f}")


Binary Classification Report (Benign vs Malicious) - Training Data:
              precision    recall  f1-score   support

           0       0.94      0.90      0.92      5448
           1       0.97      0.98      0.98     20050

    accuracy                           0.97     25498
   macro avg       0.96      0.94      0.95     25498
weighted avg       0.97      0.97      0.97     25498

Binary Classification Accuracy (Training): 0.9662326456977017

Binary Classification Report (Benign vs Malicious) - Test Data:
              precision    recall  f1-score   support

           0       0.90      0.86      0.88      2326
           1       0.96      0.97      0.97      8603

    accuracy                           0.95     10929
   macro avg       0.93      0.92      0.92     10929
weighted avg       0.95      0.95      0.95     10929

Binary Classification Accuracy (Test): 0.9492176777381279

Multiclass Classification Report (Malicious Type) - Training Data:
              precision  

<h3>XGBoost</h3>

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
df = pd.read_csv(r'C:\Users\arell\Documents\1_ALF\data\malicious_2021.csv')

# Select features and target columns
features = ['path_token_count', 'avgpathtokenlen', 'SymbolCount_URL', 'SymbolCount_FileName', 'tld', 'urlLen']

# Clean the dataset by removing NaNs and infs
df_cleaned = df[np.isfinite(df[features]).all(1)].copy()  # Create an explicit copy

# Label encoding for TLD (since it's categorical)
df_cleaned['tld_encoded'] = LabelEncoder().fit_transform(df_cleaned['tld'])

# Combine all features
X = df_cleaned[features + ['tld_encoded']]

# Binary classification: Benign vs Malicious
df_cleaned['binary_label'] = df_cleaned['URL_Type_obf_Type'].apply(lambda x: 0 if x == 'benign' else 1)

# Step 1: Train-Test Split for Binary Classification (Benign vs Malicious)
X_train_bin, X_test_bin, y_train_bin, y_test_bin = train_test_split(X, df_cleaned['binary_label'], test_size=0.3, random_state=42)

# Binary Classifier: XGBoost
xgb_binary_classifier = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_binary_classifier.fit(X_train_bin, y_train_bin)

# Binary Classification Evaluation (Training Data)
y_train_pred_bin = xgb_binary_classifier.predict(X_train_bin)
print("Binary Classification Report (Benign vs Malicious) - Training Data:")
print(classification_report(y_train_bin, y_train_pred_bin))
print("Binary Classification Accuracy (Training):", accuracy_score(y_train_bin, y_train_pred_bin))

# Binary Classification Evaluation (Test Data)
y_test_pred_bin = xgb_binary_classifier.predict(X_test_bin)
print("\nBinary Classification Report (Benign vs Malicious) - Test Data:")
print(classification_report(y_test_bin, y_test_pred_bin))
print("Binary Classification Accuracy (Test):", accuracy_score(y_test_bin, y_test_pred_bin))

# Step 2: Multiclass Classification (Only on Malicious URLs)
# Filter only malicious URLs for multiclass classification
malicious_df = df_cleaned[df_cleaned['binary_label'] == 1].copy()  # Create an explicit copy
X_malicious = X[df_cleaned['binary_label'] == 1]

# Label encoding for the type of malicious URL
le_malicious_type = LabelEncoder()
malicious_df['malicious_type_encoded'] = le_malicious_type.fit_transform(malicious_df['URL_Type_obf_Type'])

# Train-Test Split for Multiclass Classification
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(X_malicious, malicious_df['malicious_type_encoded'], test_size=0.3, random_state=42)

# Multiclass Classifier: XGBoost
xgb_multiclass_classifier = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb_multiclass_classifier.fit(X_train_multi, y_train_multi)

# Multiclass Classification Evaluation (Training Data)
y_train_pred_multi = xgb_multiclass_classifier.predict(X_train_multi)
print("\nMulticlass Classification Report (Malicious Type) - Training Data:")
print(classification_report(y_train_multi, y_train_pred_multi))
print("Multiclass Classification Accuracy (Training):", accuracy_score(y_train_multi, y_train_pred_multi))

# Multiclass Classification Evaluation (Test Data)
y_test_pred_multi = xgb_multiclass_classifier.predict(X_test_multi)
print("\nMulticlass Classification Report (Malicious Type) - Test Data:")
print(classification_report(y_test_multi, y_test_pred_multi))
print("Multiclass Classification Accuracy (Test):", accuracy_score(y_test_multi, y_test_pred_multi))

# Print overall summary
print("\nOverall Accuracy Summary:")
print(f"Binary Classification - Training Accuracy: {accuracy_score(y_train_bin, y_train_pred_bin):.4f}")
print(f"Binary Classification - Test Accuracy: {accuracy_score(y_test_bin, y_test_pred_bin):.4f}")
print(f"Multiclass Classification - Training Accuracy: {accuracy_score(y_train_multi, y_train_pred_multi):.4f}")
print(f"Multiclass Classification - Test Accuracy: {accuracy_score(y_test_multi, y_test_pred_multi):.4f}")


  df = pd.read_csv(r'C:\Users\arell\Documents\1_ALF\data\malicious_2021.csv')


TypeError: ufunc 'isfinite' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

<h3>Testing</h3>

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pickle

# Load your original dataset
df = pd.read_csv(r'C:\Users\arell\Documents\1_ALF\data\All.csv')

# Select features and target columns
features = ['path_token_count', 'avgpathtokenlen', 'SymbolCount_URL', 'SymbolCount_FileName', 'tld', 'urlLen']

# Clean the dataset by removing NaNs and infs
df_cleaned = df[np.isfinite(df[features]).all(1)].copy()

# Create and fit the TLD encoder
tld_encoder = LabelEncoder()
df_cleaned['tld_encoded'] = tld_encoder.fit_transform(df_cleaned['tld'])

# Saving the TLD encoder
with open('tld_encoder.pkl', 'wb') as f:
    pickle.dump(tld_encoder, f)

print("TLD encoder has been created and saved.")

# Create and fit the Malicious Type encoder
# Assuming 'URL_Type_obf_Type' is the column name for malicious types
malicious_type_encoder = LabelEncoder()
df_cleaned['malicious_type_encoded'] = malicious_type_encoder.fit_transform(df_cleaned['URL_Type_obf_Type'])

# Saving the Malicious Type encoder
with open('malicious_type_encoder.pkl', 'wb') as f:
    pickle.dump(malicious_type_encoder, f)

print("Malicious Type encoder has been created and saved.")

# To load the encoders later, you can use:
# with open('tld_encoder.pkl', 'rb') as f:
#     tld_encoder = pickle.load(f)
# with open('malicious_type_encoder.pkl', 'rb') as f:
#     malicious_type_encoder = pickle.load(f)

# Example of how to use the encoders in your prediction function
def predict_url(url, binary_model, multiclass_model, tld_encoder, malicious_type_encoder):
    # Transform URL to features
    features_df = transform_url_to_features(url, tld_encoder)
    
    # Binary classification
    binary_prediction = binary_model.predict(features_df)[0]
    
    if binary_prediction == 0:
        return 'Benign'
    else:
        # Multiclass classification
        malicious_type_encoded = multiclass_model.predict(features_df)[0]
        malicious_type = malicious_type_encoder.inverse_transform([malicious_type_encoded])[0]
        return f'Malicious: {malicious_type}'

# Make sure you have this function defined
def transform_url_to_features(url, tld_encoder):
    # ... (implementation as before)
    pass

# The rest of your prediction script remains the same

TLD encoder has been created and saved.
Malicious Type encoder has been created and saved.


In [5]:
import re
import pandas as pd
import numpy as np
from urllib.parse import urlparse
from sklearn.preprocessing import LabelEncoder
import pickle

# Assuming your trained models are already loaded
# rf_binary_classifier = ...
# rf_multiclass_classifier = ...
# We also need the LabelEncoder used for 'tld' in the training process
# tld_encoder = ...

# Loading the encoders
with open('tld_encoder.pkl', 'rb') as f:
    tld_encoder = pickle.load(f)

with open('malicious_type_encoder.pkl', 'rb') as f:
    malicious_type_encoder = pickle.load(f)

# Feature extraction functions
def get_path_token_count(url):
    parsed_url = urlparse(url)
    path_tokens = parsed_url.path.split('/')
    return len([token for token in path_tokens if token])

def get_avg_path_token_len(url):
    parsed_url = urlparse(url)
    path_tokens = parsed_url.path.split('/')
    non_empty_tokens = [token for token in path_tokens if token]
    return np.mean([len(token) for token in non_empty_tokens]) if non_empty_tokens else 0

def get_symbol_count_url(url):
    return len(re.findall(r'[!@#$%^&*()_+={}\[\];:\'",.<>?/\\|`~]', url))

def get_symbol_count_filename(url):
    parsed_url = urlparse(url)
    filename = parsed_url.path.split('/')[-1]
    return len(re.findall(r'[!@#$%^&*()_+={}\[\];:\'",.<>?/\\|`~]', filename))

def get_url_length(url):
    return len(url)

def get_tld(url):
    parsed_url = urlparse(url)
    domain = parsed_url.netloc.split('.')
    return domain[-1] if len(domain) > 1 else ''

# Transform URL to feature vector
def transform_url_to_features(url, tld_encoder):
    features = {
        'path_token_count': get_path_token_count(url),
        'avgpathtokenlen': get_avg_path_token_len(url),
        'SymbolCount_URL': get_symbol_count_url(url),
        'SymbolCount_FileName': get_symbol_count_filename(url),
        'urlLen': get_url_length(url),
        'tld': get_tld(url)
    }
    df = pd.DataFrame([features])
    
    # Encode the 'tld' using the same encoder as in training
    df['tld_encoded'] = tld_encoder.transform([features['tld']])[0]
    
    # Ensure the order of features matches the training data
    return df[['path_token_count', 'avgpathtokenlen', 'SymbolCount_URL', 'SymbolCount_FileName', 'tld', 'urlLen', 'tld_encoded']]

# Predict function for a single URL
def predict_url(url, binary_model, multiclass_model, tld_encoder, malicious_type_encoder):
    # Step 1: Extract features
    features_df = transform_url_to_features(url, tld_encoder)
    
    # Step 2: Binary classification (benign or malicious)
    binary_prediction = binary_model.predict(features_df)[0]
    
    if binary_prediction == 0:
        return 'Benign'
    else:
        # Step 3: Multiclass classification (malicious type)
        malicious_type_encoded = multiclass_model.predict(features_df)[0]
        malicious_type = malicious_type_encoder.inverse_transform([malicious_type_encoded])[0]
        return f'Malicious: {malicious_type}'

# Example usage:
url = "http://hollywoodlife.com/2014/05/01/rihanna-iheartradio-music-awards-dress-2014-pics/"

# Assuming models and encoders are loaded
result = predict_url(url, rf_binary_classifier, rf_multiclass_classifier, tld_encoder, malicious_type_encoder)
print(result)

# To use this in your script, you need to:
# 1. Load the trained models: rf_binary_classifier and rf_multiclass_classifier
# 2. Load or recreate the LabelEncoder for 'tld' used during training
# 3. Load or recreate the LabelEncoder for malicious types used during training
# Then you can use the predict_url function as shown in the example usage

ValueError: invalid literal for int() with base 10: np.str_('com')