In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import xgboost as xgb
import joblib
import numpy as np

In [2]:
# Load the updated dataset
url_dataset = pd.read_csv("../split_urls.csv")
url_dataset

Unnamed: 0,category,protocol,domain,path,query,fragment
0,malicious,,,br-icloud.com.br,,
1,benign,,,mp3raid.com/music/krizz_kaliko.html,,
2,benign,,,bopsecrets.org/rexroth/cr/1.htm,,
3,malicious,http,www.garage-pirenne.be,/index.php,option=com_content&view=article&id=70&vsig70_0=15,
4,malicious,http,adventure-nicaragua.net,/index.php,option=com_mailto&tmpl=component&link=aHR0cDov...,
...,...,...,...,...,...,...
651186,malicious,,,xbox360.ign.com/objects/850/850402.html,,
651187,malicious,,,games.teamxbox.com/xbox-360/1860/Dead-Space/,,
651188,malicious,,,www.gamespot.com/xbox360/action/deadspace/,,
651189,malicious,,,en.wikipedia.org/wiki/Dead_Space_(video_game),,


In [3]:
# Replace empty cells with whitespace
url_dataset = url_dataset.fillna("")

In [4]:
# Extract URLs and labels from the dataset
all_urls = url_dataset['protocol'] + "://" + url_dataset['domain'] + url_dataset['path'] + url_dataset['query'] + url_dataset['fragment']
labels = np.array((url_dataset['category'] == 'malicious').astype(int)).reshape(-1, 1)

In [5]:
# Use LabelEncoder to convert string labels to numerical format
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

  y = column_or_1d(y, warn=True)


In [6]:
# Create a TfidfVectorizer for feature extraction
vectorizer = TfidfVectorizer(binary=True)  # You can use TfidfVectorizer as well
X = vectorizer.fit_transform(all_urls)

In [7]:
# Split the dataset into a training set and a testing set
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

In [8]:
# Train an XGBoost classifier
xgb_classifier = xgb.XGBClassifier(random_state=42)
xgb_classifier.fit(X_train, y_train)

In [9]:
# Predict labels for the test set
y_pred = xgb_classifier.predict(X_test)

In [10]:
# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="weighted")
recall = recall_score(y_test, y_pred, average="weighted")
f1 = f1_score(y_test, y_pred, average="weighted")

In [11]:
# Display evaluation metrics
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

Accuracy: 0.9421601824338331
Precision: 0.9430782012286097
Recall: 0.9421601824338331
F1-Score: 0.9413768654336122


In [None]:
# Save the trained model and Vectorizer to files
model_filename = "Models/cv_xgb.pkl"
vectorizer_filename = "Vectorizers/vectorizer_cv_xgb.pkl"
joblib.dump(xgb_classifier, model_filename)
joblib.dump(vectorizer, vectorizer_filename)
label_encoder_filename = "Label Encoders/label_encoder_cv_xgb.pkl"
joblib.dump(label_encoder, label_encoder_filename)