In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import joblib
import numpy as np

In [2]:
# Read the CSV of pre-split URL components into a DataFrame and preview it
DATASET_PATH = "../split_urls.csv"
url_dataset = pd.read_csv(DATASET_PATH)
url_dataset

In [3]:
# Fill every missing (NaN) cell with an empty string (not whitespace) so the
# component columns can later be concatenated as text
url_dataset = url_dataset.fillna(value="")

Total number of rows in the CSV: 651191


In [4]:
# Rebuild each full URL string from its component columns.
# NOTE(review): path, query and fragment are joined with no separator, so this
# presumably relies on those columns keeping their leading "/", "?" and "#" —
# confirm against whatever produced split_urls.csv.
all_urls = (
    url_dataset['protocol'] + "://" + url_dataset['domain']
    + url_dataset['path'] + url_dataset['query'] + url_dataset['fragment']
)

# Binary target: 1 where category == 'malicious', 0 for every other category.
# Kept 1-D (shape (n_samples,)): scikit-learn estimators expect a 1-D y, and a
# column vector from reshape(-1, 1) makes fit() emit a DataConversionWarning.
labels = (url_dataset['category'] == 'malicious').astype(int).to_numpy()

In [5]:
# Turn each URL string into a sparse TF-IDF feature vector.
# binary=True only clips the term-frequency part to 0/1; IDF weighting and
# normalisation still apply, so the result is not a pure one-hot / 0-1 matrix.
# NOTE(review): the vectorizer is fitted on every URL, before the train/test
# split, so its vocabulary and IDF statistics also reflect the test rows —
# consider fitting on the training portion only.
vectorizer = TfidfVectorizer(binary=True)
X = vectorizer.fit_transform(raw_documents=all_urls)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train a support-vector classifier (SVC with its default settings).
# NOTE(review): scikit-learn documents SVC fit time as scaling at least
# quadratically with the number of samples; if training on this dataset does
# not finish in reasonable time, consider LinearSVC or SGDClassifier.
clf = SVC(random_state=42)
# Flatten the target so a column-vector y does not trigger a
# DataConversionWarning; this is a no-op when y_train is already 1-D.
clf.fit(X_train, np.ravel(y_train))

In [None]:
# Predict labels for the held-out test set with the fitted classifier
y_pred = clf.predict(X_test)

In [None]:
# Score the test-set predictions: plain accuracy, then support-weighted
# precision / recall / F1 averaged over the classes.
# NOTE(review): per the scikit-learn docs, weighted recall is equal to
# accuracy, so the "Recall" figure repeats the accuracy value; use
# average="binary" if positive-class (label 1) scores are what is wanted.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, average="weighted")
    for metric in (precision_score, recall_score, f1_score)
)

In [None]:
# Report the four scores, one per line
print(
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-Score: {f1}",
    sep="\n",
)