In [1]:
import quadprog
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import time

# The helper function. Do not change it
def quadprog_solve_qp(P, q, G=None, h=None, A=None, b=None):
    """Solve a quadratic program by delegating to ``quadprog.solve_qp``.

    Re-packs the problem data ``(P, q, G, h, A, b)`` into the five positional
    arguments handed to ``quadprog.solve_qp``. The sign flips below are
    consistent with translating
    ``min 1/2 x^T P x + q^T x  s.t.  G x <= h,  A x = b`` into quadprog's own
    convention -- NOTE(review): confirm against the quadprog documentation.

    Parameters
    ----------
    P : square array; symmetrised as ``0.5 * (P + P.T)`` before use.
    q : array; passed to the solver negated.
    G, h : inequality-constraint matrix and right-hand side.
        They default to ``None`` but are used unconditionally in both
        branches, so callers need to supply them.
    A, b : optional equality-constraint matrix and right-hand side.
        When ``A`` is given its rows are stacked above ``G`` and
        ``A.shape[0]`` is passed as the number of equality constraints.

    Returns
    -------
    The first element of the value returned by ``quadprog.solve_qp``
    (presumably the minimiser ``x`` -- verify against quadprog docs).
    """
    qp_G = .5 * (P + P.T)  # make sure P is symmetric
    qp_a = -q
    if A is not None:
        # Equality rows go first; meq tells the solver how many leading
        # constraints are equalities.
        qp_C = -np.vstack([A, G]).T
        qp_b = -np.hstack([b, h])
        meq = A.shape[0]
    else:  # no equality constraint
        qp_C = -G.T
        qp_b = -h
        meq = 0
    return quadprog.solve_qp(qp_G, qp_a, qp_C, qp_b, meq)[0]


In [2]:

# Seed NumPy's global RNG so the random training sub-sample drawn later in
# the notebook is reproducible on a fresh Restart & Run All.
np.random.seed(42)

# Load the pre-processed URL dataset.
# low_memory=False makes pandas infer every column's dtype from the whole
# file rather than chunk by chunk. NOTE(review): the `read_csv` line echoed
# in this cell's saved output looks like a mixed-type DtypeWarning; this
# option is the documented way to avoid it -- confirm on the next run.
data = pd.read_csv('processed_urls.csv', low_memory=False)

print("Original data shape:", data.shape)
print("Original columns:", data.columns.tolist())

# Name of the column holding the class label
label_column = 'type'  # Replace with your actual label column name

# Labels as a plain NumPy array
Y = data[label_column].values

# Candidate features: every numeric column except the label itself
numeric_columns = data.select_dtypes(include=[np.number]).columns.tolist()
if label_column in numeric_columns:
    numeric_columns.remove(label_column)

print(f"\nNumeric feature columns found: {numeric_columns}")
print(f"Number of numeric features: {len(numeric_columns)}")


  data = pd.read_csv('processed_urls.csv')


Original data shape: (504933, 35)
Original columns: ['url', 'type', 'scheme', 'subdomain', 'registrable_domain', 'suffix', 'path', 'query', 'fragment', 'port', 'username', 'password', 'host', 'is_http', 'is_https', 'len_total', 'len_host', 'len_path', 'len_query', 'len_fragment', 'count_dots', 'count_slashes', 'count_digits', 'count_hyphen', 'count_underscore', 'count_percent', 'count_at', 'count_question', 'count_equal', 'count_ampersand', 'count_special', 'entropy_url', 'keyword_flag', 'is_shortened', 'is_ip_host']

Numeric feature columns found: ['port', 'is_http', 'is_https', 'len_total', 'len_host', 'len_path', 'len_query', 'len_fragment', 'count_dots', 'count_slashes', 'count_digits', 'count_hyphen', 'count_underscore', 'count_percent', 'count_at', 'count_question', 'count_equal', 'count_ampersand', 'count_special', 'entropy_url', 'keyword_flag', 'is_shortened', 'is_ip_host']
Number of numeric features: 23


In [None]:
# Build the feature matrix from the numeric columns selected above
X = data[numeric_columns].values

# Replace any missing entries with zero (only reported when some exist)
missing_mask = np.isnan(X)
if missing_mask.any():
    print("\nFilling missing values with zeros...")
    X[missing_mask] = 0.0

print(f"\nFeature matrix shape: {X.shape}")



Feature matrix shape: (504933, 23)

Feature matrix shape: (504933, 23)


In [None]:
# Check that the labels describe a binary classification problem
unique_labels = np.unique(Y)
print(f"Original unique labels: {unique_labels}")

if len(unique_labels) != 2:
    raise ValueError(f"Expected 2 classes, but found {len(unique_labels)}: {unique_labels}")

# The SVM dual expects labels in {-1, +1}: the first unique label becomes -1
# and the second becomes +1. `label_mapping` is kept for reference.
label_mapping = {unique_labels[0]: -1, unique_labels[1]: 1}

# Vectorised mapping: one pass in NumPy instead of a Python-level loop over
# every row of the dataset. After the two-class check above, every entry is
# either unique_labels[0] or unique_labels[1].
Y = np.where(Y == unique_labels[1], 1, -1)

print(f"Mapped labels: {unique_labels[0]} -> -1, {unique_labels[1]} -> 1")
print(f"Class distribution: Class -1: {np.sum(Y == -1)}, Class 1: {np.sum(Y == 1)}")


Original unique labels: [0 1]
Mapped labels: 0 -> -1, 1 -> 1
Class distribution: Class -1: 345738, Class 1: 159195


In [5]:
# Stratified 70/30 split into training and test partitions
split_parts = train_test_split(
    X, Y, train_size=0.7, test_size=0.3, random_state=42, stratify=Y
)
X_train, X_test, Y_train, Y_test = split_parts

# Cap the training partition at 5000 rows, drawn without replacement
# from NumPy's global RNG
n_train_available = len(X_train)
if n_train_available > 5000:
    indices = np.random.choice(n_train_available, 5000, replace=False)
    X_train, Y_train = X_train[indices], Y_train[indices]

print(f"\nTraining set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")


Training set size: 5000
Test set size: 151480


In [None]:
# Number of training samples (one dual variable per sample)
n_samples = X_train.shape[0]

# Build the Q matrix of the SVM dual:
#     Q[i, j] = y_i * y_j * <x_i, x_j>
# Scaling each row x_i by its label y_i turns this into a single matrix
# product, (y * X) @ (y * X).T, replacing n_samples**2 Python-level np.dot
# calls. The timer now brackets the actual computation; previously it only
# measured the np.zeros allocation.
print("\nBuilding Q matrix...")
start_time = time.time()
signed_X = Y_train.reshape(-1, 1) * np.asarray(X_train, dtype=float)
Q = signed_X @ signed_X.T
end_time = time.time()
q_matrix_time = end_time - start_time

print('Q shape = ', Q.shape)

# Remaining QP parameters, in the (P, q, G, h, A, b) layout taken by
# quadprog_solve_qp:
#   P : Q plus a small diagonal term (1e-5) so the matrix is strictly
#       positive definite
#   q : -1 for every dual variable
#   G, h : -alpha_i <= 0, i.e. alpha_i >= 0
#   A, b : sum_i y_i * alpha_i = 0
P = Q + np.eye(n_samples) * 1e-5
q = -np.ones(n_samples)
G = -np.eye(n_samples)
h = np.zeros(n_samples)
A = Y_train.reshape((1, n_samples))
b = np.zeros(1)



Building Q matrix...
Q shape =  (5000, 5000)
Q shape =  (5000, 5000)


In [None]:
print('Training SVM...')
start_time = time.time()
# The equality right-hand side (sum_i y_i * alpha_i = 0) is passed explicitly
# rather than through the notebook-level name `b`: `b` is rebound to the SVM
# bias at the end of this cell, so re-running the cell would otherwise hand
# the bias to the solver as the constraint value.
solution = quadprog_solve_qp(P, q, G, h, A, np.zeros(1))
end_time = time.time()
training_time = end_time - start_time

print('solution shape = ', solution.shape)

# Support vectors are the samples whose multiplier exceeds a small
# numerical tolerance.
support_vector_indices = solution > 1e-5
print('Number of support vectors: ', np.sum(support_vector_indices))
if not support_vector_indices.any():
    raise RuntimeError(
        'QP solution contains no support vectors (all multipliers <= 1e-5)'
    )

support_vectors = X_train[support_vector_indices]
support_vector_labels = Y_train[support_vector_indices]
alphas = solution[support_vector_indices]

print(f'Support vectors shape: {support_vectors.shape}')

# Weight vector of the linear SVM: w = sum_i alpha_i * y_i * x_i
w = (alphas * support_vector_labels) @ support_vectors
print('w shape = ', w.shape)

# Bias term, averaged over all support vectors instead of read from the
# first one only. With the diagonal term eps added to P, each support vector
# satisfies y_i - <w, x_i> = bias + eps * y_i * alpha_i; because the
# multipliers obey sum_i y_i * alpha_i = 0, those offsets cancel in the mean,
# whereas a single sample carries its own offset.
b = np.mean(support_vector_labels - support_vectors @ w)
print('b = ', b)

Training SVM...
solution shape =  (5000,)
Number of support vectors:  2161
Support vectors shape: (2161, 23)
w shape =  (23,)
b =  -14.907554995707327
solution shape =  (5000,)
Number of support vectors:  2161
Support vectors shape: (2161, 23)
w shape =  (23,)
b =  -14.907554995707327


In [None]:
#prediction function
def predict(X_data, w, b):
    """Predict class labels (-1 or +1) for the rows of ``X_data``.

    Parameters
    ----------
    X_data : array of samples, one per row (a single 1-D sample also works).
    w : weight vector of the linear decision function.
    b : scalar bias of the linear decision function.

    Returns
    -------
    Float array (or scalar for a 1-D sample) holding ``1.0`` where
    ``X_data @ w + b >= 0`` and ``-1.0`` otherwise.

    Samples lying exactly on the decision boundary are assigned to the
    positive class. ``np.sign`` would return 0 for them, which is not one of
    the two class labels.
    """
    decision = np.dot(X_data, w) + b
    return np.where(decision >= 0, 1.0, -1.0)

# Label every held-out test sample with the trained weights and bias
Y_test_pred = predict(X_data=X_test, w=w, b=b)

In [None]:
# Evaluate the test-set predictions and report timings
from sklearn.metrics import precision_score, recall_score, f1_score

# Precision, recall and F1 are averaged across classes, weighted by support
weighted_avg = {'average': 'weighted'}

test_accuracy = accuracy_score(Y_test, Y_test_pred)
test_precision = precision_score(Y_test, Y_test_pred, **weighted_avg)
test_recall = recall_score(Y_test, Y_test_pred, **weighted_avg)
test_f1 = f1_score(Y_test, Y_test_pred, **weighted_avg)

banner = '=' * 50
print('\n' + banner)
print('TEST RESULTS')
print(banner)
print(f'Accuracy:  {test_accuracy:.4f}')
print(f'Precision: {test_precision:.4f}')
print(f'Recall:    {test_recall:.4f}')
print(f'F1 Score:  {test_f1:.4f}')
print(f'Q Matrix found in {q_matrix_time:.4f} seconds ({q_matrix_time/60:.2f} minutes)')
print(f'Training completed in {training_time:.4f} seconds ({training_time/60:.2f} minutes)')


TEST RESULTS
Accuracy:  0.9032
Precision: 0.9141
Recall:    0.9032
F1 Score:  0.8979
Training completed in 628.4081 seconds (10.47 minutes)
