In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import tldextract
from urllib.parse import urlparse
import re

In [2]:
# Feature extraction function
def extract_url_features(url):
    features = {}
    
    # Parse URL components
    parsed = urlparse(url)
    extracted = tldextract.extract(url)
    
    # Basic length features
    features['url_length'] = len(url)
    features['domain_length'] = len(extracted.domain)
    features['path_length'] = len(parsed.path)
    
    # Domain-specific features
    features['subdomain_length'] = len(extracted.subdomain)
    features['tld_length'] = len(extracted.suffix) if extracted.suffix else 0
    
    # Character distribution
    features['num_digits'] = sum(c.isdigit() for c in url)
    features['num_letters'] = sum(c.isalpha() for c in url)
    features['num_special'] = len(url) - features['num_digits'] - features['num_letters']
    
    # Special character counts
    features['count_dots'] = url.count('.')
    features['count_hyphens'] = url.count('-')
    features['count_underscores'] = url.count('_')
    features['count_slashes'] = url.count('/')
    features['count_equals'] = url.count('=')
    features['count_at'] = url.count('@')
    features['count_and'] = url.count('&')
    features['count_question'] = url.count('?')
    features['count_percent'] = url.count('%')
    features['count_plus'] = url.count('+')
    features['count_asterisk'] = url.count('*')
    features['count_exclamation'] = url.count('!')
    
    # Binary features
    features['has_ip_address'] = 1 if re.search(r'\d+\.\d+\.\d+\.\d+', url) else 0
    features['has_http'] = 1 if 'http://' in url else 0
    features['has_https'] = 1 if 'https://' in url else 0
    features['has_port'] = 1 if re.search(r':\d+', url) else 0
    # features['has_suspicious_words'] = 1 if re.search(r'(login|bank|account|secure|update|confirm)', url.lower()) else 0
    
    # Ratios and derived features
    # features['digits_to_letters_ratio'] = features['num_digits'] / features['num_letters'] if features['num_letters'] > 0 else 0
    # features['special_to_total_ratio'] = features['num_special'] / len(url) if len(url) > 0 else 0
    
    return features

In [3]:
df = pd.read_csv('Dataset/new_data_urls.csv')
# Display top 10 rows for legitimate websites (status == 1)
print("Top 10 Legitimate URLs:")
print(df[df['status'] == 1].head(10))

# Display top 10 rows for phishing websites (status == 0)
print("\nTop 10 Phishing URLs:")
print(df[df['status'] == 0].head(10))

Top 10 Legitimate URLs:
                                                      url  status
178574              http://www.crestonwood.com/router.php       1
178577                                 http://rgipt.ac.in       1
178578  http://www.iracing.com/tracks/gateway-motorspo...       1
178580                                http://www.mutuo.it       1
178582         http://vamoaestudiarmedicina.blogspot.com/       1
178583  https://parade.com/425836/joshwigler/the-amazi...       1
178584  https://www.astrologyonline.eu/Astro_MemoNew/P...       1
178585        https://www.lifewire.com/tcp-port-21-818146       1
178586  https://technofizi.net/top-best-mp3-downloader...       1
178588                          https://www.missfiga.com/       1

Top 10 Phishing URLs:
                                       url  status
0  0000111servicehelpdesk.godaddysites.com       0
1     000011accesswebform.godaddysites.com       0
2                             00003.online       0
3      0009servicedesko

In [4]:
# 2. Extract features
feature_list = []
for url in df['url']:
    feature_list.append(extract_url_features(url))
    
features_df = pd.DataFrame(feature_list)

# 3. Prepare for training
X = features_df
y = df['status']  # Your 0/1 labels

# 4. Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 5. Scale features (optional but often helps)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [5]:
# 6. Train XGBoost model
xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=7,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    random_state=42
)

xgb_model.fit(X_train_scaled, y_train)

In [6]:
# 7. Evaluate the model
y_pred = xgb_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.91      0.93     78996
           1       0.92      0.96      0.94     85406

    accuracy                           0.93    164402
   macro avg       0.93      0.93      0.93    164402
weighted avg       0.93      0.93      0.93    164402

[[71654  7342]
 [ 3819 81587]]


In [7]:
# 8. Analyze feature importance
importance = xgb_model.feature_importances_
feature_names = X.columns
for i, name in enumerate(feature_names):
    print(f"{name}: {importance[i]}")

url_length: 0.010885360650718212
domain_length: 0.015982823446393013
path_length: 0.0694599598646164
subdomain_length: 0.0262621957808733
tld_length: 0.05527009442448616
num_digits: 0.03192368522286415
num_letters: 0.007082695607095957
num_special: 0.015294201672077179
count_dots: 0.04220788925886154
count_hyphens: 0.029430454596877098
count_underscores: 0.01815984770655632
count_slashes: 0.14481313526630402
count_equals: 0.013218333013355732
count_at: 0.017123932018876076
count_and: 0.004110482055693865
count_question: 0.018638961017131805
count_percent: 0.007857674732804298
count_plus: 0.030784843489527702
count_asterisk: 0.0035704609472304583
count_exclamation: 0.0017810596618801355
has_ip_address: 0.006044727750122547
has_http: 0.14285334944725037
has_https: 0.2792623043060303
has_port: 0.007981590926647186


In [8]:
import pickle
# After training your model
pickle.dump(xgb_model, open('xgboost_url_classifier.pkl', 'wb'))
# If you used a scaler
pickle.dump(scaler, open('url_scaler.pkl', 'wb'))