In [1]:
# Cleaned and corrected URL Classification Project (single file)

import pandas as pd
import numpy as np
import re
import pickle
from urllib.parse import urlparse
from tld import get_tld
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# ------------------ Feature Functions ------------------
def having_ip_address(url):
    match = re.search(r'(([01]?\d\d?|2[0-4]\d|25[0-5])\.){3}([01]?\d\d?|2[0-4]\d|25[0-5])', url)
    return 1 if match else 0

def abnormal_url(url):
    hostname = urlparse(url).hostname
    return 1 if hostname and hostname in url else 0

def count_dot(url): return url.count('.')
def count_www(url): return url.count('www')
def count_atrate(url): return url.count('@')
def no_of_dir(url): return urlparse(url).path.count('/')
def no_of_embed(url): return urlparse(url).path.count('//')

def shortening_service(url):
    match = re.search(r'bit\.ly|tinyurl\.com|goo\.gl|t\.co|ow\.ly|buff\.ly|adf\.ly', url)
    return 1 if match else 0

def count_https(url): return url.count('https')
def count_http(url): return url.count('http')

def fd_length(url):
    path = urlparse(url).path
    try:
        return len(path.split('/')[1])
    except:
        return 0

def tld_length(tld):
    try:
        return len(tld)
    except:
        return -1

# ------------------ Load and Encode Data ------------------
df = pd.read_csv("malicious_phish.csv")  # Ensure this file exists
X_raw = df['url']
y = df['type']

le = LabelEncoder()
y_encoded = le.fit_transform(y)

# ------------------ Feature Extraction ------------------
def extract_features(url):
    return [
        having_ip_address(url),
        abnormal_url(url),
        count_dot(url),
        count_www(url),
        count_atrate(url),
        no_of_dir(url),
        no_of_embed(url),
        shortening_service(url),
        count_https(url),
        count_http(url),
        fd_length(url),
        tld_length(get_tld(url, fail_silently=True))
    ]

X_features = X_raw.apply(extract_features).tolist()
X = np.array(X_features)

# ------------------ Train Model ------------------
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
model = XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='mlogloss')
model.fit(X_train, y_train)

# ------------------ Save Model ------------------
with open("xgb_c.pkl", "wb") as f:
    pickle.dump(model, f)
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)

# ------------------ Prediction Function ------------------
def predict_url_class(url):
    with open("xgb_c.pkl", "rb") as f:
        model = pickle.load(f)
    with open("label_encoder.pkl", "rb") as f:
        le = pickle.load(f)

    features = np.array(extract_features(url)).reshape(1, -1)
    pred = model.predict(features)[0]
    return le.inverse_transform([pred])[0]

# ------------------ Test Predictions ------------------
test_urls = [
    "https://www.google.com",
    "http://bit.ly/fakepaypal-login"
]

for url in test_urls:
    print(f"{url} --> {predict_url_class(url)}")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


https://www.google.com --> phishing
http://bit.ly/fakepaypal-login --> phishing
