In [3]:
import joblib
import pandas as pd
import re
import tldextract
from urllib.parse import urlparse
import ipaddress

In [4]:
# Restore the persisted inference artifacts: the classifier, the fitted
# feature selector, and the ordered list of training feature names.
# NOTE: joblib.load is pickle-based -- only load artifacts from a trusted source.
MODEL_PATH = "phishing_model.pkl"
SELECTOR_PATH = "feature_selector.pkl"
FEATURE_COLUMNS_PATH = "feature_columns.pkl"

clf = joblib.load(MODEL_PATH)
selector = joblib.load(SELECTOR_PATH)
feature_columns = joblib.load(FEATURE_COLUMNS_PATH)  # column names used to align inference features

In [5]:
def has_ip(url):
    """Return 1 if the URL's host is a literal IPv4/IPv6 address, else 0.

    The host is taken from ``urlparse(url).hostname`` rather than ``netloc``
    so that credentials (``user:pw@``), an explicit port (``:8080``) and the
    square brackets around IPv6 literals do not hide an IP address.

    Parameters
    ----------
    url : str
        URL to inspect. A URL without a scheme has no parsed host and yields 0.

    Returns
    -------
    int
        1 when the host parses as an IP address, otherwise 0.

    Notes
    -----
    NOTE(review): if the model was trained with a netloc-based variant of
    this feature, re-extract the training features with this version to
    keep training and inference consistent.
    """
    try:
        # urlparse itself raises ValueError on malformed netlocs (e.g. "http://[bad").
        host = urlparse(url).hostname
    except ValueError:
        return 0

    if not host:
        # No scheme / empty authority -> nothing that could be an IP literal.
        return 0

    try:
        ipaddress.ip_address(host)
    except ValueError:
        return 0
    return 1

In [6]:
def count_special_chars(url):
    """Return how many of the characters . / ? - _ @ = occur in ``url``."""
    special_hits = re.finditer(r'[./?\-_@=]', url)
    # Tally one per match instead of materialising the full match list.
    return sum(1 for _ in special_hits)

In [7]:
def extract_features(url_list):
    """Build the model's feature frame for a list of URLs.

    Computes lexical features (length, character counts), host features
    (sub-domain count, IP-literal flag, https flag) and a one-hot encoded
    TLD bucket, then aligns the result to ``feature_columns`` so the column
    set and order match what was saved at training time.

    Parameters
    ----------
    url_list : list of str
        URLs to featurise.

    Returns
    -------
    pandas.DataFrame
        One row per URL, columns exactly ``feature_columns``; any training
        column not produced here is filled with 0.
    """
    df = pd.DataFrame({'url': url_list})

    # Parse each URL with tldextract once and reuse the result for both the
    # sub-domain count and the TLD bucket.
    domain_parts = [tldextract.extract(u) for u in df['url']]

    df['url_length'] = df['url'].apply(len)
    df['num_special_chars'] = df['url'].apply(count_special_chars)
    df['num_digits'] = df['url'].apply(lambda x: sum(c.isdigit() for c in x))
    df['num_letters'] = df['url'].apply(lambda x: sum(c.isalpha() for c in x))
    # NOTE(review): ''.split('.') == [''], so a host with no sub-domain is
    # counted as 1, the same as a single sub-domain such as "www". Left
    # unchanged here because the saved model was presumably trained on this
    # definition -- confirm before altering.
    df['num_subdomains'] = [len(p.subdomain.split('.')) for p in domain_parts]
    df['has_ip'] = df['url'].apply(has_ip)
    df['uses_https'] = df['url'].apply(lambda x: 1 if urlparse(x).scheme == 'https' else 0)

    # Keep only the top 10 most frequent TLDs, others as 'other'
    top_tlds = ['com', 'org', 'net', 'edu', 'gov', 'co', 'info', 'biz', 'xyz', 'cn']
    df['tld'] = [p.suffix if p.suffix in top_tlds else 'other' for p in domain_parts]

    # One-hot encode every TLD level that is present. drop_first must NOT be
    # used at inference time: it would drop whichever level sorts first in
    # *this batch* (e.g. 'com' in an all-.com batch), zeroing a column the
    # model expects to be 1. The reindex below already discards the
    # reference level that was dropped during training.
    df = pd.get_dummies(df, columns=['tld'])

    # Align with training features
    df = df.reindex(columns=feature_columns, fill_value=0)

    return df


In [8]:
def predict_urls(url_list):
    """Predict whether the given URLs are phishing or legitimate.

    Parameters
    ----------
    url_list : iterable of str
        URLs to classify. It is materialised once so that one-shot iterables
        (generators, iterators) can be both featurised and used as result keys.

    Returns
    -------
    dict
        Maps each URL to the label returned by ``clf.predict``. Duplicate
        URLs collapse to a single key. An empty input yields ``{}``.
    """
    urls = list(url_list)
    if not urls:
        # Nothing to score; avoid handing a zero-row frame to the selector/model.
        return {}

    features = extract_features(urls)
    selected = selector.transform(features)  # Apply feature selection
    predictions = clf.predict(selected)
    return dict(zip(urls, predictions))

In [9]:
# Sample URLs for a quick smoke test of predict_urls: two https URLs on
# apple.com / amazon.com hosts and two plain-http "login"-style domains.
urls_to_test = [
    "https://www.apple.com",
    "http://fake-bank-login.com",
    "https://secure.amazon.com",
    "http://paypal-verification-login.com"
]

In [10]:
# Score the sample URLs and print one human-readable verdict per URL.
results = predict_urls(urls_to_test)
print("\nPrediction Results:")
for url, pred in results.items():
    # A prediction of 1 is reported as phishing; anything else as legitimate.
    label = 'Phishing' if pred == 1 else 'Legitimate'
    print(f"{url}: {label}")


Prediction Results:
https://www.apple.com: Legitimate
http://fake-bank-login.com: Phishing
https://secure.amazon.com: Legitimate
http://paypal-verification-login.com: Phishing
