In [5]:
import pandas as pd
import numpy as np
import re
import joblib

# Load the trained model and scaler from files in the current working directory.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code; only load artifacts that come from a trusted source.
clf = joblib.load('random_forest_model.pkl')
scaler = joblib.load('scaler.pkl')

# Display names for the model's two classes. classify_url pairs index 0 with
# the first predict_proba column and index 1 with the second.
# NOTE(review): assumes the model's first class means "malicious" — confirm
# against the label encoding used at training time.
class_names = ["Malicious", "Benign"]

# Function to extract features from a single URL
def extract_features_from_url(url):
    """Return a one-row DataFrame of features derived from ``url``.

    Five columns are computed from the URL text (IP-like pattern, length,
    '@' presence, '-' presence, 'https' prefix). The remaining columns are
    fixed placeholder constants and do not depend on the input.
    """
    ip_match = re.search(r'\d+\.\d+\.\d+\.\d+', url)

    # Keys are inserted in the order the columns should appear in the frame.
    row = {}
    row['having_ip_address'] = 0 if ip_match is None else 1
    row['url_length'] = len(url)
    row['shortining_service'] = 0  # placeholder: shortener detection not implemented
    row['having_at_symbol'] = 1 if '@' in url else 0
    row['double_slash_redirecting'] = 0  # placeholder
    row['prefix_suffix'] = 1 if '-' in url else 0
    row['sslfinal_state'] = 1  # placeholder: SSL is assumed valid, not checked
    row['domain_registration_length'] = 1  # placeholder
    row['favicon'] = 1  # placeholder
    row['port'] = 0  # placeholder
    row['https_token'] = 1 if url.startswith('https') else 0

    return pd.DataFrame([row])

# Main function for URL classification
def classify_url(url):
    """Classify a URL using the module-level model ``clf`` and ``scaler``.

    Args:
        url: The URL string to classify.

    Returns:
        A dict with keys ``"url"`` (the input), ``"predicted_class"`` (an
        entry of ``class_names``) and ``"probabilities"`` (a dict mapping
        each of the two class names to its predicted probability).
    """
    # Extract features from the URL
    url_features = extract_features_from_url(url)

    # Scale features using the pre-fitted scaler
    url_features_scaled = scaler.transform(url_features)

    # Predict the class and probabilities
    prediction = clf.predict(url_features_scaled)
    probabilities = clf.predict_proba(url_features_scaled)

    # predict() returns a class *label*, while predict_proba() columns follow
    # the order of clf.classes_. Resolve the label to its position in
    # clf.classes_ so the predicted name always matches the probability
    # labels below, even when the training labels are not exactly 0 and 1
    # (e.g. -1/1, where indexing class_names by the raw label would be wrong).
    class_index = list(clf.classes_).index(prediction[0])
    predicted_class = class_names[class_index]

    # Return classification results
    return {
        "url": url,
        "predicted_class": predicted_class,
        "probabilities": {
            class_names[0]: probabilities[0][0],
            class_names[1]: probabilities[0][1]
        }
    }

if __name__ == "__main__":
    # Accept URL input from the user. Surrounding whitespace is stripped so
    # that stray spaces from copy/paste do not inflate the url_length feature.
    user_url = input("Enter a URL to classify: ").strip()

    if not user_url:
        # An empty entry carries no information; skip classification rather
        # than reporting a score for the empty string.
        print("No URL entered; nothing to classify.")
    else:
        results = classify_url(user_url)

        # Print results
        print("\nURL Classification Results:")
        print(f"Entered URL: {results['url']}")
        print(f"Predicted Class: {results['predicted_class']}")
        print("Probability Scores:")
        for class_name, score in results['probabilities'].items():
            print(f"  {class_name}: {score:.4f}")



URL Classification Results:
Entered URL: https://bis-usonic.eu/components/com_cmc/models/iui/DHLAUTO/dhl.php?rand=13InboxLightaspxn.1774256418&amp;fid.4.1252899642&amp;fid=1&amp;fav.1&amp;rand.13InboxLight.aspxn.1774256418&amp;fid.1252899642&amp;fid.1&amp;fav.1&amp;email=&amp;.rand=13InboxLight.aspx?n=1774256418&amp;fid=4#n=1252899642&amp;fid=1&amp;fav=1
Predicted Class: Benign
Probability Scores:
  Malicious: 0.1300
  Benign: 0.8700
