In [13]:
import pandas as pd
import numpy as np
import xgboost as xgb
import tldextract
from urllib.parse import urlparse
import re
import pickle
import os

In [14]:
# Feature extraction function (same as used for training)
def extract_url_features(url):
    features = {}
    
    # Parse URL components
    parsed = urlparse(url)
    extracted = tldextract.extract(url)
    
    # Basic length features
    features['url_length'] = len(url)
    features['domain_length'] = len(extracted.domain)
    features['path_length'] = len(parsed.path)
    
    # Domain-specific features
    features['subdomain_length'] = len(extracted.subdomain)
    features['tld_length'] = len(extracted.suffix) if extracted.suffix else 0
    
    # Character distribution
    features['num_digits'] = sum(c.isdigit() for c in url)
    features['num_letters'] = sum(c.isalpha() for c in url)
    features['num_special'] = len(url) - features['num_digits'] - features['num_letters']
    
    # Special character counts
    features['count_dots'] = url.count('.')
    features['count_hyphens'] = url.count('-')
    features['count_underscores'] = url.count('_')
    features['count_slashes'] = url.count('/')
    features['count_equals'] = url.count('=')
    features['count_at'] = url.count('@')
    features['count_and'] = url.count('&')
    features['count_question'] = url.count('?')
    features['count_percent'] = url.count('%')
    features['count_plus'] = url.count('+')
    features['count_asterisk'] = url.count('*')
    features['count_exclamation'] = url.count('!')
    
    # Binary features
    features['has_ip_address'] = 1 if re.search(r'\d+\.\d+\.\d+\.\d+', url) else 0
    features['has_http'] = 1 if 'http://' in url else 0
    features['has_https'] = 1 if 'https://' in url else 0
    features['has_port'] = 1 if re.search(r':\d+', url) else 0
    # features['has_suspicious_words'] = 1 if re.search(r'(login|bank|account|secure|update|confirm)', url.lower()) else 0
    
    # # Ratios and derived features
    # features['digits_to_letters_ratio'] = features['num_digits'] / features['num_letters'] if features['num_letters'] > 0 else 0
    # features['special_to_total_ratio'] = features['num_special'] / len(url) if len(url) > 0 else 0
    
    return features

In [15]:
def load_model(model_path):
    """Load the trained XGBoost model"""
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model file not found at {model_path}")
    
    try:
        model = pickle.load(open(model_path, 'rb'))
        return model
    except Exception as e:
        print(f"Error loading model: {e}")
        return None

def load_scaler(scaler_path):
    """Load the fitted StandardScaler"""
    if not os.path.exists(scaler_path):
        raise FileNotFoundError(f"Scaler file not found at {scaler_path}")
    
    try:
        scaler = pickle.load(open(scaler_path, 'rb'))
        return scaler
    except Exception as e:
        print(f"Error loading scaler: {e}")
        return None

In [10]:
def predict_urls(urls, model, scaler=None):
    """
    Predict multiple URLs using the trained model
    
    Args:
        urls (list): List of URLs to predict
        model: Trained XGBoost model
        scaler: Fitted StandardScaler (optional)
        
    Returns:
        DataFrame with URLs and predictions
    """
    results = []
    
    for url in urls:
        # Extract features
        features = extract_url_features(url)
        features_df = pd.DataFrame([features])
        
        # Scale features if scaler is provided
        if scaler:
            features_scaled = scaler.transform(features_df)
            # Make prediction
            prediction = model.predict(features_scaled)[0]
            probability = model.predict_proba(features_scaled)[0]
        else:
            # Make prediction without scaling
            prediction = model.predict(features_df)[0]
            probability = model.predict_proba(features_df)[0]
        
        # Store results
        results.append({
            'url': url,
            'prediction': int(prediction),
            'label': 'Legitimate' if prediction == 1 else 'Malicious',
            'confidence': probability[1] if prediction == 1 else probability[0]
        })
    
    return pd.DataFrame(results)

In [16]:
# Interactive testing script
if __name__ == "__main__":
    # Paths to saved model and scaler
    MODEL_PATH = 'xgboost_url_classifier.pkl'  # Update with your model path
    SCALER_PATH = 'url_scaler.pkl'  # Update with your scaler path
    
    # Check if model exists, if not, ask user
    if not os.path.exists(MODEL_PATH):
        MODEL_PATH = input("Enter the path to your XGBoost model file (.pkl): ")
    
    # Load model
    model = load_model(MODEL_PATH)
    if model is None:
        print("Failed to load model. Exiting.")
        exit(1)
    
    # Try to load scaler if it exists
    scaler = None
    if os.path.exists(SCALER_PATH):
        scaler = load_scaler(SCALER_PATH)
        print("Scaler loaded successfully.")
    else:
        print("No scaler found. Proceeding without scaling.")

Scaler loaded successfully.


In [20]:
# Test modes
print("\nURL Classifier Testing Tool")
print("===========================")
print("1. Test individual URLs")
print("2. Test URLs from a file")
choice = input("Select option (1/2): ")

if choice == '1':
    # Individual URL testing
    urls_to_test = []
    while True:
        url = input("\nEnter a URL to test (or 'done' to finish): ")
        if url.lower() == 'done':
            break
        urls_to_test.append(url)
    
    if not urls_to_test:
        print("No URLs provided. Exiting.")
        exit(0)
        
    # Make predictions
    results = predict_urls(urls_to_test, model, scaler)
    
    # Display results
    print("\nPrediction Results:")
    print("------------------")
    for i, row in results.iterrows():
        print(f"URL: {row['url']}")
        print(f"Prediction: {row['label']} (Class {row['prediction']})")
        print(f"Confidence: {row['confidence']:.4f}")
        print("------------------")
        
elif choice == '2':
    # File-based testing
    file_path = input("Enter path to file containing URLs (one URL per line): ")
    
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        exit(1)
        
    # Read URLs from file
    try:
        with open(file_path, 'r') as f:
            urls_to_test = [line.strip() for line in f if line.strip()]
    except Exception as e:
        print(f"Error reading file: {e}")
        exit(1)
        
    if not urls_to_test:
        print("No URLs found in file. Exiting.")
        exit(0)
        
    print(f"Loaded {len(urls_to_test)} URLs from file.")
    
    # Make predictions
    results = predict_urls(urls_to_test, model, scaler)
    
    # Display summary
    legitimate_count = sum(results['prediction'] == 1)
    malicious_count = sum(results['prediction'] == 0)
    
    print("\nPrediction Summary:")
    print(f"Total URLs: {len(results)}")
    print(f"Legitimate URLs: {legitimate_count} ({legitimate_count/len(results)*100:.2f}%)")
    print(f"Malicious URLs: {malicious_count} ({malicious_count/len(results)*100:.2f}%)")
    
    # Save results to CSV
    output_file = f"url_prediction_results_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.csv"
    results.to_csv(output_file, index=False)
    print(f"\nDetailed results saved to {output_file}")
    
    # Show a few examples
    print("\nSample predictions:")
    print(results.head(10))
    
else:
    print("Invalid choice. Exiting.")


URL Classifier Testing Tool
1. Test individual URLs
2. Test URLs from a file
File not found: 
Error reading file: [Errno 2] No such file or directory: ''
Loaded 7 URLs from file.

Prediction Summary:
Total URLs: 7
Legitimate URLs: 0 (0.00%)
Malicious URLs: 7 (100.00%)

Detailed results saved to url_prediction_results_20250321_102338.csv

Sample predictions:
                                                url  prediction      label  \
0                             https://www.apple.com           0  Malicious   
1                            https://www.amazon.com           0  Malicious   
2                          https://www.facebook.com           0  Malicious   
3                   http://amazon-security-check.ga           0  Malicious   
4  http://facebook.com-user-login-authenticate.info           0  Malicious   
5         https://secure.google.accounts-signin.xyz           0  Malicious   
6                                               doe           0  Malicious   

   confidence 

In [18]:
print(model)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              feature_weights=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.1, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=7,
              max_leaves=None, min_child_weight=1, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=200,
              n_jobs=None, num_parallel_tree=None, ...)
