In [1]:
import re
import requests
from urllib.parse import urlparse

def is_suspicious_url(url):
    """
    Check if a URL is suspicious based on its structure.

    The URL string is searched for a fixed set of red-flag patterns; the
    function reports whether any one of them occurs.

    Args:
        url: The URL to inspect, as a string.

    Returns:
        True if at least one pattern matches somewhere in ``url``,
        otherwise False.
    """
    # Regex for identifying long URLs or obfuscation techniques
    suspicious_patterns = (
        r"@",  # A literal "@" anywhere in the URL
        r"\.exe$",  # URL ends in ".exe"
        # Common URL shorteners. The names are anchored on host-label
        # boundaries so that "t.co" no longer matches inside ordinary hosts
        # such as "microsoft.com" or "reddit.com", nor a longer domain like
        # "t.co.uk". Matching is case-insensitive because host names are.
        r"(?i)(?<![a-z0-9-])(?:bit\.ly|goo\.gl|t\.co)(?![a-z0-9-]|\.[a-z0-9])",
        # Multiple subdomains (four or more dot-separated labels; this also
        # matches dotted-quad IP addresses)
        r"([a-zA-Z0-9]+[.]){3}[a-zA-Z0-9]+",
        r"[0-9a-fA-F]{32,}",  # Potential hex patterns
    )

    return any(re.search(pattern, url) for pattern in suspicious_patterns)

def check_domain_reputation(domain):
    """
    Report whether a domain name contains a keyword commonly seen in phishing hosts.

    This is a local stand-in for a real reputation lookup: no external
    service is contacted, only a case-insensitive substring test is done.

    Args:
        domain: Host portion of a URL, as a string.

    Returns:
        True if any of the keywords occurs in ``domain``, otherwise False.
    """
    # A real implementation could query a service such as VirusTotal or
    # PhishTank here; the keyword list below only simulates that.
    lowered = domain.lower()
    return any(
        word in lowered
        for word in ("login", "secure", "verify", "account", "update")
    )

def is_valid_url(url):
    """
    Check if the URL is accessible and valid.

    Issues an HTTP GET for ``url`` with a 5-second timeout and looks only at
    the resulting status code.

    Args:
        url: The URL to request.

    Returns:
        True if the request completes with status 200, 301 or 302.
        False for any other status code, or if ``requests`` raises a
        ``RequestException`` while making the request.
    """
    try:
        # Only the status line is needed. stream=True avoids downloading the
        # body of a possibly hostile URL, and the with-block closes the
        # response so the connection is released.
        with requests.get(url, timeout=5, stream=True) as response:
            # Check for a redirect, unusual status codes, etc.
            return response.status_code in (200, 301, 302)
    except requests.RequestException:
        return False

def scan_url(url):
    """
    Run every phishing check against a URL and report a verdict.

    The checks run in a fixed order (URL structure, domain keywords, then
    reachability) and stop at the first one that fires. Progress and the
    reason for a positive verdict are printed to stdout.

    Args:
        url: The URL to scan.

    Returns:
        "Phishing suspected" if any check fires, otherwise "Likely safe".
    """
    print(f"Scanning URL: {url}")

    host = urlparse(url).netloc

    # Each entry pairs a lazily evaluated check with the warning to print
    # when it fires; later checks are not evaluated once one has fired.
    ordered_checks = (
        (lambda: is_suspicious_url(url),
         "⚠️ Suspicious URL structure detected."),
        (lambda: check_domain_reputation(host),
         "⚠️ Domain contains suspicious keywords."),
        (lambda: not is_valid_url(url),
         "⚠️ URL is unreachable or invalid."),
    )

    for fires, warning in ordered_checks:
        if fires():
            print(warning)
            return "Phishing suspected"

    print("✅ No immediate phishing signs detected.")
    return "Likely safe"

# Example usage: scan a handful of sample URLs when run as a script.
# NOTE: scan_url may perform a live HTTP request for each URL, so the printed
# verdicts depend on network access at the time of the run.
if __name__ == "__main__":
    # Sample inputs: ordinary sites, a private IP address with a /login path,
    # and a made-up host name.
    test_urls = [
        "http://example.com",
        "https://www.google.com/",
        "https://www.whatsapp.com/",
        "http://192.168.1.1/login",
        "http://normalwebsite.com/safe-page"
    ]
    
    # scan_url prints its own progress lines; only the returned verdict
    # string is echoed here, followed by a blank line between URLs.
    for test_url in test_urls:
        result = scan_url(test_url)
        print(f"Result: {result}\n")


Scanning URL: http://example.com
✅ No immediate phishing signs detected.
Result: Likely safe

Scanning URL: https://www.google.com/
✅ No immediate phishing signs detected.
Result: Likely safe

Scanning URL: https://www.whatsapp.com/
✅ No immediate phishing signs detected.
Result: Likely safe

Scanning URL: http://192.168.1.1/login
⚠️ Suspicious URL structure detected.
Result: Phishing suspected

Scanning URL: http://normalwebsite.com/safe-page
⚠️ URL is unreachable or invalid.
Result: Phishing suspected



In [None]:
pip install nbconvert
