# Phishing Detection System using Rule-Based URL Analysis.

In [1]:
import requests
import whois
from datetime import datetime
from urllib.parse import urlparse
import re

# --- Configuration ---
# Any score above this threshold will be flagged as phishing.
RISK_THRESHOLD = 3

# --- Heuristic Checks ---

def get_domain_age(domain):
    """
    Checks the age of the domain in days.
    Newer domains are often considered riskier.
    """
    try:
        domain_info = whois.whois(domain)
        creation_date = domain_info.creation_date
        
        # whois can return a list of dates, so we take the first one
        if isinstance(creation_date, list):
            creation_date = creation_date[0]
            
        days_diff = (datetime.now() - creation_date).days
        print(f"  [INFO] Domain Age: {days_diff} days old.")
        # If the domain is less than 6 months old, it's a risk.
        return 1 if days_diff < 180 else 0
    except Exception as e:
        print(f"  [WARNING] Could not determine domain age. Error: {e}")
        # If we can't determine the age, we treat it as risky.
        return 1

def check_https(url):
    """
    Checks if the URL uses a secure HTTPS connection.
    Lack of HTTPS is a major red flag.
    """
    try:
        # We set a timeout to avoid waiting too long
        response = requests.get(url, timeout=5)
        # Check if the final URL after any redirects is HTTPS
        if response.url.startswith("https://"):
            print("  [INFO] HTTPS Status: Secure.")
            0
        else:
            print("  [WARNING] HTTPS Status: Insecure (HTTP).")
            return 1
    except requests.exceptions.RequestException:
        print("  [WARNING] Could not connect to the URL to verify HTTPS.")
        return 1

def check_url_patterns(url, parsed_url):
    """
    Checks for common suspicious patterns in the URL string itself.
    """
    risk_score = 0
    
    # 1. IP Address in Hostname: A big red flag.
    if re.match(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", parsed_url.netloc):
        risk_score += 1
        print("  [WARNING] Pattern: IP address used as hostname.")
        
    # 2. Presence of '@' symbol in the URL.
    if "@" in url:
        risk_score += 1
        print("  [WARNING] Pattern: '@' symbol found in URL.")

    # 3. Excessive number of dots (subdomains).
    if url.count('.') > 4:
        risk_score += 1
        print(f"  [WARNING] Pattern: Excessive dots ({url.count('.')}) found.")
        
    # 4. Presence of sensitive keywords.
    sensitive_keywords = ['secure', 'login', 'verify', 'account', 'update', 'signin']
    if any(keyword in url.lower() for keyword in sensitive_keywords):
        risk_score += 1
        print("  [WARNING] Pattern: Sensitive keywords found in URL.")
        
    return risk_score

# --- Main Analysis Function ---

def analyze_url(url):
    """
    Analyzes a URL by combining all heuristic checks and calculating a final risk score.
    """
    print("-" * 50)
    print(f"Analyzing URL: {url}")
    
    try:
        # Add 'http://' if no scheme is present for parsing
        if not url.startswith(('http://', 'https://')):
            url = 'http://' + url
        
        parsed_url = urlparse(url)
        domain = parsed_url.netloc
        
        # If the domain is empty, it's an invalid URL
        if not domain:
            print("Invalid URL provided.")
            return

        total_risk_score = 0
        
        # Run each check and add to the total score
        total_risk_score += get_domain_age(domain)
        total_risk_score += check_https(url)
        total_risk_score += check_url_patterns(url, parsed_url)
        
        print(f"\nFinal Risk Score: {total_risk_score}")
        
        # Make the final verdict
        if total_risk_score >= RISK_THRESHOLD:
            print("Verdict: 🚨 PHISHING ATTEMPT 🚨")
        else:
            print("Verdict: ✅ Likely Safe ✅")
            
        print("-" * 50)

    except Exception as e:
        print(f"An unexpected error occurred: {e}")

# --- Example Usage ---
if __name__ == "__main__":
    # Example of a safe URL
    analyze_url("https://www.google.com")
    
    # Example of a potentially malicious URL
    analyze_url("http://google-secure-login.com/update-your-account")

    # Example with an IP address (very suspicious)
    analyze_url("http://192.168.1.1/paypal.com/login")

--------------------------------------------------
Analyzing URL: https://www.google.com
  [INFO] HTTPS Status: Secure.
An unexpected error occurred: unsupported operand type(s) for +=: 'int' and 'NoneType'
--------------------------------------------------
Analyzing URL: http://google-secure-login.com/update-your-account

Final Risk Score: 3
Verdict: 🚨 PHISHING ATTEMPT 🚨
--------------------------------------------------
--------------------------------------------------
Analyzing URL: http://192.168.1.1/paypal.com/login

Final Risk Score: 4
Verdict: 🚨 PHISHING ATTEMPT 🚨
--------------------------------------------------
