In [111]:
import pandas as pd
import numpy as np
from urllib.parse import urlparse
import math
import re
import joblib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from collections import defaultdict

# 1. Enhanced Feature Extraction (unchanged)
def extract_url_features(url):
    """Extracts phishing detection features from any URL"""
    try:
        parsed = urlparse(url)
        domain = parsed.netloc.split(':')[0]
        
        features = {
            'url_length': len(url),
            'domain_has_ip': int(bool(re.match(r'^\d+\.\d+\.\d+\.\d+$', domain))),
            'num_special_chars': sum(1 for c in url if c in '/:?&%=.-_~@'),
            'num_digits': sum(c.isdigit() for c in url),
            'domain_length': len(domain),
            'subdomain_length': len(domain.split('.')[0]),
            'num_subdomains': len(domain.split('.')) - 1,
            'is_common_tld': int(domain.endswith(('.com', '.org', '.net', '.gov'))),
            'typosquatting': int(any(t in domain for t in ['paypa1', 'g00gle', 'amaz0n'])),
            'has_banking_kw': int(any(kw in url.lower() for kw in ['login', 'bank', 'account', 'secure'])),
            'has_hex': int(bool(re.search(r'%[0-9a-fA-F]{2}', url))),
            'has_at_symbol': int('@' in url),
            'uses_https': int(parsed.scheme == 'https'),
            'path_depth': parsed.path.count('/'),
            'entropy': -sum((url.count(c)/len(url)) * math.log2(url.count(c)/len(url)) 
                       for c in set(url) if url.count(c) > 0),
            'vowel_ratio': sum(1 for c in domain if c.lower() in 'aeiou') / len(domain) if domain else 0,
            'consecutive_chars': int(bool(re.search(r'([a-zA-Z])\1{2}', domain))),
        }
        return features
    except:
        return None

# 2. Phishing Detector Class with Dynamic Domain Learning
class PhishingDetector:
    def __init__(self, model_path=None):
        self.model = Pipeline([
            ('scaler', StandardScaler()),
            ('classifier', LGBMClassifier(
                n_estimators=200,
                max_depth=7,
                class_weight='balanced',
                random_state=42
            ))
        ])
        self.trusted_domains = set()  # Will be learned from data
        self.domain_reputation = defaultdict(int)
        
        if model_path:
            try:
                saved_data = joblib.load(model_path)
                self.model = saved_data['model']
                self.trusted_domains = set(saved_data.get('trusted_domains', []))
                print("Loaded pre-trained model with dynamic domain knowledge")
            except:
                print("Could not load model, using untrained model")

    def _extract_base_domain(self, url):
        """Extracts base domain (e.g. 'google.com' from 'mail.google.com')"""
        domain = urlparse(url).netloc.lower()
        if domain.startswith('www.'):
            domain = domain[4:]
        parts = domain.split('.')
        if len(parts) > 2:
            return f"{parts[-2]}.{parts[-1]}"
        return domain

    def train(self, X, y):
        """Train model and learn trusted domains"""
        # Extract domains from training data
        domains = [self._extract_base_domain(url) for url in X.index]
        legit_domains = [d for d, label in zip(domains, y) if label == 0]
        
        # Calculate domain frequencies
        domain_counts = pd.Series(legit_domains).value_counts()
        
        # Automatically identify trusted domains (top 100 legitimate domains)
        self.trusted_domains = set(domain_counts.head(100).index)
        print(f"Learned {len(self.trusted_domains)} trusted domains from training data")
        
        # Train the model
        self.model.fit(X, y)

    def save_model(self, path):
        """Save model with learned domain knowledge"""
        joblib.dump({
            'model': self.model,
            'trusted_domains': list(self.trusted_domains)
        }, path)

    def is_trusted_domain(self, url):
        """Dynamic domain verification"""
        domain = self._extract_base_domain(url)
        return domain in self.trusted_domains

    def predict(self, url):
        """Predict with dynamic domain analysis"""
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
            
        domain = self._extract_base_domain(url)
        
        # Dynamic trusted domain check
        if self.is_trusted_domain(url):
            return {
                'url': url,
                'is_phishing': False,
                'confidence': 'high',
                'message': f'Automatically verified domain: {domain}',
                'probability': 0.01,  # Very low probability for trusted domains
                'indicators': [f'Trusted domain pattern: {domain}']
            }

        features = extract_url_features(url)
        if not features:
            return {
                'url': url,
                'is_phishing': None,
                'error': 'Could not extract features',
                'confidence': 'low'
            }

        try:
            features_df = pd.DataFrame([features])
            proba = self.model.predict_proba(features_df)[0][1]
            
            # Adjust confidence based on domain characteristics
            if domain.endswith(('.gov', '.edu', '.mil')):
                proba *= 0.5  # Reduce suspicion for government/education domains
                
            is_phishing = proba > 0.85
            
            indicators = []
            if features['domain_has_ip']:
                indicators.append("Uses IP address instead of domain")
            if features['typosquatting']:
                indicators.append("Contains typosquatting patterns")
            if features['entropy'] > 3.5:
                indicators.append(f"High randomness (entropy: {features['entropy']:.2f})")
            
            return {
                'url': url,
                'is_phishing': bool(is_phishing),
                'probability': float(proba),
                'confidence': 'high' if proba > 0.9 or proba < 0.1 else 'medium',
                'domain': domain,
                'indicators': indicators if indicators else ['No strong indicators']
            }
        except Exception as e:
            return {
                'url': url,
                'is_phishing': None,
                'error': str(e),
                'confidence': 'low'
            }

# 3. Main Execution (updated to track domains)
if __name__ == "__main__":
    detector = PhishingDetector()
    
    try:
        print("Loading dataset...")
        data = pd.read_csv('C:/Users/msi/OneDrive/Bureau/CyberIA/StealthPhisher2025.csv')
        
        # Auto-detect columns
        url_col = next((col for col in data.columns if 'url' in col.lower()), 'URL')
        label_col = next((col for col in data.columns if col.lower() in ['label', 'phishing']), 'Label')
        
        print(f"Using columns: URLs='{url_col}', Labels='{label_col}'")
        
        # Convert labels
        label_mapping = {
            'legitimate': 0, 'phishing': 1, 'safe': 0, 'malicious': 1,
            '0': 0, '1': 1, 'false': 0, 'true': 1
        }
        y = data[label_col].astype(str).str.lower().map(label_mapping)
        
        if y.isna().any():
            invalid_labels = data[label_col][y.isna()].unique()
            raise ValueError(f"Found unmapped labels: {invalid_labels}")
        
        # Extract features and track URLs for domain learning
        print("Extracting features...")
        features = []
        valid_indices = []
        url_index = []  # To keep track of original URLs
        
        for i, url in enumerate(data[url_col]):
            feat = extract_url_features(str(url))
            if feat:
                features.append(feat)
                valid_indices.append(i)
                url_index.append(url)
        
        X = pd.DataFrame(features, index=url_index)  # Store URLs in index
        y = y.iloc[valid_indices].astype(int)
        
        print(f"Successfully processed {len(X)} URLs")
        print("Label distribution:")
        print(y.value_counts())
        
        # Train model (will automatically learn domains)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        print("\nTraining model...")
        detector.train(X_train, y_train)
        detector.save_model('phishing_model.pkl')
        print("Model trained and saved")
        
        # Show some learned domains
        print("\nSample of automatically learned trusted domains:")
        print(list(detector.trusted_domains)[:10])
        
        # Interactive testing
        print("\n" + "="*50)
        print("PHISHING DETECTOR READY FOR INPUT")
        print("="*50)
        
        while True:
            url = input("\nEnter URL to check (or 'quit'): ").strip()
            if url.lower() == 'quit':
                break
                
            result = detector.predict(url)
            print("\n=== Analysis Result ===")
            print(f"URL: {result['url']}")
            print(f"Domain: {result.get('domain', 'N/A')}")
            print("Status:", "PHISHING" if result.get('is_phishing') else "SAFE")
            print(f"Confidence: {result.get('confidence', 'unknown')}")
            if 'probability' in result:
                print(f"Probability: {result['probability']:.2%}")
            print("Key Indicators:")
            for indicator in result.get('indicators', ['No strong indicators']):
                print(f"- {indicator}")
                
    except Exception as e:
        print(f"\nERROR: {str(e)}")
        print("\nTroubleshooting Guide:")
        print("1. Verify CSV file exists and is accessible")
        print("2. Check column headers match expected format")
        print("3. Ensure URLs are properly formatted")

Loading dataset...
Using columns: URLs='URL', Labels='Label'
Extracting features...
Successfully processed 336749 URLs
Label distribution:
Label
1    175806
0    160943
Name: count, dtype: int64

Training model...
Learned 100 trusted domains from training data
Model trained and saved

Sample of automatically learned trusted domains:
['ac.in', 'gov.in', 'bel.tr', 'co.jp', 'go.jp', 'gov.uk', 'co.th', 'net.au', 'com.my', 'com.uy']

PHISHING DETECTOR READY FOR INPUT



Enter URL to check (or 'quit'):  https://www.facebook.com/



=== Analysis Result ===
URL: https://www.facebook.com/
Domain: facebook.com
Status: PHISHING
Confidence: high
Probability: 99.96%
Key Indicators:
- High randomness (entropy: 3.83)



Enter URL to check (or 'quit'):  https://www.kaggle.com/datasets/naserabdullahalam/phishing-email-dataset/



=== Analysis Result ===
URL: https://www.kaggle.com/datasets/naserabdullahalam/phishing-email-dataset/
Domain: kaggle.com
Status: PHISHING
Confidence: high
Probability: 99.99%
Key Indicators:
- High randomness (entropy: 4.17)



Enter URL to check (or 'quit'):  quit
