# PrivInspect ML Model Training - Complete Setup Guide

This notebook provides a complete end-to-end setup for training and deploying the PrivInspect domain ranking model on any fresh device.

## Prerequisites

- Python 3.8+ installed
- Internet connection (for downloading dependencies and TrackerRadar data)
- ~2GB free disk space for data and model files

## What This Notebook Does

1. **Installs all dependencies** automatically
2. **Downloads DuckDuckGo TrackerRadar data** (51,198 domains)
3. **Trains ML model** with aggressive privacy detection
4. **Tests model performance** on known tracking/legitimate domains
5. **Verifies integration** with FastAPI backend
6. **Saves trained models** for production use

## Expected Runtime

- Fresh install: ~10-15 minutes
- Model training: ~5-10 minutes
- Total: ~20-25 minutes

# Step 1: Install Dependencies

Automatically installs all required Python packages for machine learning, data processing, and visualization. This ensures the notebook works on any fresh device without manual setup.

In [None]:
# Step 1: Install all required dependencies
import subprocess
import sys
import os

def install_package(package_name, import_name=None):
    """Ensure a package is importable, installing it with pip if needed.

    Args:
        package_name: Distribution name passed to ``pip install``.
        import_name: Module name to import when it differs from the
            distribution name (e.g. ``scikit-learn`` -> ``sklearn``).
            Defaults to ``package_name``.

    Returns:
        True if the module could be imported, either straight away or after
        a successful install. False if pip exits with an error or the module
        still cannot be imported after installation.
    """
    import importlib

    if import_name is None:
        import_name = package_name

    try:
        importlib.import_module(import_name)
        print(f"‚úÖ {package_name} already installed")
        return True
    except ImportError:
        print(f"‚è≥ Installing {package_name}...")

    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
    except subprocess.CalledProcessError as e:
        print(f"‚ùå Failed to install {package_name}: {e}")
        return False

    # Modules installed while the interpreter is running may be hidden by the
    # import system's directory caches; clear them before verifying.
    importlib.invalidate_caches()
    try:
        importlib.import_module(import_name)
    except ImportError as e:
        print(f"‚ùå Installed {package_name} but could not import {import_name}: {e}")
        return False

    print(f"‚úÖ {package_name} installed successfully")
    return True

# Core dependencies for ML and data processing, as (pip name, import name) pairs.
# scipy is listed explicitly because the training step imports scipy.stats
# directly instead of relying on it arriving as a dependency of another package.
packages = [
    ("pandas", "pandas"),
    ("numpy", "numpy"),
    ("scipy", "scipy"),
    ("scikit-learn", "sklearn"),
    ("lightgbm", "lightgbm"),
    ("matplotlib", "matplotlib"),
    ("seaborn", "seaborn"),
    ("requests", "requests"),
    ("joblib", "joblib"),
    ("tqdm", "tqdm"),
]

print("üîß Installing required packages...")
failed_packages = []

# Attempt every package even if an earlier one fails, so the user gets the
# complete list of what needs manual attention.
for package_name, import_name in packages:
    success = install_package(package_name, import_name)
    if not success:
        failed_packages.append(package_name)

if failed_packages:
    print(f"‚ùå Failed to install: {failed_packages}")
    print("Please install these manually and rerun the notebook")
else:
    print("‚úÖ All packages installed successfully!")

# Report the interpreter and working directory in use
print(f"\nüêç Python version: {sys.version}")
print(f"üìÅ Working directory: {os.getcwd()}")

# Import everything under the aliases the later cells rely on; a failure here
# is reported rather than raised.
try:
    import pandas as pd
    import numpy as np
    import scipy
    import matplotlib.pyplot as plt
    import seaborn as sns
    import sklearn
    import lightgbm as lgb
    import requests
    import joblib
    from tqdm import tqdm
    print("‚úÖ All imports successful!")
except ImportError as e:
    print(f"‚ùå Import failed: {e}")
    print("Please restart the notebook kernel and try again")

# Step 2: Project Setup and Data Download

Creates the necessary project directory structure and defines the TrackerRadar data download functionality. This sets up everything needed to fetch and organize the DuckDuckGo TrackerRadar dataset.

In [None]:
# Step 2: Setup project structure and create training module
import os
import json
import requests
from pathlib import Path
from typing import Dict, List, Optional, Union
import warnings
warnings.filterwarnings('ignore')

# Ensure the data, models and scripts folders exist one level above the
# current working directory (existing folders are left untouched)
directories = ['../data', '../models', '../scripts']
for dir_path in directories:
    Path(dir_path).mkdir(parents=True, exist_ok=True)
    print(f"üìÅ Created/verified directory: {dir_path}")

# Define the complete training module inline (for standalone operation)
class TrackerRadarParser:
    """Download the DuckDuckGo TrackerRadar archive and stage it under ``../data``.

    Args:
        base_url: URL of the zip archive to fetch.
        timeout: Seconds handed to ``requests`` as the connect/read timeout.
            This bounds how long a stalled connection is tolerated, not the
            total download time.
    """

    def __init__(self, base_url="https://github.com/duckduckgo/tracker-radar/archive/refs/heads/main.zip", timeout=60.0):
        self.base_url = base_url
        self.timeout = timeout
        self.data_dir = Path("../data")

    def download_tracker_radar(self) -> Dict:
        """Download the archive, extract it and copy it to ``data_dir/tracker-radar``.

        Any existing ``tracker-radar`` folder in ``data_dir`` is deleted and
        replaced by the fresh copy.

        Returns:
            A dict with three keys:
            ``domains`` - number of ``*.json`` files summed over every
            country sub-directory of ``domains/``;
            ``countries`` - number of those sub-directories;
            ``data_path`` - the target folder as a string.

        Raises:
            requests.RequestException: On network errors, timeouts or a
                non-success HTTP status.
            ValueError: If the archive has no ``tracker-radar`` folder or the
                extracted data has no ``domains`` directory.
        """
        import shutil
        import tempfile
        import zipfile

        print("üì• Downloading DuckDuckGo TrackerRadar data...")

        with tempfile.TemporaryDirectory() as temp_dir:
            temp_root = Path(temp_dir)
            zip_path = temp_root / "tracker-radar.zip"

            # Stream straight to disk in chunks so the archive never has to
            # fit in memory, and use a timeout so a dead connection cannot
            # hang the notebook forever.
            with requests.get(self.base_url, stream=True, timeout=self.timeout) as response:
                response.raise_for_status()
                with open(zip_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)

            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(temp_root)

            # The zip unpacks into a single top-level folder whose name
            # contains "tracker-radar"; take the first match.
            extracted_folders = [
                d for d in temp_root.iterdir()
                if d.is_dir() and 'tracker-radar' in d.name
            ]
            if not extracted_folders:
                raise ValueError("Could not find tracker-radar folder in downloaded zip")

            tracker_radar_path = extracted_folders[0]

            # Replace any previous download with the fresh copy
            target_path = self.data_dir / "tracker-radar"
            if target_path.exists():
                shutil.rmtree(target_path)
            shutil.copytree(tracker_radar_path, target_path)

            print(f"‚úÖ TrackerRadar data downloaded to {target_path}")

        domains_path = target_path / "domains"
        if not domains_path.exists():
            raise ValueError("Downloaded data does not contain expected domains directory")

        # One sub-directory per country, each holding one JSON file per domain
        country_dirs = [d for d in domains_path.iterdir() if d.is_dir()]
        domain_count = sum(len(list(d.glob("*.json"))) for d in country_dirs)

        return {
            "domains": domain_count,
            "countries": len(country_dirs),
            "data_path": str(target_path)
        }

# Status lines confirming the parser class is defined and the download can start
print(
    "‚úÖ TrackerRadarParser class defined",
    "üöÄ Ready to download TrackerRadar data!",
    sep="\n",
)

# Step 3: Execute Data Download

Downloads the complete DuckDuckGo TrackerRadar dataset from GitHub, extracts it, and verifies the download. This provides the raw data for training the ML model.

In [None]:
# Step 3: Fetch the TrackerRadar dataset and report what was downloaded
parser = TrackerRadarParser()

try:
    tracker_info = parser.download_tracker_radar()

    # Summary of the returned metadata
    print(
        "üìä TrackerRadar Data Summary:",
        f"   Total domains: {tracker_info['domains']:,}",
        f"   Countries: {tracker_info['countries']}",
        f"   Data location: {tracker_info['data_path']}",
        sep="\n",
    )

    # Confirm the reported location is actually present on disk
    data_path = Path(tracker_info['data_path'])
    print(
        "‚úÖ TrackerRadar data successfully downloaded and verified!"
        if data_path.exists()
        else "‚ùå Data verification failed"
    )

except Exception as e:
    # Failures are reported rather than raised; note that tracker_info stays
    # undefined in that case and later cells depend on it.
    print(f"‚ùå Failed to download TrackerRadar data: {e}")
    print("Please check your internet connection and try again")

# Step 4: Define Advanced Feature Extraction

Creates the domain feature extraction system with aggressive privacy detection improvements. This includes enhanced category-based tracking detection, legitimate domain protection, and reduced importance of resource counts for better accuracy.

In [None]:
# Step 4: Define domain feature extraction with aggressive improvements
import json
import numpy as np
from pathlib import Path
from typing import Dict, Optional

class DomainFeatureExtractor:
    """Turn TrackerRadar per-domain JSON files into flat feature dicts.

    The extractor reads ``<data_path>/domains/<country>/<domain>.json`` and
    applies deliberately aggressive, category-based scoring on top of the
    fingerprinting value stored in each file.
    """

    # Categories counted as direct tracking signals
    TRACKING_CATEGORIES = frozenset({
        'Ad Motivated Tracking', 'Advertising', 'Analytics',
        'Audience Measurement', 'Third-Party Analytics Marketing',
        'Cross-site Tracking', 'Fingerprinting'
    })

    # Additional categories that raise the score, weighted separately
    AGGRESSIVE_CATEGORIES = frozenset({'Social - Share', 'Embedded Content'})

    # Domains whose prevalence is capped so popularity alone does not make
    # them look like trackers
    LEGITIMATE_DOMAINS = frozenset({
        'wikipedia.org', 'archive.org', 'mozilla.org', 'github.com',
        'stackoverflow.com', 'reddit.com', 'medium.com', 'twitter.com'
    })
    LEGITIMATE_PREVALENCE_CAP = 0.0005

    # Resource counts are scaled to 30% of their raw value (a 70% reduction)
    RESOURCE_COUNT_SCALE = 0.3

    def __init__(self, data_path: str):
        self.data_path = Path(data_path)
        self.domains_path = self.data_path / "domains"

    def parse_domain_json(self, json_path: Path) -> Optional[Dict]:
        """Parse one domain JSON file into a feature dict.

        Args:
            json_path: Path to the file; its stem (name without ``.json``)
                becomes the ``domain`` value.

        Returns:
            The feature dict, or None if the file cannot be read or parsed.
            The error is printed rather than raised so one bad file does not
            stop a bulk run.
        """
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            domain_name = json_path.stem

            base_fingerprinting = data.get('fingerprinting', 0)

            categories = data.get('categories', [])
            tracking_category_count = sum(
                1 for cat in categories if cat in self.TRACKING_CATEGORIES
            )
            aggressive_count = sum(
                1 for cat in categories if cat in self.AGGRESSIVE_CATEGORIES
            )

            # Score tiers: multi-category trackers start from a high base
            # value; the result is always capped at 3.
            if tracking_category_count >= 4:
                category_tracking_score = min(3, 2.5 + (tracking_category_count * 0.3) + (aggressive_count * 0.5))
            elif tracking_category_count >= 2:
                category_tracking_score = min(3, 2.0 + (tracking_category_count * 0.4) + (aggressive_count * 0.4))
            else:
                category_tracking_score = min(3, (tracking_category_count * 1.0) + (aggressive_count * 1.5))

            # The category score can only raise the stored value, never lower it
            enhanced_fingerprinting = max(base_fingerprinting, category_tracking_score)

            resources = data.get('resources', [])
            scaled_num_resources = len(resources) * self.RESOURCE_COUNT_SCALE

            global_prevalence = data.get('prevalence', 0.0)
            if domain_name in self.LEGITIMATE_DOMAINS:
                global_prevalence = min(global_prevalence, self.LEGITIMATE_PREVALENCE_CAP)

            # Tally resources by type and collect the non-zero fingerprinting scores.
            # NOTE(review): the type keys below are matched case-sensitively in
            # lower case - confirm this matches the casing used in the dataset.
            resource_types = {}
            resource_fingerprinting_scores = []
            for resource in resources:
                res_type = resource.get('type', 'unknown')
                resource_types[res_type] = resource_types.get(res_type, 0) + 1

                res_fingerprinting = resource.get('fingerprinting', 0)
                if res_fingerprinting > 0:
                    resource_fingerprinting_scores.append(res_fingerprinting)

            # Key order matters: downstream code derives the feature column
            # order from the order of this dict.
            features = {
                'domain': domain_name,
                'fingerprinting': enhanced_fingerprinting,
                'cookies_prevalence': data.get('cookies', 0.0),
                'global_prevalence': global_prevalence,
                'num_sites': data.get('sites', 0),
                'num_subdomains': len(data.get('subdomains', [])),
                'num_cnames': len(data.get('cnames', [])),
                'num_resources': scaled_num_resources,
                'num_top_initiators': len(data.get('topInitiators', [])),
                'owner_present': 1 if data.get('owner') else 0,
                'resource_type_script_count': resource_types.get('script', 0),
                'resource_type_xhr_count': resource_types.get('xmlhttprequest', 0),
                'resource_type_image_count': resource_types.get('image', 0),
                'resource_type_css_count': resource_types.get('stylesheet', 0),
                'resource_type_font_count': resource_types.get('font', 0),
                'resource_type_media_count': resource_types.get('media', 0),
                'avg_resource_fingerprinting': np.mean(resource_fingerprinting_scores) if resource_fingerprinting_scores else 0.0,
                'has_example_sites': 1 if data.get('exampleSites') else 0
            }

            return features

        except Exception as e:
            print(f"Error parsing {json_path}: {e}")
            return None

    def extract_all_features(self) -> Dict[str, Dict]:
        """Parse every ``*.json`` file under each country directory.

        Directories and files are visited in sorted order so the result does
        not depend on filesystem listing order. When the same domain appears
        under several countries, the entry parsed last replaces earlier ones.

        Returns:
            Mapping of domain name to its feature dict.

        Raises:
            ValueError: If the ``domains`` directory does not exist.
        """
        all_features = {}

        if not self.domains_path.exists():
            raise ValueError(f"Domains directory not found: {self.domains_path}")

        country_dirs = sorted(d for d in self.domains_path.iterdir() if d.is_dir())
        parsed_files = 0

        print(f"üîç Processing domains from {len(country_dirs)} countries...")

        for country_dir in country_dirs:
            print(f"üìç Processing {country_dir.name}...")
            json_files = sorted(country_dir.glob("*.json"))

            for i, json_file in enumerate(json_files, start=1):
                features = self.parse_domain_json(json_file)
                if features:
                    all_features[features['domain']] = features
                    parsed_files += 1

                # Progress update every 1000 files
                if i % 1000 == 0:
                    print(f"   Processed {i:,}/{len(json_files):,} files")

        # Report unique domains: files parsed can exceed this when a domain
        # is listed under more than one country.
        print(f"‚úÖ Extracted features for {len(all_features):,} domains")
        duplicate_files = parsed_files - len(all_features)
        if duplicate_files:
            print(f"   {duplicate_files:,} files repeated a domain already seen in another country")
        return all_features

# Status line confirming the cell that defines DomainFeatureExtractor has run
print("‚úÖ DomainFeatureExtractor class defined with aggressive improvements")

# Step 5: Process All Domain Data

Extracts features from all ~51,000 domains in the TrackerRadar dataset. This processes each domain's JSON file to create the training dataset with comprehensive privacy-focused features.

In [None]:
# Step 5: Run feature extraction over the downloaded dataset
extractor = DomainFeatureExtractor(tracker_info['data_path'])

print(
    "üöÄ Starting feature extraction from all TrackerRadar domains...",
    "This may take 5-10 minutes for ~51,000 domains...",
    sep="\n",
)

all_domain_features = extractor.extract_all_features()

print(f"\nüìä Feature Extraction Summary:")
print(f"   Total domains processed: {len(all_domain_features):,}")

# Preview the first three domains in the result
sample_domains = list(all_domain_features)[:3]
print(f"\nüîç Sample domain features:")
for domain in sample_domains:
    features = all_domain_features[domain]
    print(f"\n{domain}:")
    # Take the first 8 entries of the feature dict and skip the 'domain' key
    # itself, since it is already shown as the heading
    for key, value in list(features.items())[:8]:
        if key == 'domain':
            continue
        print(f"   {key}: {value}")

# Persist the full feature mapping as JSON for later use
features_file = Path("../models/domain_features_notebook.json")
with features_file.open('w') as f:
    json.dump(all_domain_features, f)

print(f"\n‚úÖ Features saved to: {features_file}")
print("‚úÖ Feature extraction completed successfully!")

# Step 6: Train Machine Learning Model

Creates training targets based on domain characteristics and trains a LightGBM model to predict domain tracking intensity. Includes model evaluation, feature importance analysis, and saves the trained model for production use.

In [None]:
# Step 6: Create training targets and train ML model
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import spearmanr
import pandas as pd

class ModelTrainer:
    """Train and persist the domain risk regressor.

    LightGBM is used when it can be imported; otherwise a scikit-learn
    RandomForestRegressor is trained instead.

    Args:
        random_state: Seed used for the target noise, the train/test split
            and the model. Pass None for non-deterministic behaviour.
    """

    def __init__(self, random_state: Optional[int] = 42):
        self.model = None
        self.scaler = StandardScaler()
        self.feature_names = None
        self.random_state = random_state

    @staticmethod
    def _detect_model_type(model) -> str:
        """Return 'lightgbm' for estimators defined in the lightgbm package, else 'random_forest'."""
        # Both estimator families expose feature_importances_, so the
        # defining package is the reliable discriminator.
        top_level_package = type(model).__module__.split('.')[0]
        return 'lightgbm' if top_level_package == 'lightgbm' else 'random_forest'

    def create_training_targets(self, features_dict: Dict) -> Dict[str, float]:
        """Derive a tracking-intensity target in [0, 1] for every domain.

        The target is the capped sum of a fingerprinting term and cookie,
        prevalence and resource boosts, plus Gaussian noise (std 0.05) drawn
        from a generator seeded with ``random_state``.

        Args:
            features_dict: Mapping of domain to a feature dict containing at
                least ``fingerprinting``, ``cookies_prevalence``,
                ``global_prevalence``, ``num_resources`` and
                ``avg_resource_fingerprinting``.

        Returns:
            Mapping of domain to its target value.
        """
        targets = {}
        # A fresh generator per call makes repeated calls with the same seed
        # produce identical targets.
        rng = np.random.default_rng(self.random_state)

        print("üéØ Creating training targets...")

        for domain, features in features_dict.items():
            # Fingerprinting is on a 0-3 scale; normalise it to 0-1
            base_intensity = min(features['fingerprinting'] / 3.0, 1.0)

            # Each boost has its own ceiling so no single signal dominates
            cookie_boost = min(features['cookies_prevalence'] * 0.3, 0.3)
            prevalence_boost = min(features['global_prevalence'] * 100, 0.2)

            resource_boost = 0
            if features['num_resources'] > 10:  # Compared against the scaled count
                resource_boost = 0.1
            if features['avg_resource_fingerprinting'] > 1:
                resource_boost += 0.1

            tracking_intensity = min(base_intensity + cookie_boost + prevalence_boost + resource_boost, 1.0)

            # Small noise for regularization, clipped back into [0, 1]
            noise = rng.normal(0, 0.05)
            tracking_intensity = max(0.0, min(1.0, tracking_intensity + noise))

            targets[domain] = tracking_intensity

        print(f"‚úÖ Created targets for {len(targets):,} domains")
        return targets

    def train_model(self, features_dict: Dict, targets_dict: Dict):
        """Fit the regressor on an 80/20 split and print evaluation metrics.

        Feature columns are every key of the first feature dict except
        ``domain``, in that dict's order. Domains without a target are
        skipped.

        Args:
            features_dict: Mapping of domain to feature dict.
            targets_dict: Mapping of domain to target value.

        Returns:
            True once training and evaluation have completed.

        Raises:
            ValueError: If there are no features, or no domain has a target.
        """
        print("üèãÔ∏è Training LightGBM model...")

        if not features_dict:
            raise ValueError("features_dict is empty; there is nothing to train on")

        first_features = next(iter(features_dict.values()))
        feature_names = [k for k in first_features if k != 'domain']
        self.feature_names = feature_names

        X = []
        y = []
        for domain, features in features_dict.items():
            if domain in targets_dict:
                X.append([features[name] for name in feature_names])
                y.append(targets_dict[domain])

        if not X:
            raise ValueError("No domain in features_dict has a matching target")

        X = np.array(X)
        y = np.array(y)

        print(f"Training data shape: {X.shape}")
        print(f"Target range: {y.min():.3f} to {y.max():.3f}")

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=self.random_state
        )

        # Fit the scaler on the training split only, then apply it to both
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        # Only the import decides which estimator is used; errors raised
        # while fitting are not mistaken for a missing dependency.
        try:
            import lightgbm as lgb
        except ImportError:
            print("‚ùå LightGBM not available, using RandomForest as fallback...")
            from sklearn.ensemble import RandomForestRegressor

            using_lightgbm = False
            self.model = RandomForestRegressor(n_estimators=100, random_state=self.random_state)
        else:
            using_lightgbm = True
            self.model = lgb.LGBMRegressor(
                n_estimators=100,
                max_depth=6,
                learning_rate=0.1,
                random_state=self.random_state,
                verbose=-1
            )

        self.model.fit(X_train_scaled, y_train)

        y_pred = self.model.predict(X_test_scaled)
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        correlation, _ = spearmanr(y_test, y_pred)

        if using_lightgbm:
            print(f"\nüìä Model Performance:")
        else:
            print(f"\nüìä Model Performance (RandomForest):")
        print(f"   MAE: {mae:.4f}")
        print(f"   RMSE: {rmse:.4f}")
        print(f"   Spearman correlation: {correlation:.4f}")

        if using_lightgbm:
            # The importance values are printed as whole numbers
            feature_importance = sorted(
                zip(feature_names, self.model.feature_importances_),
                key=lambda pair: pair[1],
                reverse=True,
            )

            print(f"\nüîù Top 10 Feature Importances:")
            for i, (feature, importance) in enumerate(feature_importance[:10]):
                print(f"   {i+1:2d}. {feature}: {importance:.0f}")

        return True

    def save_model(self, save_dir: str):
        """Write the model, scaler and feature names to one joblib file.

        The file is a pickle-based artifact; load it only from locations
        you trust.

        Args:
            save_dir: Directory to write into; created if missing.

        Returns:
            Path of the written file as a string.

        Raises:
            RuntimeError: If no model has been trained yet.
        """
        if self.model is None:
            raise RuntimeError("No trained model to save; call train_model() first")

        save_path = Path(save_dir)
        save_path.mkdir(parents=True, exist_ok=True)

        model_artifacts = {
            'model': self.model,
            'scaler': self.scaler,
            'feature_names': self.feature_names,
            'model_type': self._detect_model_type(self.model)
        }

        model_file = save_path / "domain_risk_model_notebook.pkl"
        joblib.dump(model_artifacts, model_file)

        print(f"‚úÖ Model saved to: {model_file}")
        return str(model_file)

# Build targets from the extracted features, fit the model, then persist it
trainer = ModelTrainer()
targets = trainer.create_training_targets(all_domain_features)
success = trainer.train_model(all_domain_features, targets)

if not success:
    print("‚ùå Model training failed")
else:
    # Only write the artifact when training reported success
    model_path = trainer.save_model("../models")
    print("‚úÖ Model training completed successfully!")