"""
GlobalShip Logistics Workshop Dataset Generator
Creates realistic datasets for the Databricks Data Engineering Capstone Project

This script generates:
- Streaming JSON datasets (package scans, vehicle telemetry, facility events)
- Batch Parquet datasets (customers, facilities, routes, weather)
- Slowly changing dimension tables
- Reference/lookup tables

Usage:
    # Local development
    python3 data_generator.py
    
    # Databricks Unity Catalog Volume
    python3 data_generator.py "/Volumes/catalog/schema/volume_name/workshop_data"
    
    # Or call from code
    from data_generator import main
    main("/Volumes/my_catalog/my_schema/my_volume/globalship_data")
"""

# In [0]:

# pip install faker  (json is part of the standard library and is not installed with pip)

# In [0]:
# !pip install faker

# In [0]:
import json
import pandas as pd
import numpy as np
from datetime import datetime, timedelta, timezone
import random
import uuid
import faker
from pathlib import Path
import pyarrow as pa
import pyarrow.parquet as pq
import time
import threading

# Initialize Faker for realistic data generation.
# This single module-level instance is shared by the generator code below
# (names, addresses, companies, dates, free-text sentences).
fake = faker.Faker()
# Register the company and address provider modules on the shared instance.
# NOTE(review): a default Faker() presumably already exposes these providers,
# which would make the two calls redundant — confirm before removing.
fake.add_provider(faker.providers.company)
fake.add_provider(faker.providers.address)

# Manufacturer names sampled for road vehicles (trucks and vans)
VEHICLE_MAKES = [
    "Ford",
    "Chevrolet",
    "Mercedes-Benz",
    "Volvo",
    "Freightliner",
    "Peterbilt",
    "Kenworth",
    "Mack",
    "International",
    "Isuzu",
]
# Model names sampled for road vehicles; drawn independently of the make
VEHICLE_MODELS = [
    "Transit",
    "Express",
    "Sprinter",
    "F-150",
    "Silverado",
    "VNL",
    "Cascadia",
    "T680",
    "Anthem",
    "NPR",
]

def generate_vin():
    """Build a random 17-character VIN-style identifier.

    Layout: three letters, then five digits, then nine characters taken from
    letters and digits combined. The alphabet used leaves out I, O and Q.
    """
    alphabet = "ABCDEFGHJKLMNPRSTUVWXYZ"
    digits = "0123456789"
    alphanumeric = alphabet + digits

    # The three segments are drawn left to right, one random.choice per character.
    prefix = ''.join(random.choice(alphabet) for _ in range(3))
    serial = ''.join(random.choice(digits) for _ in range(5))
    tail = ''.join(random.choice(alphanumeric) for _ in range(9))
    return prefix + serial + tail

def generate_license_plate():
    """Build a random plate of the form ``AAA-0000`` (three letters, dash, four digits)."""
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    digits = "0123456789"

    # Letters are drawn before digits, one random.choice per character.
    letter_part = ''.join(random.choice(alphabet) for _ in range(3))
    digit_part = ''.join(random.choice(digits) for _ in range(4))
    return f"{letter_part}-{digit_part}"

class LogisticsDataGenerator:
    """Generate realistic logistics datasets for workshop"""
    
    def __init__(self, output_dir="workshop_datasets"):
        """Prepare the output directory tree, seed the RNGs and build the base tables.

        Construction is not cheap: it generates 5,000 facilities, 100,000
        customers, 180,000 vehicles and 50,000 routes in memory.

        Args:
            output_dir: Root directory for all generated files. It is created
                together with any missing parent directories, plus the
                ``streaming``, ``batch`` and ``reference`` subdirectories.
        """
        self.output_dir = Path(output_dir)
        # parents=True so a nested target (for example a folder inside a
        # volume path) works even when intermediate directories do not exist.
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Create subdirectories
        for subdir in ("streaming", "batch", "reference"):
            (self.output_dir / subdir).mkdir(exist_ok=True)

        # Seed data for consistency
        random.seed(42)
        np.random.seed(42)
        fake.seed_instance(42)

        # Business logic constants
        self.SERVICE_TYPES = ["STANDARD", "EXPRESS", "SAME_DAY", "INTERNATIONAL"]
        self.PACKAGE_STATUSES = ["PICKED_UP", "IN_TRANSIT", "AT_FACILITY", "OUT_FOR_DELIVERY", "DELIVERED", "EXCEPTION"]
        self.FACILITY_TYPES = ["HUB", "STATION", "DEPOT", "SORT_CENTER", "DELIVERY_CENTER"]
        self.VEHICLE_TYPES = ["TRUCK", "VAN", "AIRCRAFT", "TRAIN"]
        self.REGIONS = ["AMERICAS", "EMEA", "APAC"]

        # Generate base reference data first. Facilities must come before
        # vehicles and routes, which both read self.facilities_df.
        self.facilities_df = self._generate_facilities(5000)
        self.customers_df = self._generate_customers(100000)
        self.vehicles_df = self._generate_vehicles(180000)
        self.routes_df = self._generate_routes(50000)

        print(f"Initialized LogisticsDataGenerator with output directory: {self.output_dir}")

    def _generate_tracking_number(self):
        """Generate realistic tracking numbers"""
        return f"GS{random.randint(100000000000, 999999999999)}"
    
    def _generate_coordinates(self, region="AMERICAS"):
        """Generate realistic coordinates based on region"""
        if region == "AMERICAS":
            lat = random.uniform(25.0, 49.0)  # US latitude range
            lon = random.uniform(-125.0, -66.0)  # US longitude range
        elif region == "EMEA":
            lat = random.uniform(35.0, 70.0)  # Europe latitude range
            lon = random.uniform(-10.0, 50.0)  # Europe longitude range
        else:  # APAC
            lat = random.uniform(-45.0, 45.0)  # APAC latitude range
            lon = random.uniform(95.0, 180.0)  # APAC longitude range
        
        return round(lat, 6), round(lon, 6)

    # =================
    # REFERENCE DATA (Slowly Changing)
    # =================
    
    def _generate_facilities(self, count=5000):
        """Build the facilities master table (a dataset described as updating weekly).

        Args:
            count: Number of facility rows to create.

        Returns:
            pandas.DataFrame with one row per facility.
        """
        timezone_pool = ["UTC-8", "UTC-5", "UTC", "UTC+1", "UTC+8"]
        opening_times = ["06:00", "07:00", "08:00"]
        closing_times = ["18:00", "20:00", "22:00"]
        active_pool = [True] * 95 + [False] * 5  # 95% active
        customs_regions = ("AMERICAS", "EMEA")

        rows = []
        for idx in range(count):
            region = random.choice(self.REGIONS)
            latitude, longitude = self._generate_coordinates(region)

            # Values are drawn in key order so the seeded random sequences are
            # consumed in a fixed order.
            # NOTE(review): the type word inside facility_name is drawn
            # separately from facility_type, so the two can disagree.
            row = {
                "facility_id": f"FAC{idx:06d}",
                "facility_code": "GS" + chr(65 + idx % 26) + f"{idx % 1000:03d}",
                "facility_name": " ".join([fake.city(), random.choice(self.FACILITY_TYPES).title()]),
                "facility_type": random.choice(self.FACILITY_TYPES),
                "region": region,
                "country": fake.country(),
                "state_province": fake.state(),
                "city": fake.city(),
                "postal_code": fake.postcode(),
                "street_address": fake.street_address(),
                "latitude": latitude,
                "longitude": longitude,
                "timezone": random.choice(timezone_pool),
                "capacity_packages_per_hour": random.randint(1000, 50000),
                "operating_hours_start": random.choice(opening_times),
                "operating_hours_end": random.choice(closing_times),
                "is_active": random.choice(active_pool),
                "opened_date": fake.date_between(start_date="-10y", end_date="today"),
                "last_updated": datetime.now(timezone.utc).isoformat(),
                "manager_email": fake.email(),
                "phone": fake.phone_number(),
            }

            # Only AMERICAS and EMEA facilities get a coin flip for customs
            # handling; every other region is always False.
            if region in customs_regions:
                row["customs_facility"] = random.choice([True, False])
            else:
                row["customs_facility"] = False

            rows.append(row)

        return pd.DataFrame(rows)
    
    def _generate_customers(self, count=100000):
        """Build the customer master table (a dataset described as updating daily).

        Args:
            count: Number of customer rows to create.

        Returns:
            pandas.DataFrame with one row per customer.
        """
        segment_names = ["ENTERPRISE", "SMB", "CONSUMER"]
        segment_weights = [20, 30, 50]
        payment_term_pool = ["NET_30", "NET_15", "PREPAID", "COD"]
        active_pool = [True] * 90 + [False] * 10

        rows = []
        for idx in range(count):
            region = random.choice(self.REGIONS)
            latitude, longitude = self._generate_coordinates(region)

            # Customer segments affect shipping behavior
            segment = random.choices(segment_names, weights=segment_weights)[0]
            # Enterprise accounts may ship up to 500 packages a month, others up to 50.
            shipment_ceiling = 500 if segment == "ENTERPRISE" else 50

            # Values are drawn in key order so the seeded random sequences are
            # consumed in a fixed order.
            rows.append({
                "customer_id": f"CUST{idx:08d}",
                "customer_type": segment,
                # Consumers have no company name.
                "company_name": None if segment == "CONSUMER" else fake.company(),
                "first_name": fake.first_name(),
                "last_name": fake.last_name(),
                "email": fake.email(),
                "phone": fake.phone_number(),
                "region": region,
                "country": fake.country(),
                "state_province": fake.state(),
                "city": fake.city(),
                "postal_code": fake.postcode(),
                "street_address": fake.street_address(),
                "latitude": latitude,
                "longitude": longitude,
                "account_created_date": fake.date_between(start_date="-5y", end_date="today"),
                "last_order_date": fake.date_between(start_date="-1y", end_date="today"),
                "total_lifetime_value": round(random.uniform(100, 50000), 2),
                "avg_monthly_shipments": random.randint(1, shipment_ceiling),
                "preferred_service": random.choice(self.SERVICE_TYPES),
                "credit_limit": round(random.uniform(1000, 100000), 2),
                "payment_terms": random.choice(payment_term_pool),
                "is_active": random.choice(active_pool),
                "last_updated": datetime.now(timezone.utc).isoformat(),
            })

        return pd.DataFrame(rows)
    
    def _generate_vehicles(self, count=180000):
        """Generate vehicle fleet data - updates when maintenance occurs.

        Reads ``self.facilities_df`` (its ``facility_code`` column) to assign
        each vehicle a home facility, so facilities must be generated first.

        Args:
            count: Number of vehicle rows to create.

        Returns:
            pandas.DataFrame with one row per vehicle.
        """
        vehicles = []

        # Materialize the code list once. Rebuilding it for every vehicle made
        # the cost grow with count x number of facilities.
        facility_codes = self.facilities_df["facility_code"].tolist()
        road_types = ("TRUCK", "VAN")

        for i in range(count):
            vehicle_type = random.choice(self.VEHICLE_TYPES)
            is_road_vehicle = vehicle_type in road_types

            # NOTE(review): for non-road vehicles the make pool mixes aircraft
            # and rail names regardless of vehicle_type, and the identifier
            # always carries an "AIR" prefix.
            vehicle = {
                "vehicle_id": f"VEH{str(i).zfill(8)}",
                "vehicle_number": f"GS{str(i).zfill(6)}",
                "vehicle_type": vehicle_type,
                "make": random.choice(VEHICLE_MAKES) if is_road_vehicle else random.choice(["Boeing", "Airbus", "Freight Train"]),
                "model": random.choice(VEHICLE_MODELS) if is_road_vehicle else f"Model-{random.randint(100, 999)}",
                "year": random.randint(2015, 2024),
                "vin_number": generate_vin() if is_road_vehicle else f"AIR{random.randint(100000, 999999)}",
                "license_plate": generate_license_plate() if is_road_vehicle else None,
                "region": random.choice(self.REGIONS),
                "home_facility_code": random.choice(facility_codes),
                "capacity_weight_kg": random.randint(500, 15000) if vehicle_type == "VAN" else random.randint(5000, 80000),
                "capacity_volume_m3": random.randint(10, 100),
                "fuel_type": random.choice(["DIESEL", "GASOLINE", "ELECTRIC", "HYBRID"]),
                "current_mileage_km": random.randint(10000, 500000),
                # Roughly the last six months, spelled in days: in Faker's
                # relative-date syntax a lowercase "m" appears to mean minutes
                # (months is "M"), so "-6m" did not reach back six months.
                "last_maintenance_date": fake.date_between(start_date="-180d", end_date="today"),
                "last_maintenance_mileage": random.randint(8000, 480000),
                # NOTE(review): this range starts above the maximum
                # current_mileage_km (500000), so mileage never exceeds it.
                "next_maintenance_due_km": random.randint(520000, 600000),
                "gps_device_id": f"GPS{str(i).zfill(8)}",
                "telematics_device_id": f"TEL{str(i).zfill(8)}",
                "is_active": random.choice([True] * 85 + [False] * 15),  # 85% active (rest in maintenance)
                "driver_id": f"DRV{random.randint(1, 200000)}",
                "insurance_policy_number": f"INS{random.randint(1000000, 9999999)}",
                "registration_expires": fake.date_between(start_date="today", end_date="+2y"),
                "last_updated": datetime.now(timezone.utc).isoformat()
            }
            vehicles.append(vehicle)

        return pd.DataFrame(vehicles)
    
    def _generate_routes(self, count=50000):
        """Generate route master data - updates based on optimization"""
        routes = []
        
        facilities = self.facilities_df.to_dict('records')
        
        for i in range(count):
            origin = random.choice(facilities)
            destination = random.choice(facilities)
            
            # Ensure origin != destination
            while destination["facility_id"] == origin["facility_id"]:
                destination = random.choice(facilities)
            
            # Calculate approximate distance (simplified)
            distance_km = random.randint(50, 2000)
            
            route = {
                "route_id": f"RTE{str(i).zfill(8)}",
                "origin_facility_id": origin["facility_id"],
                "origin_facility_code": origin["facility_code"],
                "destination_facility_id": destination["facility_id"], 
                "destination_facility_code": destination["facility_code"],
                "distance_km": distance_km,
                "estimated_duration_hours": round(distance_km / random.randint(60, 120), 2),
                "route_type": random.choice(["GROUND", "AIR", "RAIL", "INTERMODAL"]),
                "is_active": random.choice([True] * 90 + [False] * 10),
                "average_cost_per_km": round(random.uniform(0.5, 2.5), 2),
                "carbon_footprint_kg_co2_per_km": round(random.uniform(0.1, 1.2), 3),
                "last_optimization_date": fake.date_between(start_date="-30d", end_date="today"),
                "created_date": fake.date_between(start_date="-2y", end_date="today"),
                "last_updated": datetime.now(timezone.utc).isoformat()
            }
            routes.append(route)
        
        return pd.DataFrame(routes)

    # =================
    # STREAMING DATA GENERATORS
    # =================
    
    def generate_package_scans_stream(self, duration_minutes=60, events_per_second=100):
        """Generate streaming package scan events"""
        print(f"Generating package scan stream for {duration_minutes} minutes at {events_per_second} events/second...")
        
        output_file = self.output_dir / "streaming" / "package_scans.jsonl"
        facilities = self.facilities_df.to_dict('records')
        
        start_time = datetime.now(timezone.utc)
        end_time = start_time + timedelta(minutes=duration_minutes)
        
        # Track packages for realistic journey progression
        active_packages = {}
        
        with open(output_file, 'w') as f:
            event_counter = 0
            
            while datetime.now(timezone.utc) < end_time:
                timestamp = datetime.now(timezone.utc)
                
                for _ in range(events_per_second):
                    # Decide if this is a new package or existing package progression
                    if random.random() < 0.1 or not active_packages:  # 10% new packages
                        tracking_number = self._generate_tracking_number()
                        status = "PICKED_UP"
                        facility = random.choice(facilities)
                        
                        # Initialize package journey
                        active_packages[tracking_number] = {
                            "current_status": status,
                            "current_facility": facility,
                            "service_type": random.choice(self.SERVICE_TYPES),
                            "journey_stage": 0
                        }
                    else:
                        # Progress existing package
                        tracking_number = random.choice(list(active_packages.keys()))
                        package_info = active_packages[tracking_number]
                        
                        # Progress through journey stages
                        journey_stages = ["PICKED_UP", "IN_TRANSIT", "AT_FACILITY", "OUT_FOR_DELIVERY", "DELIVERED"]
                        current_stage = package_info["journey_stage"]
                        
                        if current_stage < len(journey_stages) - 1:
                            if random.random() < 0.3:  # 30% chance to progress
                                package_info["journey_stage"] += 1
                                status = journey_stages[package_info["journey_stage"]]
                                package_info["current_status"] = status
                                
                                # Change facility for some status changes
                                if status in ["IN_TRANSIT", "AT_FACILITY"]:
                                    package_info["current_facility"] = random.choice(facilities)
                            else:
                                status = package_info["current_status"]
                        else:
                            status = "DELIVERED"
                            # Remove delivered packages occasionally
                            if random.random() < 0.1:
                                del active_packages[tracking_number]
                                continue
                        
                        facility = package_info["current_facility"]
                    
                    # Add some exceptions
                    if random.random() < 0.05:  # 5% exception rate
                        status = "EXCEPTION"
                        exception_type = random.choice(["DAMAGED", "LOST", "DELAYED", "ADDRESS_ISSUE", "WEATHER"])
                    else:
                        exception_type = None
                    
                    # Generate scan event
                    scan_event = {
                        "event_id": str(uuid.uuid4()),
                        "tracking_number": tracking_number,
                        "scan_event": status,
                        "facility_id": facility["facility_id"],
                        "facility_code": facility["facility_code"],
                        "facility_name": facility["facility_name"],
                        "latitude": facility["latitude"],
                        "longitude": facility["longitude"],
                        "scan_timestamp": timestamp.isoformat(),
                        "scanner_id": f"SCN{random.randint(1000, 9999)}",
                        "employee_id": f"EMP{random.randint(10000, 99999)}",
                        "vehicle_id": f"VEH{random.randint(10000000, 99999999)}" if status in ["IN_TRANSIT", "OUT_FOR_DELIVERY"] else None,
                        "service_type": active_packages.get(tracking_number, {}).get("service_type", random.choice(self.SERVICE_TYPES)),
                        "package_weight_kg": round(random.uniform(0.1, 50.0), 2),
                        "package_dimensions": {
                            "length_cm": random.randint(10, 100),
                            "width_cm": random.randint(10, 100), 
                            "height_cm": random.randint(5, 50)
                        },
                        "exception_type": exception_type,
                        "exception_description": fake.sentence() if exception_type else None,
                        "temperature_celsius": round(random.uniform(-20, 40), 1),
                        "humidity_percent": random.randint(30, 90),
                        "handling_instructions": random.choice([None, "FRAGILE", "PERISHABLE", "HAZMAT", "SIGNATURE_REQUIRED"]),
                        "customs_cleared": random.choice([True, False, None]),
                        "event_source": "HANDHELD_SCANNER",
                        "data_quality_score": round(random.uniform(0.85, 1.0), 3)
                    }
                    
                    f.write(json.dumps(scan_event) + '\n')
                    event_counter += 1
                
                # Sleep to maintain event rate
                time.sleep(1)
                
                if event_counter % 10000 == 0:
                    print(f"Generated {event_counter:,} package scan events...")
        
        print(f"Package scan stream generation complete. Generated {event_counter:,} events.")
        return output_file
    
    def generate_vehicle_telemetry_stream(self, duration_minutes=60, events_per_second=50):
        """Generate streaming vehicle telemetry data"""
        print(f"Generating vehicle telemetry stream for {duration_minutes} minutes at {events_per_second} events/second...")
        
        output_file = self.output_dir / "streaming" / "vehicle_telemetry.jsonl"
        vehicles = self.vehicles_df.to_dict('records')
        
        start_time = datetime.now(timezone.utc)
        end_time = start_time + timedelta(minutes=duration_minutes)
        
        # Track vehicle states for realistic progression
        vehicle_states = {}
        for vehicle in random.sample(vehicles, min(1000, len(vehicles))):  # Track subset for demo
            lat, lon = self._generate_coordinates(vehicle["region"])
            vehicle_states[vehicle["vehicle_id"]] = {
                "latitude": lat,
                "longitude": lon,
                "speed_kmh": 0,
                "heading": random.randint(0, 359),
                "is_moving": False
            }
        
        with open(output_file, 'w') as f:
            event_counter = 0
            
            while datetime.now(timezone.utc) < end_time:
                timestamp = datetime.now(timezone.utc)
                
                for _ in range(events_per_second):
                    vehicle_id = random.choice(list(vehicle_states.keys()))
                    vehicle = next(v for v in vehicles if v["vehicle_id"] == vehicle_id)
                    state = vehicle_states[vehicle_id]
                    
                    # Simulate realistic movement
                    if random.random() < 0.1:  # 10% chance to change movement state
                        state["is_moving"] = not state["is_moving"]
                    
                    if state["is_moving"]:
                        # Update position (simplified movement)
                        state["latitude"] += random.uniform(-0.001, 0.001)
                        state["longitude"] += random.uniform(-0.001, 0.001)
                        state["speed_kmh"] = random.randint(30, 90)
                        state["heading"] += random.randint(-10, 10) % 360
                    else:
                        state["speed_kmh"] = 0
                    
                    # Generate telemetry event
                    telemetry_event = {
                        "event_id": str(uuid.uuid4()),
                        "vehicle_id": vehicle_id,
                        "vehicle_number": vehicle["vehicle_number"],
                        "timestamp": timestamp.isoformat(),
                        "location": {
                            "latitude": round(state["latitude"], 6),
                            "longitude": round(state["longitude"], 6),
                            "altitude_m": random.randint(0, 2000),
                            "accuracy_m": random.randint(1, 10)
                        },
                        "motion": {
                            "speed_kmh": state["speed_kmh"],
                            "heading_degrees": state["heading"],
                            "acceleration_ms2": round(random.uniform(-2, 2), 2),
                            "is_moving": state["is_moving"]
                        },
                        "engine": {
                            "rpm": random.randint(800, 4000) if state["is_moving"] else random.randint(600, 1000),
                            "engine_load_percent": random.randint(10, 90) if state["is_moving"] else random.randint(5, 20),
                            "coolant_temp_celsius": random.randint(80, 105),
                            "oil_pressure_kpa": random.randint(200, 400),
                            "fuel_level_percent": random.randint(10, 100)
                        },
                        "environmental": {
                            "external_temp_celsius": round(random.uniform(-20, 40), 1),
                            "cargo_temp_celsius": round(random.uniform(-10, 35), 1) if vehicle["vehicle_type"] in ["TRUCK", "VAN"] else None,
                            "humidity_percent": random.randint(30, 90)
                        },
                        "vehicle_health": {
                            "odometer_km": vehicle["current_mileage_km"] + random.randint(0, 100),
                            "battery_voltage": round(random.uniform(12.0, 14.5), 1),
                            "tire_pressure_kpa": [random.randint(200, 250) for _ in range(4)],
                            "brake_wear_percent": random.randint(20, 100),
                            "engine_hours": random.randint(5000, 20000)
                        },
                        "driver": {
                            "driver_id": vehicle["driver_id"],
                            "driving_score": random.randint(70, 100),
                            "harsh_braking_events": random.randint(0, 3),
                            "harsh_acceleration_events": random.randint(0, 2),
                            "speeding_events": random.randint(0, 1),
                            "hours_driven_today": round(random.uniform(0, 10), 1)
                        },
                        "alerts": {
                            "maintenance_due": vehicle["current_mileage_km"] > vehicle["next_maintenance_due_km"],
                            "low_fuel": random.random() < 0.05,  # 5% chance
                            "engine_warning": random.random() < 0.02,  # 2% chance
                            "speeding": state["speed_kmh"] > 80 and random.random() < 0.1
                        },
                        "data_source": "TELEMATICS_DEVICE",
                        "device_id": vehicle["telematics_device_id"],
                        "signal_strength": random.randint(1, 5),
                        "data_quality_score": round(random.uniform(0.90, 1.0), 3)
                    }
                    
                    f.write(json.dumps(telemetry_event) + '\n')
                    event_counter += 1
                
                # Sleep to maintain event rate
                time.sleep(1)
                
                if event_counter % 5000 == 0:
                    print(f"Generated {event_counter:,} vehicle telemetry events...")
        
        print(f"Vehicle telemetry stream generation complete. Generated {event_counter:,} events.")
        return output_file
    
    def generate_facility_events_stream(self, duration_minutes=60, events_per_second=30):
        """Generate streaming facility operational events"""
        print(f"Generating facility events stream for {duration_minutes} minutes at {events_per_second} events/second...")
        
        output_file = self.output_dir / "streaming" / "facility_events.jsonl"
        facilities = self.facilities_df.to_dict('records')
        
        start_time = datetime.now(timezone.utc)
        end_time = start_time + timedelta(minutes=duration_minutes)
        
        with open(output_file, 'w') as f:
            event_counter = 0
            
            while datetime.now(timezone.utc) < end_time:
                timestamp = datetime.now(timezone.utc)
                
                for _ in range(events_per_second):
                    facility = random.choice(facilities)
                    
                    event_types = [
                        "CONVEYOR_STATUS", "SECURITY_ALERT", "TEMPERATURE_READING", 
                        "CAPACITY_UPDATE", "EQUIPMENT_STATUS", "POWER_CONSUMPTION",
                        "PERSONNEL_COUNT", "VEHICLE_DOCK", "MAINTENANCE_EVENT"
                    ]
                    
                    event_type = random.choice(event_types)
                    
                    # Generate event-specific data
                    event_data = {
                        "event_id": str(uuid.uuid4()),
                        "facility_id": facility["facility_id"],
                        "facility_code": facility["facility_code"],
                        "facility_type": facility["facility_type"],
                        "timestamp": timestamp.isoformat(),
                        "event_type": event_type,
                        "region": facility["region"]
                    }
                    
                    if event_type == "CONVEYOR_STATUS":
                        event_data.update({
                            "conveyor_id": f"CNV{random.randint(100, 999)}",
                            "status": random.choice(["RUNNING", "STOPPED", "MAINTENANCE", "ERROR"]),
                            "speed_mpm": random.randint(10, 100),
                            "packages_per_minute": random.randint(50, 500),
                            "jam_detected": random.random() < 0.05
                        })
                    
                    elif event_type == "TEMPERATURE_READING":
                        event_data.update({
                            "sensor_id": f"TEMP{random.randint(1000, 9999)}",
                            "zone": random.choice(["WAREHOUSE", "OFFICE", "LOADING_DOCK", "COLD_STORAGE"]),
                            "temperature_celsius": round(random.uniform(-10, 35), 1),
                            "humidity_percent": random.randint(30, 90),
                            "alert_triggered": random.random() < 0.1
                        })
                    
                    elif event_type == "CAPACITY_UPDATE":
                        event_data.update({
                            "current_capacity_percent": random.randint(30, 95),
                            "packages_in_facility": random.randint(1000, 50000),
                            "inbound_packages_today": random.randint(5000, 100000),
                            "outbound_packages_today": random.randint(4000, 95000),
                            "peak_capacity_reached": random.random() < 0.15
                        })
                    
                    elif event_type == "SECURITY_ALERT":
                        event_data.update({
                            "alert_level": random.choice(["LOW", "MEDIUM", "HIGH", "CRITICAL"]),
                            "alert_type": random.choice(["UNAUTHORIZED_ACCESS", "PACKAGE_TAMPERING", "PERIMETER_BREACH", "SYSTEM_INTRUSION"]),
                            "camera_id": f"CAM{random.randint(100, 999)}",
                            "response_required": random.random() < 0.3,
                            "description": fake.sentence()
                        })
                    
                    elif event_type == "VEHICLE_DOCK":
                        event_data.update({
                            "dock_id": f"DOCK{random.randint(10, 99)}",
                            "vehicle_id": f"VEH{random.randint(10000000, 99999999)}",
                            "action": random.choice(["ARRIVED", "DEPARTED", "LOADING", "UNLOADING"]),
                            "packages_loaded": random.randint(0, 500) if random.choice([True, False]) else 0,
                            "packages_unloaded": random.randint(0, 800) if random.choice([True, False]) else 0,
                            "dock_utilization_percent": random.randint(40, 100)
                        })
                    
                    f.write(json.dumps(event_data) + '\n')
                    event_counter += 1
                
                # Sleep to maintain event rate
                time.sleep(1)
                
                if event_counter % 2000 == 0:
                    print(f"Generated {event_counter:,} facility events...")
        
        print(f"Facility events stream generation complete. Generated {event_counter:,} events.")
        return output_file

    # =================
    # BATCH DATA GENERATORS
    # =================
    
    def generate_historical_packages(self, count=1000000):
        """Generate historical package data for training ML models"""
        print(f"Generating {count:,} historical packages...")
        
        packages = []
        facilities = self.facilities_df.to_dict('records')
        customers = self.customers_df.to_dict('records')
        
        for i in range(count):
            origin = random.choice(facilities)
            destination = random.choice(facilities)
            customer = random.choice(customers)
            
            # Ensure different origin/destination
            while destination["facility_id"] == origin["facility_id"]:
                destination = random.choice(facilities)
            
            ship_date = fake.date_between(start_date="-2y", end_date="-1d")
            service_type = random.choice(self.SERVICE_TYPES)
            
            # Calculate delivery based on service type and distance
            if service_type == "SAME_DAY":
                delivery_days = 1
            elif service_type == "EXPRESS":
                delivery_days = random.randint(1, 2)
            elif service_type == "STANDARD":
                delivery_days = random.randint(2, 5)
            else:  # INTERNATIONAL
                delivery_days = random.randint(5, 14)
            
            # Add realistic delays
            if random.random() < 0.15:  # 15% delay rate
                delivery_days += random.randint(1, 3)
            
            actual_delivery_date = ship_date + timedelta(days=delivery_days)
            
            package = {
                "package_id": f"PKG{str(i).zfill(10)}",
                "tracking_number": self._generate_tracking_number(),
                "customer_id": customer["customer_id"],
                "service_type": service_type,
                "ship_date": ship_date.isoformat(),
                "promised_delivery_date": (ship_date + timedelta(days=delivery_days - (1 if random.random() < 0.15 else 0))).isoformat(),
                "actual_delivery_date": actual_delivery_date.isoformat(),
                "origin_facility_id": origin["facility_id"],
                "origin_facility_code": origin["facility_code"],
                "destination_facility_id": destination["facility_id"],
                "destination_facility_code": destination["facility_code"],
                "origin_address": {
                    "street": fake.street_address(),
                    "city": origin["city"],
                    "state": origin["state_province"],
                    "postal_code": fake.postcode(),
                    "country": origin["country"],
                    "latitude": origin["latitude"],
                    "longitude": origin["longitude"]
                },
                "destination_address": {
                    "street": fake.street_address(),
                    "city": destination["city"],
                    "state": destination["state_province"],
                    "postal_code": fake.postcode(),
                    "country": destination["country"],
                    "latitude": destination["latitude"],
                    "longitude": destination["longitude"]
                },
                "package_details": {
                    "weight_kg": round(random.uniform(0.1, 50.0), 2),
                    "dimensions_cm": {
                        "length": random.randint(10, 100),
                        "width": random.randint(10, 100),
                        "height": random.randint(5, 50)
                    },
                    "declared_value": round(random.uniform(10, 5000), 2),
                    "insurance_value": round(random.uniform(0, 1000), 2),
                    "contents_description": fake.catch_phrase(),
                    "special_handling": random.choice([None, "FRAGILE", "PERISHABLE", "HAZMAT", "HIGH_VALUE"])
                },
                "shipping_cost": round(random.uniform(5.99, 299.99), 2),
                "fuel_surcharge": round(random.uniform(0.50, 15.00), 2),
                "taxes_fees": round(random.uniform(0, 25.00), 2),
                "total_revenue": 0,  # Will calculate
                "delivery_status": random.choices(
                    ["DELIVERED", "DELIVERED_LATE", "DAMAGED", "LOST", "RETURNED"], 
                    weights=[75, 15, 5, 2, 3]
                )[0],
                "delivery_attempts": random.randint(1, 4),
                "signature_required": random.choice([True, False]),
                "signature_obtained": random.choice([True, False]),
                "weather_delay": random.choice([True, False]) if random.random() < 0.1 else False,
                "customs_delay": random.choice([True, False]) if service_type == "INTERNATIONAL" and random.random() < 0.2 else False,
                "distance_km": random.randint(50, 5000),
                "carbon_footprint_kg": round(random.uniform(0.5, 25.0), 3),
                "created_timestamp": ship_date.isoformat(),
                "last_updated": datetime.now(timezone.utc).isoformat()
            }
            
            # Calculate total revenue
            package["total_revenue"] = (
                package["shipping_cost"] + 
                package["fuel_surcharge"] + 
                package["taxes_fees"]
            )
            
            packages.append(package)
            
            if i % 50000 == 0 and i > 0:
                print(f"Generated {i:,} historical packages...")
        
        df = pd.DataFrame(packages)
        output_file = self.output_dir / "batch" / "historical_packages.parquet"
        df.to_parquet(output_file, index=False)
        print(f"Historical packages saved to {output_file}")
        return output_file
    
    def generate_weather_data(self, days=365):
        """Build one daily weather record per facility and write them to Parquet.

        For every facility in ``self.facilities_df`` one record is produced for
        each of the ``days`` days before today. The average temperature is a
        region-dependent baseline plus a sinusoidal term and random noise; the
        remaining readings are independent random draws.

        Args:
            days: Number of past days to cover for each facility.

        Returns:
            Path of the written ``batch/weather_data.parquet`` file.
        """
        print(f"Generating weather data for {days} days...")

        # region -> (baseline temperature, seasonal amplitude, noise half-width).
        # Any region that is not listed uses the APAC profile.
        region_profiles = {
            "AMERICAS": (15, 10, 5),
            "EMEA": (10, 8, 4),
        }
        apac_profile = (25, 5, 3)

        condition_names = ["CLEAR", "PARTLY_CLOUDY", "CLOUDY", "RAIN", "SNOW", "FOG", "STORM"]
        condition_weights = [30, 25, 20, 15, 5, 3, 2]
        road_states = ["GOOD", "FAIR", "POOR", "HAZARDOUS"]

        rows = []
        for site in self.facilities_df.to_dict('records'):
            for day_offset in range(days):
                # Oldest day first: offset 0 is `days` days ago.
                day = datetime.now().date() - timedelta(days=days - day_offset)

                # Sinusoidal seasonal variation over a 365-day period
                seasonal = np.sin(2 * np.pi * day_offset / 365)
                baseline, amplitude, noise = region_profiles.get(site["region"], apac_profile)
                avg_temp = baseline + amplitude * seasonal + random.uniform(-noise, noise)

                temperature = {
                    "min": round(avg_temp - random.uniform(2, 8), 1),
                    "max": round(avg_temp + random.uniform(2, 8), 1),
                    "avg": round(avg_temp, 1)
                }
                humidity = random.randint(30, 90)

                # Roughly 30% of days get an exponentially distributed rainfall amount
                if random.random() < 0.3:
                    rain_mm = np.random.exponential(2)
                else:
                    rain_mm = 0

                rows.append({
                    "facility_id": site["facility_id"],
                    "facility_code": site["facility_code"],
                    "date": day.isoformat(),
                    "temperature_celsius": temperature,
                    "humidity_percent": humidity,
                    "precipitation_mm": round(rain_mm, 1),
                    "wind_speed_kmh": round(random.uniform(5, 40), 1),
                    "wind_direction_degrees": random.randint(0, 359),
                    "pressure_hpa": random.randint(980, 1030),
                    "visibility_km": round(random.uniform(5, 50), 1),
                    "uv_index": random.randint(1, 11),
                    "weather_conditions": random.choices(condition_names, weights=condition_weights)[0],
                    "severe_weather_alert": random.random() < 0.05,  # 5% chance
                    "flight_delays_expected": random.random() < 0.1,  # 10% chance
                    "road_conditions": random.choice(road_states),
                    "data_source": "WEATHER_SERVICE_API",
                    "last_updated": datetime.now(timezone.utc).isoformat()
                })

        output_file = self.output_dir / "batch" / "weather_data.parquet"
        pd.DataFrame(rows).to_parquet(output_file, index=False)
        print(f"Weather data saved to {output_file}")
        return output_file
    
    def generate_route_performance(self, count=500000):
        """Create per-trip route performance records and write them to Parquet.

        Every record pairs a random route from ``self.routes_df`` with a random
        vehicle from ``self.vehicles_df`` and a trip date within the last year.
        Distance-based figures scale with the route's ``distance_km``; the
        other metrics are independent random draws.

        Args:
            count: Number of trip records to generate.

        Returns:
            Path of the written ``batch/route_performance.parquet`` file.
        """
        print(f"Generating {count:,} route performance records...")

        route_pool = self.routes_df.to_dict('records')
        vehicle_pool = self.vehicles_df.to_dict('records')
        uniform = random.uniform
        randint = random.randint

        trips = []
        for seq in range(count):
            route = random.choice(route_pool)
            vehicle = random.choice(vehicle_pool)
            trip_day = fake.date_between(start_date="-1y", end_date="today")

            # Actual duration varies between 80% and 140% of the estimate
            planned_hours = route["estimated_duration_hours"]
            actual_hours = planned_hours * uniform(0.8, 1.4)
            distance = route["distance_km"]

            trip = {
                "performance_id": f"PERF{seq:08d}",
                "route_id": route["route_id"],
                "vehicle_id": vehicle["vehicle_id"],
                "driver_id": vehicle["driver_id"],
                "trip_date": trip_day.isoformat(),
                "departure_time": fake.time(),
                "arrival_time": fake.time(),
                "planned_duration_hours": planned_hours,
                "actual_duration_hours": round(actual_hours, 2),
                "planned_distance_km": distance,
                "actual_distance_km": round(distance * uniform(0.95, 1.1), 1),
                "fuel_consumed_liters": round(distance * uniform(0.2, 0.5), 2),
                "fuel_cost": round(distance * uniform(0.15, 0.35), 2),
                "packages_delivered": randint(50, 500),
                "delivery_stops": randint(15, 100),
                "on_time_deliveries": 0,  # Placeholder keeps the column position; set below
                "failed_deliveries": randint(0, 5),
                "average_stop_time_minutes": round(uniform(3, 12), 1),
                "traffic_delay_minutes": randint(0, 120),
                "weather_delay_minutes": randint(0, 60),
                "mechanical_delay_minutes": randint(0, 30),
                "driver_rating": round(uniform(3.5, 5.0), 1),
                "customer_complaints": randint(0, 3),
                "route_efficiency_score": round(uniform(0.7, 0.98), 3),
                "carbon_emissions_kg": round(distance * uniform(0.1, 0.3), 2),
                "cost_per_km": round(uniform(0.8, 2.2), 2),
                "revenue_generated": round(uniform(200, 2000), 2),
                "profit_margin": round(uniform(0.05, 0.25), 3)
            }

            # On-time count: delivered minus failed minus up to 10 late ones, never negative
            late_count = randint(0, 10)
            trip["on_time_deliveries"] = max(
                0, trip["packages_delivered"] - trip["failed_deliveries"] - late_count
            )

            trips.append(trip)

            if seq and not seq % 25000:
                print(f"Generated {seq:,} route performance records...")

        output_file = self.output_dir / "batch" / "route_performance.parquet"
        pd.DataFrame(trips).to_parquet(output_file, index=False)
        print(f"Route performance data saved to {output_file}")
        return output_file
    
    def save_reference_data(self):
        """Write the reference DataFrames to ``<output_dir>/reference`` as Parquet.

        Saves ``facilities_df``, ``customers_df``, ``vehicles_df`` and
        ``routes_df`` (in that order), then generates the additional lookup
        tables via ``_generate_lookup_tables``.
        """
        print("Saving reference data tables...")

        reference_dir = self.output_dir / "reference"

        # Each table lives on the instance as `<stem>_df` and is written to
        # `<stem>.parquet`; the attribute is looked up just before writing.
        for stem in ("facilities", "customers", "vehicles", "routes"):
            target = reference_dir / f"{stem}.parquet"
            getattr(self, f"{stem}_df").to_parquet(target, index=False)
            print(f"{stem.capitalize()} saved to {target}")

        # Generate and save additional lookup tables
        self._generate_lookup_tables()
    
    def _generate_lookup_tables(self):
        """Build and save the service-agreement and postal-zone lookup tables.

        Writes ``service_agreements.parquet`` (one row per entry in
        ``self.SERVICE_TYPES``) and ``postal_zones.parquet`` (10,000 random
        rows) into ``<output_dir>/reference``.
        """
        # Per-service terms; services that are not listed use the defaults
        # passed to .get() below.
        commitment_days = {"SAME_DAY": 1, "EXPRESS": 2, "STANDARD": 3}
        threshold_hours = {"SAME_DAY": 24, "EXPRESS": 48, "STANDARD": 72}
        tracking_minutes = {"SAME_DAY": 15, "EXPRESS": 30}
        refund_policies = {"SAME_DAY": "FULL_REFUND", "EXPRESS": "50_PERCENT"}

        # Service Level Agreements
        sla_rows = [
            {
                "service_type": service,
                "delivery_commitment_days": commitment_days.get(service, 7),
                "sla_threshold_hours": threshold_hours.get(service, 168),
                "price_per_kg": round(random.uniform(5, 25), 2),
                "insurance_included": service in ("EXPRESS", "SAME_DAY"),
                "tracking_updates_frequency_minutes": tracking_minutes.get(service, 60),
                "refund_policy": refund_policies.get(service, "CREDIT"),
                "is_active": True,
                "last_updated": datetime.now(timezone.utc).isoformat()
            }
            for service in self.SERVICE_TYPES
        ]

        sla_file = self.output_dir / "reference" / "service_agreements.parquet"
        pd.DataFrame(sla_rows).to_parquet(sla_file, index=False)
        print(f"Service agreements saved to {sla_file}")

        # Geographic regions and postal codes
        zone_kinds = ["URBAN", "SUBURBAN", "RURAL", "REMOTE"]
        access_notes = [None, "GATED_COMMUNITY", "APARTMENT_COMPLEX", "BUSINESS_DISTRICT", "RESTRICTED_ACCESS"]
        # 95 True entries to 5 False entries: a uniform pick is serviceable 95% of the time
        serviceable_pool = [True] * 95 + [False] * 5

        zone_rows = [
            {
                "postal_code": fake.postcode(),
                "city": fake.city(),
                "state_province": fake.state(),
                "country": fake.country(),
                "region": random.choice(self.REGIONS),
                "delivery_zone": random.choice(zone_kinds),
                "average_delivery_time_hours": random.randint(2, 48),
                "special_instructions": random.choice(access_notes),
                "is_serviceable": random.choice(serviceable_pool),
                "last_updated": datetime.now(timezone.utc).isoformat()
            }
            for _ in range(10000)
        ]

        postal_file = self.output_dir / "reference" / "postal_zones.parquet"
        pd.DataFrame(zone_rows).to_parquet(postal_file, index=False)
        print(f"Postal zones saved to {postal_file}")

    def generate_all_datasets(self):
        """Run the whole generation pipeline for the workshop.

        Order of work: reference tables, batch datasets, the three streaming
        datasets (each in its own thread, all joined before continuing), the
        config and sample-query files, and finally a printed summary.
        """
        banner = "=" * 80
        print(banner)
        print("GENERATING COMPLETE GLOBALSHIP LOGISTICS WORKSHOP DATASETS")
        print(banner)

        # Generate reference data
        print("\n1. Saving Reference Data...")
        self.save_reference_data()

        # Generate batch datasets (sizes reduced for the workshop)
        print("\n2. Generating Batch Datasets...")
        self.generate_historical_packages(500000)
        self.generate_weather_data(365)
        self.generate_route_performance(100000)

        # Generate streaming datasets (shorter duration for workshop)
        print("\n3. Generating Streaming Datasets...")

        # One worker thread per stream: (generator method, events per second)
        stream_jobs = (
            (self.generate_package_scans_stream, 50),
            (self.generate_vehicle_telemetry_stream, 25),
            (self.generate_facility_events_stream, 15),
        )
        workers = [
            threading.Thread(
                target=job,
                kwargs={"duration_minutes": 10, "events_per_second": rate},
            )
            for job, rate in stream_jobs
        ]

        # Start every generator first, then wait for all of them to finish
        for worker in workers:
            worker.start()

        for worker in workers:
            worker.join()

        print("\n4. Generating Additional Files...")
        self._generate_config_files()
        self._generate_sample_queries()

        print("\n" + banner)
        print("DATASET GENERATION COMPLETE!")
        print(banner)
        self._print_dataset_summary()
    
    def _generate_config_files(self):
        """Generate configuration and schema files"""
        
        # Data source configuration
        config = {
            "streaming_sources": {
                "package_scans": {
                    "format": "json",
                    "schema_evolution": True,
                    "checkpoint_location": "/mnt/checkpoints/package_scans",
                    "trigger": "10 seconds",
                    "watermark": "scan_timestamp"
                },
                "vehicle_telemetry": {
                    "format": "json", 
                    "schema_evolution": True,
                    "checkpoint_location": "/mnt/checkpoints/vehicle_telemetry",
                    "trigger": "30 seconds",
                    "watermark": "timestamp"
                },
                "facility_events": {
                    "format": "json",
                    "schema_evolution": True, 
                    "checkpoint_location": "/mnt/checkpoints/facility_events",
                    "trigger": "1 minute",
                    "watermark": "timestamp"
                }
            },
            "batch_sources": {
                "historical_packages": {
                    "format": "parquet",
                    "partitioned_by": ["ship_date"],
                    "update_frequency": "daily"
                },
                "weather_data": {
                    "format": "parquet", 
                    "partitioned_by": ["date", "facility_id"],
                    "update_frequency": "hourly"
                },
                "route_performance": {
                    "format": "parquet",
                    "partitioned_by": ["trip_date"],
                    "update_frequency": "daily"
                }
            },
            "reference_data": {
                "facilities": {"format": "parquet", "update_frequency": "weekly"},
                "customers": {"format": "parquet", "update_frequency": "daily"},
                "vehicles": {"format": "parquet", "update_frequency": "monthly"},
                "routes": {"format": "parquet", "update_frequency": "weekly"}
            }
        }
        
        config_file = self.output_dir / "config" / "data_sources.json"
        config_file.parent.mkdir(exist_ok=True)
        with open(config_file, 'w') as f:
            json.dump(config, f, indent=2)
        
        print(f"Configuration saved to {config_file}")
    
    def _generate_sample_queries(self):
        """Generate sample SQL queries for workshop"""
        
        queries = {
            "real_time_dashboard": """
-- Real-time Package Volume Dashboard
SELECT 
    facility_code,
    facility_name,
    COUNT(*) as packages_scanned_last_hour,
    COUNT(CASE WHEN scan_event = 'DELIVERED' THEN 1 END) as deliveries_completed,
    COUNT(CASE WHEN scan_event = 'EXCEPTION' THEN 1 END) as exceptions,
    AVG(CASE WHEN scan_event = 'DELIVERED' THEN 
        DATEDIFF(scan_timestamp, ship_timestamp) END) as avg_delivery_days
FROM package_scans_stream
WHERE scan_timestamp >= current_timestamp() - INTERVAL 1 HOUR
GROUP BY facility_code, facility_name
ORDER BY packages_scanned_last_hour DESC;
            """,
            
            "delivery_performance": """
-- Service Level Agreement Performance
WITH delivery_performance AS (
    SELECT 
        service_type,
        COUNT(*) as total_packages,
        COUNT(CASE WHEN actual_delivery_date <= promised_delivery_date THEN 1 END) as on_time_deliveries,
        AVG(DATEDIFF(actual_delivery_date, ship_date)) as avg_delivery_days,
        AVG(shipping_cost) as avg_revenue_per_package
    FROM historical_packages
    WHERE ship_date >= current_date() - INTERVAL 30 DAYS
    GROUP BY service_type
)
SELECT 
    service_type,
    total_packages,
    on_time_deliveries,
    ROUND((on_time_deliveries * 100.0 / total_packages), 2) as on_time_percentage,
    avg_delivery_days,
    avg_revenue_per_package
FROM delivery_performance
ORDER BY on_time_percentage DESC;
            """,
            
            "route_optimization": """
-- Route Efficiency Analysis
SELECT 
    r.route_id,
    r.origin_facility_code,
    r.destination_facility_code,
    r.distance_km,
    AVG(rp.actual_duration_hours) as avg_actual_duration,
    r.estimated_duration_hours,
    AVG(rp.fuel_consumed_liters) as avg_fuel_consumption,
    AVG(rp.packages_delivered) as avg_packages_per_trip,
    COUNT(rp.performance_id) as total_trips_last_month,
    AVG(rp.route_efficiency_score) as avg_efficiency_score
FROM routes r
JOIN route_performance rp ON r.route_id = rp.route_id
WHERE rp.trip_date >= current_date() - INTERVAL 30 DAYS
GROUP BY r.route_id, r.origin_facility_code, r.destination_facility_code, 
         r.distance_km, r.estimated_duration_hours
HAVING total_trips_last_month >= 10
ORDER BY avg_efficiency_score DESC;
            """
        }
        
        queries_file = self.output_dir / "config" / "sample_queries.sql"
        queries_file.parent.mkdir(exist_ok=True)
        with open(queries_file, 'w') as f:
            for name, query in queries.items():
                f.write(f"-- {name.upper()}\n{query}\n\n")
        
        print(f"Sample queries saved to {queries_file}")
    
    def _print_dataset_summary(self):
        """Print summary of generated datasets"""
        print("\nDATASET SUMMARY:")
        print("-" * 50)
        
        # Count files and sizes
        total_files = 0
        total_size = 0
        
        for subdir in ["streaming", "batch", "reference", "config"]:
            subdir_path = self.output_dir / subdir
            if subdir_path.exists():
                files = list(subdir_path.rglob("*"))
                files = [f for f in files if f.is_file()]
                subdir_size = sum(f.stat().st_size for f in files)
                
                print(f"{subdir.upper()}:")
                for file in files:
                    size_mb = file.stat().st_size / (1024 * 1024)
                    print(f"  - {file.name}: {size_mb:.1f} MB")
                
                total_files += len(files)
                total_size += subdir_size
        
        print(f"\nTOTAL: {total_files} files, {total_size / (1024 * 1024):.1f} MB")
        print(f"Output directory: {self.output_dir.absolute()}")
        
        print(f"\nREFERENCE DATA COUNTS:")
        print(f"  - Facilities: {len(self.facilities_df):,}")
        print(f"  - Customers: {len(self.customers_df):,}")
        print(f"  - Vehicles: {len(self.vehicles_df):,}")
        print(f"  - Routes: {len(self.routes_df):,}")



In [0]:
def main(volume_path=None):
    """Generate every workshop dataset into the chosen output location.

    Args:
        volume_path (str, optional): Path to a Databricks Unity Catalog volume
            (e.g., "/Volumes/catalog/schema/volume_name/workshop_data").
            When omitted or empty, the local path "globalship_workshop_data"
            is used instead.
    """
    print("GlobalShip Logistics Data Generator")
    print("=" * 50)

    # Pick the output directory: local fallback unless a volume path was given
    if not volume_path:
        output_dir = "globalship_workshop_data"
        print(f"Using local path: {output_dir}")
    else:
        output_dir = volume_path
        print(f"Using Databricks Volume path: {output_dir}")

    # Build the generator and produce all datasets in one go
    LogisticsDataGenerator(output_dir).generate_all_datasets()

    print("\nWorkshop datasets are ready!")
    if not volume_path:
        print("You can now use these datasets with Databricks for the capstone project.")
    else:
        print(f"Datasets are available in Databricks Volume: {volume_path}")


In [0]:
# Notebook entry point: run the full generation pipeline and write every
# dataset under this Databricks volume path.
data_path = "/Volumes/workspace/default/sample_data"
main(data_path)

GlobalShip Logistics Data Generator
Using Databricks Volume path: /Volumes/workspace/default/sample_data
Initialized LogisticsDataGenerator with output directory: /Volumes/workspace/default/sample_data
GENERATING COMPLETE GLOBALSHIP LOGISTICS WORKSHOP DATASETS

1. Saving Reference Data...
Saving reference data tables...
Facilities saved to /Volumes/workspace/default/sample_data/reference/facilities.parquet
Customers saved to /Volumes/workspace/default/sample_data/reference/customers.parquet
Vehicles saved to /Volumes/workspace/default/sample_data/reference/vehicles.parquet
Routes saved to /Volumes/workspace/default/sample_data/reference/routes.parquet
Service agreements saved to /Volumes/workspace/default/sample_data/reference/service_agreements.parquet
Postal zones saved to /Volumes/workspace/default/sample_data/reference/postal_zones.parquet

2. Generating Batch Datasets...
Generating 500,000 historical packages...
Generated 50,000 historical packages...
Generated 100,000 historical 