In [4]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import re
from pathlib import Path

# Cheap pre-filter: an element's text is only treated as a coordinate list when
# it contains at least two whitespace-separated number-like tokens.
_COORD_PAIR_RE = re.compile(r'[\d\.-]+\s+[\d\.-]+')
_INF = float('inf')

# Output column layout: identifier + bounding box first, then the 90 simulated
# attributes in the order they are generated.
_SPATIAL_COLS = ['building_id', 'min_x', 'max_x', 'min_y', 'max_y', 'min_z', 'max_z']

_NON_SPATIAL_COLS = [
    'price', 'year_built', 'num_units', 'property_tax', 'lot_size', 'building_area', 'garage_spaces', 'bathrooms', 'bedrooms', 'floors',
    'environmental_indicator', 'energy_efficiency', 'carbon_footprint', 'solar_potential', 'insulation_rating', 'window_efficiency', 'hvac_efficiency', 'water_usage', 'waste_generation', 'green_space_access',
    'walk_score', 'transit_score', 'bike_score', 'distance_to_cbd', 'distance_to_school', 'distance_to_hospital', 'distance_to_shopping', 'parking_availability', 'public_transport_access', 'highway_access',
    'crime_rate', 'police_response_time', 'fire_safety_rating', 'security_features', 'neighborhood_safety', 'lighting_quality', 'surveillance_coverage', 'emergency_exits', 'smoke_detectors', 'sprinkler_system',
    'internet_speed', 'power_reliability', 'water_pressure', 'sewage_capacity', 'electrical_capacity', 'gas_availability', 'cable_tv_access', 'fiber_optic_access', 'backup_power', 'smart_home_features',
    'market_value', 'appreciation_rate', 'rental_yield', 'maintenance_cost', 'insurance_cost', 'hoa_fees', 'utility_costs', 'property_appreciation', 'tax_assessment', 'mortgage_rate',
    'reviews', 'community_rating', 'neighbor_satisfaction', 'noise_level', 'cultural_diversity', 'local_amenities', 'social_cohesion', 'event_frequency', 'volunteer_activity', 'civic_engagement',
    'structural_integrity', 'renovation_need', 'maintenance_quality', 'building_materials', 'roof_condition', 'foundation_quality', 'plumbing_condition', 'electrical_condition', 'exterior_condition', 'interior_condition',
    'outdoor_space', 'storage_space', 'natural_light', 'ventilation_quality', 'accessibility_features', 'pet_friendliness', 'family_suitability', 'privacy_rating', 'view_quality', 'future_development'
]


def _local_name(tag):
    """Return an ElementTree tag/attribute name without its ``{namespace}`` prefix."""
    if not isinstance(tag, str):
        return ''
    return tag.rsplit('}', 1)[-1]


def _element_id(elem, fallback):
    """Return the element's id attribute (GML 3.1, plain, or any namespace), else *fallback*."""
    explicit = elem.get('{http://www.opengis.net/gml}id') or elem.get('id')
    if explicit:
        return explicit
    # Other GML versions (e.g. 3.2) put ``id`` in a different namespace.
    for key, value in elem.attrib.items():
        if value and _local_name(key) == 'id':
            return value
    return fallback


def _bounding_box(elem):
    """Return the min/max x, y, z over all coordinate triplets below *elem*, or None if there are none."""
    xs, ys, zs = [], [], []
    for child in elem.iter():
        text = child.text
        if not text or not _COORD_PAIR_RE.search(text):
            continue
        tokens = text.split()
        # Tokens are consumed as consecutive x, y, z triplets; a trailing
        # incomplete triplet is ignored.
        for i in range(0, len(tokens) - 2, 3):
            try:
                x = float(tokens[i])
                y = float(tokens[i + 1])
                z = float(tokens[i + 2])
            except ValueError:
                # Non-numeric text that slipped past the regex pre-filter.
                continue
            # Reject NaN / +-inf so they cannot poison min() / max().
            if -_INF < x < _INF and -_INF < y < _INF and -_INF < z < _INF:
                xs.append(x)
                ys.append(y)
                zs.append(z)
    if not xs:
        return None
    return {
        'min_x': min(xs), 'max_x': max(xs),
        'min_y': min(ys), 'max_y': max(ys),
        'min_z': min(zs), 'max_z': max(zs),
    }


def _collect_building_boxes(root):
    """Return one ``{building_id, min_*/max_*}`` dict per Building element that has coordinates."""
    rows = []
    building_count = 0
    for elem in root.iter():
        # Exact local-name match: related tags such as BuildingPart or
        # BuildingInstallation must not be emitted as separate buildings.
        if _local_name(elem.tag) != 'Building':
            continue
        building_count += 1
        if building_count % 1000 == 0:
            print(f"Processing building {building_count}...")

        box = _bounding_box(elem)
        if box is None:
            continue
        row = {'building_id': _element_id(elem, f'Building_{building_count}')}
        row.update(box)
        rows.append(row)
    return rows


def _sample_boxes(count):
    """Return *count* synthetic bounding-box rows used when the GML yields no buildings."""
    return [
        {
            'building_id': f'Building_{i+1}',
            'min_x': -122.5 + np.random.random() * 0.1,
            'max_x': -122.4 + np.random.random() * 0.1,
            'min_y': 37.7 + np.random.random() * 0.1,
            'max_y': 37.8 + np.random.random() * 0.1,
            'min_z': 0 + np.random.random() * 10,
            'max_z': 20 + np.random.random() * 100
        }
        for i in range(count)
    ]


def _add_simulated_attributes(df):
    """Add the 90 simulated non-spatial columns to *df* in place, drawing from NumPy's global RNG."""
    n = len(df)

    # Bounding-box volume, scaled and clipped, loosely drives price and lot size.
    if 'min_x' in df.columns:
        width = (df['max_x'] - df['min_x']).abs()
        length = (df['max_y'] - df['min_y']).abs()
        height = (df['max_z'] - df['min_z']).abs()
        volume_factor = (width * length * height * 1000000).clip(0.1, 10)
    else:
        volume_factor = np.random.uniform(0.1, 10, n)

    # 1-10: Basic Property Information
    df['price'] = (300 + (volume_factor * 50) + np.random.normal(0, 100, n)).clip(150, 2000).astype(int)
    # Pick an era per building (30% / 40% / 30%), then a year inside that era,
    # so every building gets its own year rather than one of three shared values.
    era_low = np.array([1900, 1980, 2010])
    era_high = np.array([1980, 2010, 2024])  # exclusive upper bounds
    era = np.random.choice(3, size=n, p=[0.3, 0.4, 0.3])
    era_span = era_high[era] - era_low[era]
    df['year_built'] = era_low[era] + (np.random.random(n) * era_span).astype(int)
    df['num_units'] = np.random.choice([1, 2, 4, 8, 12, 20, 50, 100], size=n, p=[0.3, 0.2, 0.15, 0.1, 0.1, 0.05, 0.05, 0.05])
    df['property_tax'] = (df['price'] * np.random.uniform(8, 15, n)).astype(int)
    df['lot_size'] = (volume_factor * 1000 + np.random.normal(0, 500, n)).clip(500, 20000).astype(int)
    df['building_area'] = (df['lot_size'] * np.random.uniform(0.3, 0.8, n)).astype(int)
    df['garage_spaces'] = np.random.choice([0, 1, 2, 3, 4], size=n, p=[0.1, 0.3, 0.4, 0.15, 0.05])
    df['bathrooms'] = np.random.choice([1, 2, 3, 4, 5], size=n, p=[0.2, 0.35, 0.3, 0.1, 0.05])
    df['bedrooms'] = np.random.choice([1, 2, 3, 4, 5, 6], size=n, p=[0.15, 0.25, 0.35, 0.15, 0.05, 0.05])
    df['floors'] = np.random.choice([1, 2, 3, 4], size=n, p=[0.4, 0.35, 0.2, 0.05])

    # 11-20: Environmental and Sustainability
    df['environmental_indicator'] = np.random.normal(70, 15, n).clip(0, 100).astype(int)
    df['energy_efficiency'] = (30 + (2024 - df['year_built']) * 0.3 + df['environmental_indicator'] * 0.3 +
                               np.random.normal(0, 10, n)).clip(1, 100).astype(int)
    df['carbon_footprint'] = (100 - df['energy_efficiency'] + np.random.normal(0, 10, n)).clip(0, 100).astype(int)
    df['solar_potential'] = np.random.normal(65, 20, n).clip(0, 100).astype(int)
    df['insulation_rating'] = np.random.normal(70, 15, n).clip(20, 100).astype(int)
    df['window_efficiency'] = np.random.normal(60, 20, n).clip(10, 100).astype(int)
    df['hvac_efficiency'] = np.random.normal(75, 15, n).clip(30, 100).astype(int)
    # NOTE(review): the medians of these log-normals (exp(4) ~ 55, exp(3.5) ~ 33)
    # lie below the lower clip bounds, so most rows end up exactly at 100 / 50.
    # Parameters kept as-is; confirm whether that is intended.
    df['water_usage'] = np.random.lognormal(4, 0.5, n).clip(100, 5000).astype(int)
    df['waste_generation'] = np.random.lognormal(3.5, 0.4, n).clip(50, 2000).astype(int)
    df['green_space_access'] = np.random.normal(50, 25, n).clip(0, 100).astype(int)

    # 21-30: Location and Accessibility
    df['walk_score'] = np.random.normal(65, 20, n).clip(0, 100).astype(int)
    df['transit_score'] = np.random.normal(df['walk_score'] * 0.8, 15, n).clip(0, 100).astype(int)
    df['bike_score'] = np.random.normal(df['walk_score'] * 0.9, 18, n).clip(0, 100).astype(int)
    df['distance_to_cbd'] = np.random.exponential(10, n).clip(0.5, 50).round(1)
    df['distance_to_school'] = np.random.exponential(2, n).clip(0.1, 15).round(1)
    df['distance_to_hospital'] = np.random.exponential(5, n).clip(0.2, 30).round(1)
    df['distance_to_shopping'] = np.random.exponential(1.5, n).clip(0.1, 10).round(1)
    df['parking_availability'] = np.random.normal(70, 20, n).clip(0, 100).astype(int)
    # Clip the *sum* (score + symmetric noise) so the result stays a 0-100 score.
    df['public_transport_access'] = (df['transit_score'] + np.random.normal(0, 5, n)).clip(0, 100).astype(int)
    df['highway_access'] = np.random.normal(60, 25, n).clip(0, 100).astype(int)

    # 31-40: Safety and Security
    df['crime_rate'] = np.where(np.random.random(n) < 0.7,
                                np.random.randint(5, 30, n),
                                np.where(np.random.random(n) < 0.95,
                                         np.random.randint(30, 60, n),
                                         np.random.randint(60, 100, n)))
    df['police_response_time'] = np.random.lognormal(2, 0.5, n).clip(2, 30).round(1)
    df['fire_safety_rating'] = np.random.normal(80, 15, n).clip(30, 100).astype(int)
    df['security_features'] = np.random.randint(0, 10, n)
    df['neighborhood_safety'] = (100 - df['crime_rate'] * 0.8 + np.random.normal(0, 10, n)).clip(0, 100).astype(int)
    df['lighting_quality'] = np.random.normal(70, 20, n).clip(20, 100).astype(int)
    df['surveillance_coverage'] = np.random.normal(40, 25, n).clip(0, 100).astype(int)
    df['emergency_exits'] = np.random.choice([1, 2, 3, 4, 5], size=n, p=[0.1, 0.3, 0.4, 0.15, 0.05])
    df['smoke_detectors'] = np.random.choice([0, 1], size=n, p=[0.05, 0.95])
    df['sprinkler_system'] = np.random.choice([0, 1], size=n, p=[0.7, 0.3])

    # 41-50: Infrastructure and Utilities
    df['internet_speed'] = np.random.lognormal(4, 0.8, n).clip(10, 1000).astype(int)
    df['power_reliability'] = np.random.normal(85, 15, n).clip(50, 100).astype(int)
    df['water_pressure'] = np.random.normal(75, 20, n).clip(30, 100).astype(int)
    df['sewage_capacity'] = np.random.normal(80, 15, n).clip(40, 100).astype(int)
    df['electrical_capacity'] = np.random.normal(70, 20, n).clip(30, 100).astype(int)
    df['gas_availability'] = np.random.choice([0, 1], size=n, p=[0.2, 0.8])
    df['cable_tv_access'] = np.random.choice([0, 1], size=n, p=[0.15, 0.85])
    df['fiber_optic_access'] = np.random.choice([0, 1], size=n, p=[0.4, 0.6])
    df['backup_power'] = np.random.choice([0, 1], size=n, p=[0.8, 0.2])
    df['smart_home_features'] = np.random.randint(0, 15, n)

    # 51-60: Market and Financial
    df['market_value'] = (df['price'] * np.random.uniform(0.8, 1.2, n)).astype(int)
    df['appreciation_rate'] = np.random.normal(3, 2, n).clip(-5, 15).round(2)
    df['rental_yield'] = np.random.normal(5, 2, n).clip(1, 15).round(2)
    df['maintenance_cost'] = (df['price'] * 0.02 * np.random.uniform(0.5, 2, n)).astype(int)
    df['insurance_cost'] = (df['price'] * 0.005 * np.random.uniform(0.8, 1.5, n)).astype(int)
    df['hoa_fees'] = np.where(np.random.random(n) < 0.6, 0, np.random.randint(100, 800, n))
    df['utility_costs'] = np.random.normal(200, 50, n).clip(50, 500).astype(int)
    df['property_appreciation'] = np.random.normal(50000, 20000, n).clip(-10000, 200000).astype(int)
    df['tax_assessment'] = (df['market_value'] * np.random.uniform(0.7, 1.3, n)).astype(int)
    df['mortgage_rate'] = np.random.normal(6, 1.5, n).clip(3, 10).round(2)

    # 61-70: Community and Social
    df['reviews'] = np.random.choice([10, 20, 30, 40, 50], size=n, p=[0.05, 0.15, 0.35, 0.35, 0.10])
    df['community_rating'] = np.random.normal(65, 20, n).clip(0, 100).astype(int)
    df['neighbor_satisfaction'] = np.random.normal(70, 18, n).clip(0, 100).astype(int)
    df['noise_level'] = np.random.normal(40, 20, n).clip(0, 100).astype(int)
    df['cultural_diversity'] = np.random.normal(60, 25, n).clip(0, 100).astype(int)
    df['local_amenities'] = np.random.randint(0, 20, n)
    df['social_cohesion'] = np.random.normal(55, 22, n).clip(0, 100).astype(int)
    df['event_frequency'] = np.random.poisson(5, n).clip(0, 20)
    df['volunteer_activity'] = np.random.normal(30, 20, n).clip(0, 100).astype(int)
    df['civic_engagement'] = np.random.normal(45, 25, n).clip(0, 100).astype(int)

    # 71-80: Building Quality and Condition
    df['structural_integrity'] = np.random.normal(80, 15, n).clip(40, 100).astype(int)
    df['renovation_need'] = (100 - df['structural_integrity'] + np.random.normal(0, 15, n)).clip(0, 100).astype(int)
    df['maintenance_quality'] = np.random.normal(75, 18, n).clip(20, 100).astype(int)
    df['building_materials'] = np.random.choice([1, 2, 3, 4, 5], size=n, p=[0.1, 0.2, 0.4, 0.2, 0.1])  # 1=Poor, 5=Excellent
    df['roof_condition'] = np.random.normal(70, 20, n).clip(10, 100).astype(int)
    df['foundation_quality'] = np.random.normal(85, 12, n).clip(50, 100).astype(int)
    df['plumbing_condition'] = np.random.normal(75, 18, n).clip(20, 100).astype(int)
    df['electrical_condition'] = np.random.normal(78, 16, n).clip(30, 100).astype(int)
    df['exterior_condition'] = np.random.normal(72, 20, n).clip(20, 100).astype(int)
    df['interior_condition'] = np.random.normal(74, 19, n).clip(25, 100).astype(int)

    # 81-90: Additional Features and Amenities
    df['outdoor_space'] = np.random.normal(40, 30, n).clip(0, 100).astype(int)
    df['storage_space'] = np.random.normal(35, 20, n).clip(0, 100).astype(int)
    df['natural_light'] = np.random.normal(65, 20, n).clip(10, 100).astype(int)
    df['ventilation_quality'] = np.random.normal(70, 18, n).clip(20, 100).astype(int)
    df['accessibility_features'] = np.random.randint(0, 10, n)
    df['pet_friendliness'] = np.random.choice([0, 1], size=n, p=[0.3, 0.7])
    df['family_suitability'] = np.random.normal(60, 25, n).clip(0, 100).astype(int)
    df['privacy_rating'] = np.random.normal(55, 25, n).clip(0, 100).astype(int)
    df['view_quality'] = np.random.normal(45, 30, n).clip(0, 100).astype(int)
    df['future_development'] = np.random.choice([0, 1, 2, 3], size=n, p=[0.4, 0.3, 0.2, 0.1])  # 0=None, 3=High potential

    return df


def parse_gml_to_csv_simplified(gml_file_path, output_csv_path):
    """
    Parse a GML file into per-building bounding boxes plus 90 simulated attributes.

    Every element whose local tag name is exactly ``Building`` contributes one
    row holding its id and the minimum bounding box (min/max x, y, z) of all
    coordinate triplets found in its subtree. Ninety simulated non-spatial
    columns are then appended and the table is written to CSV. Progress and a
    short summary are printed to stdout.

    If no building with coordinates is found, 100 synthetic sample boxes are
    used instead (a message is printed).

    NumPy's global random state is reset with ``np.random.seed(42)`` so that
    repeated runs on the same input produce the same table.

    Parameters:
    -----------
    gml_file_path : str
        Path to the input GML file
    output_csv_path : str
        Path for the output CSV file

    Returns:
    --------
    pandas.DataFrame
        The table that was written: 7 spatial columns followed by the 90
        simulated attribute columns.

    Raises:
    -------
    xml.etree.ElementTree.ParseError
        If the input is not well-formed XML.
    OSError
        If the input cannot be read or the output cannot be written.
    """

    print("Starting GML parsing...")

    root = ET.parse(gml_file_path).getroot()
    buildings_data = _collect_building_boxes(root)

    print(f"Found {len(buildings_data)} buildings with valid coordinates")

    # Seed before any random draw (including the sample fallback) so the whole
    # output is reproducible.
    np.random.seed(42)

    df = pd.DataFrame(buildings_data)
    if df.empty:
        print("No valid building data found. Creating sample data...")
        df = pd.DataFrame(_sample_boxes(100))

    print("Adding 90 non-spatial attributes...")
    _add_simulated_attributes(df)

    # Spatial attributes first, then all 90 non-spatial attributes; any column
    # that is unexpectedly missing is created and filled with 0.
    column_order = _SPATIAL_COLS + _NON_SPATIAL_COLS
    df = df.reindex(columns=column_order, fill_value=0)

    # Save to CSV
    df.to_csv(output_csv_path, index=False)

    print(f"\n{'='*60}")
    print(f"CSV file saved to: {output_csv_path}")
    print(f"Total buildings processed: {len(df)}")
    print(f"Total attributes: {len(column_order)} (7 spatial + 90 non-spatial)")
    print(f"\nNon-spatial attributes organized by category:")
    print("  1-10: Basic Property Information")
    print("  11-20: Environmental and Sustainability")
    print("  21-30: Location and Accessibility")
    print("  31-40: Safety and Security")
    print("  41-50: Infrastructure and Utilities")
    print("  51-60: Market and Financial")
    print("  61-70: Community and Social")
    print("  71-80: Building Quality and Condition")
    print("  81-90: Additional Features and Amenities")

    print(f"\n{'='*60}")
    print("SAMPLE STATISTICS (First 20 non-spatial attributes)")
    print(f"{'='*60}")

    print("\nFirst 3 rows of the data (showing first 10 columns):")
    print(df.iloc[:3, :10])

    # Summary statistics for the first 20 simulated attributes only.
    for col in _NON_SPATIAL_COLS[:20]:
        print(f"\n{col}:")
        print(f"  Min: {df[col].min()}")
        print(f"  Max: {df[col].max()}")
        print(f"  Mean: {df[col].mean():.1f}")
        print(f"  Median: {df[col].median():.1f}")

    print(f"\n{'='*60}")
    print("Processing complete!")
    print(f"Full dataset with all 90 non-spatial attributes saved to: {output_csv_path}")

    return df

# Main execution: convert the Florida GML tile to CSV; if that fails, write a
# small synthetic placeholder file so downstream steps still find a CSV.
if __name__ == "__main__":
    # Define file paths
    input_gml_file = "Florida-12001-000.gml"
    output_csv_file = "Florida_90.csv"

    # Check if input file exists
    if not Path(input_gml_file).exists():
        print(f"Error: Input file '{input_gml_file}' not found.")
        print("Please make sure the file is in the current directory or provide the full path.")
    else:
        # Process the GML file
        try:
            df = parse_gml_to_csv_simplified(input_gml_file, output_csv_file)

        except Exception as e:
            # Script-level boundary: report the failure, then fall back to
            # synthetic bounding boxes instead of aborting.
            print(f"Error processing GML file: {str(e)}")
            print("\nCreating sample data file instead...")

            # Fixed seed so the placeholder file is the same on every run.
            np.random.seed(42)
            sample_data = [
                {
                    'building_id': f'Building_{i+1}',
                    'min_x': -122.5 + np.random.random() * 0.2,
                    'max_x': -122.3 + np.random.random() * 0.2,
                    'min_y': 37.7 + np.random.random() * 0.2,
                    'max_y': 37.9 + np.random.random() * 0.2,
                    'min_z': 0 + np.random.random() * 10,
                    'max_z': 20 + np.random.random() * 100
                }
                for i in range(1000)
            ]

            df = pd.DataFrame(sample_data)

            # The 90 simulated attributes are generated only inside
            # parse_gml_to_csv_simplified; this fallback does NOT add them, so
            # the placeholder has a different schema than a successful run.
            df.to_csv(output_csv_file, index=False)
            print(f"Sample CSV file created: {output_csv_file}")
            print(f"Total sample buildings: {len(df)}")
            print(f"WARNING: the sample file contains only {len(df.columns)} spatial columns; "
                  "the 90 non-spatial attributes were not generated.")

# Example usage in Jupyter Notebook:
# Run this cell to process your GML file and generate the CSV with MBB + 90 non-spatial attributes

Starting GML parsing...
Processing building 1000...
Processing building 2000...
Processing building 3000...
Processing building 4000...
Processing building 5000...
Processing building 6000...
Processing building 7000...
Processing building 8000...
Processing building 9000...
Processing building 10000...
Processing building 11000...
Processing building 12000...
Processing building 13000...
Processing building 14000...
Processing building 15000...
Processing building 16000...
Processing building 17000...
Processing building 18000...
Processing building 19000...
Processing building 20000...
Processing building 21000...
Processing building 22000...
Processing building 23000...
Processing building 24000...
Processing building 25000...
Processing building 26000...
Processing building 27000...
Processing building 28000...
Processing building 29000...
Processing building 30000...
Processing building 31000...
Processing building 32000...
Processing building 33000...
Processing building 34000...