In [34]:
import zipfile
import os
import fiona
import tempfile
import xml.etree.ElementTree as ET
from os.path import splitext, join, basename, dirname, isfile, isdir
from pyproj import CRS
from xml.dom import minidom
from collections import Counter


# -----------------------------------
# PLACEHOLDER: Define input path
# -----------------------------------
INPUT_PATH = "TMBPoints_QGis_reprojectedoutput"  # ZIP archive or directory to analyze

# Derive the report location from the kind of input:
#   * ZIP archive -> "<archive stem>_Report.xml" in the archive's own directory
#   * directory   -> "Directory_Report.xml" inside that directory
# Anything else is rejected immediately.
if zipfile.is_zipfile(INPUT_PATH):
    OUTPUT_XML_PATH = join(
        dirname(INPUT_PATH),
        splitext(basename(INPUT_PATH))[0] + "_Report.xml",
    )
elif isdir(INPUT_PATH):
    OUTPUT_XML_PATH = join(INPUT_PATH, "Directory_Report.xml")
else:
    raise ValueError("Input must be a valid ZIP file or directory.")


# -----------------------------------
# Utility Functions
# -----------------------------------

def pretty_print_xml(element):
    """
    Render an ElementTree element as an indented XML document string.

    The element is serialized to UTF-8 bytes, re-parsed with ``minidom`` and
    written back out with a two-space indent per nesting level.
    """
    document = minidom.parseString(ET.tostring(element, encoding="utf-8"))
    return document.toprettyxml(indent="  ")


def validate_shapefile_components(base_name, file_list):
    """
    Report which shapefile component files exist for a given base name.

    The base name must match exactly, but the extension is compared
    case-insensitively, so 'ROADS.SHP' / 'ROADS.Dbf' count as the '.shp' and
    '.dbf' components of 'ROADS'.

    Args:
        base_name (str): The base name of the shapefile (e.g., 'TMBPoints').
        file_list (list): File names (without directories) to search.

    Returns:
        dict: Maps each component extension ('.shp', '.shx', '.dbf', '.prj',
        '.cpg', '.sbn', '.sbx', in that order) to True if a matching file is
        in ``file_list``, otherwise False.
    """
    required_extensions = [".shp", ".shx", ".dbf"]
    optional_extensions = [".prj", ".cpg", ".sbn", ".sbx"]

    # Collect, once, the lower-cased suffix of every name that starts with the
    # base name; a component is present when its extension is one of them.
    suffixes = {
        name[len(base_name):].lower()
        for name in file_list
        if name.startswith(base_name)
    }
    return {ext: ext in suffixes for ext in required_extensions + optional_extensions}


def list_files(input_path):
    """
    List all files in a directory or ZIP archive.

    A ZIP archive is extracted into a fresh temporary directory so that its
    members can be opened from disk. The caller owns that directory and should
    call ``cleanup()`` on it when finished.

    Args:
        input_path (str): Path to the input (directory or ZIP file).

    Returns:
        tuple: ``(file_list, temp_dir)``. For a ZIP archive, ``file_list`` holds
        the archive member names of regular files (directory entries such as
        'sub/' are skipped) and ``temp_dir`` is the
        ``tempfile.TemporaryDirectory`` holding the extracted archive. For a
        directory, ``file_list`` holds every file found by walking it, each
        joined onto ``input_path``, and ``temp_dir`` is None.

    Raises:
        ValueError: If ``input_path`` is neither a ZIP file nor a directory.
    """
    if zipfile.is_zipfile(input_path):
        temp_dir = tempfile.TemporaryDirectory()
        try:
            with zipfile.ZipFile(input_path, 'r') as zip_file:
                zip_file.extractall(temp_dir.name)
                # namelist() would also return directory entries, which are
                # not files and have an empty base name.
                members = [
                    info.filename for info in zip_file.infolist() if not info.is_dir()
                ]
        except Exception:
            # Do not leave a half-extracted directory behind on failure.
            temp_dir.cleanup()
            raise
        return members, temp_dir

    if isdir(input_path):
        file_list = []
        for root, _, files in os.walk(input_path):
            file_list.extend(join(root, name) for name in files)
        return file_list, None  # No temp dir needed for directories

    raise ValueError("Input must be a valid ZIP file or directory.")


# -----------------------------------
# Main Function
# -----------------------------------

def _xml_safe_tag(field_name):
    """Return ``field_name`` adjusted so it can serve as an XML element name."""
    # Replace ASCII characters that cannot appear in an element name (spaces,
    # punctuation, ':' which would be read as a namespace prefix). Non-ASCII
    # characters are passed through unchanged.
    cleaned = "".join(
        ch if ord(ch) > 127 or ch.isalnum() or ch in "_-." else "_"
        for ch in field_name
    )
    first = cleaned[:1]
    # An element name may not be empty or start with a digit, '-' or '.'.
    if not first or (ord(first) < 128 and not (first.isalpha() or first == "_")):
        cleaned = f"_{cleaned}"
    return cleaned


def _describe_shapefile(src, shapefile_element, sample_limit=5):
    """Append CRS, extent, feature statistics, schema and samples of an open collection."""
    # CRS Information
    crs = CRS(src.crs).to_string() if src.crs else "Unknown"
    ET.SubElement(shapefile_element, "CRS").text = crs

    # Bounding Box
    min_x, min_y, max_x, max_y = src.bounds
    bbox_element = ET.SubElement(shapefile_element, "BoundingBox")
    ET.SubElement(bbox_element, "MinX").text = str(min_x)
    ET.SubElement(bbox_element, "MinY").text = str(min_y)
    ET.SubElement(bbox_element, "MaxX").text = str(max_x)
    ET.SubElement(bbox_element, "MaxY").text = str(max_y)

    # Feature Statistics: one pass gathers both the geometry-type histogram
    # and the first few records, so the file is only read once.
    ET.SubElement(shapefile_element, "FeatureCount").text = str(len(src))
    geometry_types = Counter()
    samples = []
    for feature in src:
        geometry = feature["geometry"]
        # Records may carry no geometry at all; count them as "Null".
        geometry_types[str(geometry["type"]) if geometry is not None else "Null"] += 1
        if len(samples) < sample_limit:
            samples.append(feature["properties"])

    geom_types_element = ET.SubElement(shapefile_element, "GeometryTypes")
    for geom_type, count in geometry_types.items():
        ET.SubElement(geom_types_element, "Type", {"name": geom_type}).text = str(count)

    # Attributes
    attributes = ET.SubElement(shapefile_element, "Attributes")
    for field, field_type in src.schema["properties"].items():
        attribute_element = ET.SubElement(attributes, "Field", {"name": field})
        ET.SubElement(attribute_element, "Type").text = field_type

    # Sample Data
    sample_element = ET.SubElement(shapefile_element, "SampleFeatures")
    for number, properties in enumerate(samples, start=1):
        feature_element = ET.SubElement(sample_element, "Feature", {"id": str(number)})
        for key, value in properties.items():
            tag = _xml_safe_tag(key)
            value_element = ET.SubElement(feature_element, tag)
            if tag != key:
                # Keep the real field name when the tag had to be altered.
                value_element.set("name", key)
            value_element.text = str(value)


def analyze_input_to_xml(input_path, output_xml_path):
    """
    Analyze a directory or ZIP archive and write an XML report about its contents.

    The report lists every file by base name and, for each ``.shp`` file, the
    presence of its component files in the same folder plus CRS, bounding box,
    feature count, geometry types, attribute schema and up to five sample
    features. A shapefile that cannot be read gets an ``<Error>`` element
    instead of aborting the report. Any other failure is caught and printed;
    nothing is raised to the caller.

    Args:
        input_path (str): Path to the directory or ZIP archive.
        output_xml_path (str): Path to save the XML report.
    """
    temp_dir = None
    try:
        # Root element of the XML document
        root = ET.Element("InputReport")
        ET.SubElement(root, "InputPath").text = str(input_path)

        # List files and prepare working directory
        file_list, temp_dir = list_files(input_path)

        # Record every file (base name only)
        files_element = ET.SubElement(root, "Files")
        for file in file_list:
            ET.SubElement(files_element, "File", {"name": basename(file)})

        # Identify shapefiles in the input
        shapefiles = [f for f in file_list if splitext(f)[1].lower() == ".shp"]
        shapefiles_element = ET.SubElement(root, "Shapefiles")

        if not shapefiles:
            ET.SubElement(shapefiles_element, "Message").text = "No shapefiles found."

        for shapefile in shapefiles:
            shapefile_path = shapefile if temp_dir is None else join(temp_dir.name, shapefile)
            shapefile_element = ET.SubElement(shapefiles_element, "Shapefile", {"name": basename(shapefile)})

            # Validate shapefile components against files in the same folder
            # only; a sidecar elsewhere does not belong to this shapefile.
            folder = dirname(shapefile)
            siblings = [basename(f) for f in file_list if dirname(f) == folder]
            base_name = splitext(basename(shapefile))[0]
            components = validate_shapefile_components(base_name, siblings)
            validation_element = ET.SubElement(shapefile_element, "Validation")
            for ext, present in components.items():
                ET.SubElement(validation_element, "Component", {"name": ext, "present": str(present)})

            # Attempt to read the shapefile
            try:
                with fiona.open(shapefile_path, "r") as src:
                    _describe_shapefile(src, shapefile_element)
            except Exception as e:
                ET.SubElement(shapefile_element, "Error").text = f"Failed to read shapefile: {e}"

        # Pretty-print the XML and write it to the file
        pretty_xml = pretty_print_xml(root)
        with open(output_xml_path, "w", encoding="utf-8") as f:
            f.write(pretty_xml)
        print(f"XML report generated: {output_xml_path}")

    except Exception as e:
        print(f"Error analyzing input: {e}")
    finally:
        # Remove the directory a ZIP input was extracted into, if any.
        if temp_dir is not None:
            temp_dir.cleanup()


# -----------------------------------
# Main Execution Block
# -----------------------------------

if __name__ == "__main__":
    # Announce the input and the report destination, then build the report.
    print(
        f"Analyzing input: {INPUT_PATH}",
        f"Output XML will be saved to: {OUTPUT_XML_PATH}",
        sep="\n",
    )
    analyze_input_to_xml(INPUT_PATH, OUTPUT_XML_PATH)


Analyzing input: TMBPoints_QGis_reprojectedoutput
Output XML will be saved to: TMBPoints_QGis_reprojectedoutput/Directory_Report.xml
XML report generated: TMBPoints_QGis_reprojectedoutput/Directory_Report.xml
