In [1]:
pip install lxml

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import json
import xml.etree.ElementTree as ET

def parse_xml_to_geojson(xml_file):
    """Convert the Placemark points of a KML-style XML file to a GeoJSON dict.

    Every ``Placemark`` element below the document root becomes one GeoJSON
    ``Point`` feature. The point is built from the first two comma-separated
    values (longitude, latitude) of the placemark's first ``coordinates``
    descendant. Tags are matched by local name, so documents that declare an
    XML namespace are handled the same way as un-namespaced documents.

    Args:
        xml_file: Path (or file object) accepted by ``ET.parse``.

    Returns:
        dict: A GeoJSON ``FeatureCollection``. Each feature stores the text of
        the placemark's direct ``name`` child under ``properties["name"]``
        ("N/A" when the element is missing or has no text). Placemarks
        without any coordinate text are skipped.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        ValueError: If a coordinate component is not a valid number.
        IndexError: If the coordinate text has fewer than two components.
    """

    def local_name(element):
        # ElementTree spells a namespaced tag as "{uri}local"; keep "local".
        tag = element.tag
        return tag.rsplit('}', 1)[-1] if isinstance(tag, str) else None

    root = ET.parse(xml_file).getroot()
    features = []

    for placemark in root.iter():
        # Only descendants of the root are considered, never the root itself.
        if placemark is root or local_name(placemark) != 'Placemark':
            continue

        # The name must be a direct child of the placemark.
        name_element = next(
            (child for child in placemark if local_name(child) == 'name'),
            None,
        )
        name = name_element.text if name_element is not None else None
        if name is None:
            name = 'N/A'

        # First <coordinates> element anywhere below the placemark.
        coordinates_element = next(
            (
                node for node in placemark.iter()
                if node is not placemark and local_name(node) == 'coordinates'
            ),
            None,
        )
        raw_coordinates = (
            coordinates_element.text if coordinates_element is not None else None
        )
        if raw_coordinates is None or not raw_coordinates.strip():
            # A placemark without geometry cannot become a Point feature.
            continue

        coordinates = raw_coordinates.split(',')
        longitude = float(coordinates[0])
        latitude = float(coordinates[1])

        features.append({
            "type": "Feature",
            "geometry": {
                "type": "Point",
                # GeoJSON positions are ordered [longitude, latitude].
                "coordinates": [longitude, latitude]
            },
            "properties": {
                "name": name
            }
        })

    return {
        "type": "FeatureCollection",
        "features": features
    }

def convert_folder_xml_to_geojson(folder_path, output_folder):
    """Convert every ``.xml`` file in a folder to a ``.geojson`` file.

    Each file in ``folder_path`` whose name ends with ``.xml`` is passed to
    ``parse_xml_to_geojson`` and the result is written, indented by two
    spaces, to ``output_folder`` under the same base name with a
    ``.geojson`` extension.

    Args:
        folder_path: Directory that is scanned (non-recursively) for XML files.
        output_folder: Directory that receives the GeoJSON files; it is
            created when it does not exist yet.
    """
    # Create the destination directory on first use.
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for entry in os.listdir(folder_path):
        # Anything that is not an XML file is ignored.
        if not entry.endswith(".xml"):
            continue

        source_path = os.path.join(folder_path, entry)
        collection = parse_xml_to_geojson(source_path)

        # Same base name, new extension, inside the output directory.
        stem, _extension = os.path.splitext(entry)
        target_path = os.path.join(output_folder, stem + '.geojson')
        with open(target_path, 'w') as handle:
            json.dump(collection, handle, indent=2)

# Example usage: convert every .xml file in the annotations folder and write
# the resulting .geojson files into 'newgeojson'. Both paths are relative, so
# they resolve against the current working directory.
# NOTE(review): a later cell in this notebook writes files with the same
# <base name>.geojson names into the same output folder, so the files produced
# here get overwritten -- confirm this call is still needed.
input_folder = 'datasetsinyalepaper/TCGA_BRCA_Filtered/Annotations'
output_folder = 'newgeojson'
convert_folder_xml_to_geojson(input_folder, output_folder)

In [3]:
import xml.etree.ElementTree as ET
import json
import os

def xml_to_geojson(xml_file):
    """Convert the annotated regions of an XML file into a GeoJSON dict.

    Every ``Region`` found below an ``Annotation`` element becomes one GeoJSON
    ``Polygon`` feature whose single ring is made of the region's ``Vertex``
    elements (attributes ``X``, ``Y`` and ``Z``), closed by repeating the
    first vertex at the end.

    Args:
        xml_file: Path (or file object) accepted by ``ET.parse``.

    Returns:
        dict: A GeoJSON ``FeatureCollection``. Each feature's properties hold
        the region's attributes as strings, or ``None`` for attributes the
        region does not define. Regions without any vertex are skipped,
        because no ring can be built from them.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        KeyError: If a vertex lacks its ``X`` or ``Y`` attribute.
        ValueError: If a vertex attribute is not a valid number.
    """
    # Region attributes copied into the feature properties, in output order.
    region_property_names = (
        "Id",
        "Type",
        "Zoom",
        "Selected",
        "ImageLocation",
        "ImageFocus",
        "Length",
        "Area",
        "LengthMicrons",
        "AreaMicrons",
        "Text",
        "NegativeROA",
        "InputRegionId",
        "Analyze",
        "DisplayId",
    )

    root = ET.parse(xml_file).getroot()
    annotations = []

    for annotation in root.findall(".//Annotation"):
        for region in annotation.findall(".//Region"):
            coordinates = [
                [
                    float(vertex.attrib['X']),
                    float(vertex.attrib['Y']),
                    # Z is optional; treat a missing value as 0.
                    float(vertex.attrib.get('Z', 0)),
                ]
                for vertex in region.findall(".//Vertex")
            ]

            # A region without vertices has no first point to close the ring
            # with, so it is skipped instead of raising IndexError.
            if not coordinates:
                continue

            # Close the ring with a copy of the first point, so the first and
            # last positions are equal but do not share one list object.
            coordinates.append(list(coordinates[0]))

            annotations.append({
                "type": "Feature",
                "geometry": {
                    "type": "Polygon",
                    "coordinates": [coordinates]
                },
                "properties": {
                    key: region.attrib.get(key)
                    for key in region_property_names
                }
            })

    return {
        "type": "FeatureCollection",
        "features": annotations
    }

def convert_folder_to_geojson(input_folder, output_folder):
    """Convert each ``.xml`` file of a folder to GeoJSON and report progress.

    Files in ``input_folder`` whose name ends with ``.xml`` are converted
    with ``xml_to_geojson``; the result is written with a four-space indent
    to ``output_folder`` as ``<base name>.geojson`` and one line is printed
    per converted file.

    Args:
        input_folder: Directory that is scanned (non-recursively) for XML files.
        output_folder: Directory that receives the GeoJSON files; it is
            created when it does not exist yet.
    """
    # Make sure there is a directory to write into.
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for source_name in os.listdir(input_folder):
        # Skip everything that is not an XML file.
        if not source_name.endswith(".xml"):
            continue

        collection = xml_to_geojson(os.path.join(input_folder, source_name))

        base_name = os.path.splitext(source_name)[0]
        target_name = f"{base_name}.geojson"
        target_path = os.path.join(output_folder, target_name)

        with open(target_path, "w") as handle:
            json.dump(collection, handle, indent=4)

        print(f"Converted {source_name} to {target_name}")

# Run the conversion for the annotation folder. Both paths are relative, so
# they resolve against the current working directory.
input_folder = "datasetsinyalepaper/TCGA_BRCA_Filtered/Annotations"  # Replace with your input folder path
output_folder = "newgeojson"  # Replace with your output folder path

# Writes one .geojson file per .xml file and prints a line for each of them.
convert_folder_to_geojson(input_folder, output_folder)


Converted TCGA-D8-A1JB-01Z-00-DX1.xml to TCGA-D8-A1JB-01Z-00-DX1.geojson
Converted TCGA-AR-A1AN-01Z-00-DX1.xml to TCGA-AR-A1AN-01Z-00-DX1.geojson
Converted TCGA-A8-A075-01Z-00-DX1.xml to TCGA-A8-A075-01Z-00-DX1.geojson
Converted TCGA-E2-A158-01Z-00-DX1.xml to TCGA-E2-A158-01Z-00-DX1.geojson
Converted TCGA-E9-A22A-01Z-00-DX1.xml to TCGA-E9-A22A-01Z-00-DX1.geojson
Converted TCGA-B6-A0IK-01Z-00-DX1.xml to TCGA-B6-A0IK-01Z-00-DX1.geojson
Converted TCGA-E2-A1B0-01Z-00-DX1.xml to TCGA-E2-A1B0-01Z-00-DX1.geojson
Converted TCGA-C8-A275-01Z-00-DX1.xml to TCGA-C8-A275-01Z-00-DX1.geojson
Converted TCGA-BH-A0AW-01Z-00-DX1.xml to TCGA-BH-A0AW-01Z-00-DX1.geojson
Converted TCGA-A8-A0A2-01Z-00-DX1.xml to TCGA-A8-A0A2-01Z-00-DX1.geojson
Converted TCGA-A2-A0CX-01Z-00-DX1.xml to TCGA-A2-A0CX-01Z-00-DX1.geojson
Converted TCGA-BH-A0B4-01Z-00-DX1.xml to TCGA-BH-A0B4-01Z-00-DX1.geojson
Converted TCGA-EW-A1P4-01Z-00-DX1.xml to TCGA-EW-A1P4-01Z-00-DX1.geojson
Converted TCGA-EW-A1PF-01Z-00-DX1.xml to TCGA-EW-A1