In [6]:
#!pip install fastkml

Collecting fastkml
  Downloading fastkml-1.1.0-py3-none-any.whl.metadata (8.0 kB)
Collecting pygeoif>=1.5 (from fastkml)
  Downloading pygeoif-1.5.1-py3-none-any.whl.metadata (14 kB)
Downloading fastkml-1.1.0-py3-none-any.whl (107 kB)
Downloading pygeoif-1.5.1-py3-none-any.whl (28 kB)
Installing collected packages: pygeoif, fastkml
Successfully installed fastkml-1.1.0 pygeoif-1.5.1


In [18]:
!pip install lxml



In [19]:
import xml.etree.ElementTree as ET
import csv
import os
import sys
from xml.dom import minidom

def extract_kml_to_csv(kml_path, csv_path):
    print(f"Attempting to read KML file from: {os.path.abspath(kml_path)}")
    
    # Check if file exists and is readable
    if not os.path.exists(kml_path):
        print(f"ERROR: The KML file does not exist at {os.path.abspath(kml_path)}")
        return False
    
    if not os.access(kml_path, os.R_OK):
        print(f"ERROR: The KML file exists but is not readable. Check permissions.")
        return False
    
    # Analyze the file structure first
    print("Analyzing KML file structure...")
    try:
        with open(kml_path, 'r', encoding='utf-8') as f:
            content = f.read()
            print(f"File size: {len(content)} bytes")
            
            # Print a sample of the content to help debug
            sample = content[:500]
            print(f"First 500 characters of file: {sample}")
            
            # Check if it contains typical KML elements
            has_kml_tag = '<kml' in content
            has_placemark = '<Placemark' in content
            has_document = '<Document' in content
            has_folder = '<Folder' in content
            
            print(f"Has KML tag: {has_kml_tag}")
            print(f"Has Placemark tags: {has_placemark}")
            print(f"Has Document tag: {has_document}")
            print(f"Has Folder tag: {has_folder}")
        
        # Try parsing with minidom first to better understand the structure
        dom = minidom.parse(kml_path)
        
        # Print all element names to understand the structure
        element_names = set()
        for elem in dom.getElementsByTagName("*"):
            element_names.add(elem.tagName)
        
        print("Elements found in the file:")
        print(', '.join(element_names))
        
        # Count Placemarks
        placemarks = dom.getElementsByTagName("Placemark")
        print(f"Found {len(placemarks)} Placemarks using DOM parser")
        
        # Now parse with ElementTree for data extraction
        tree = ET.parse(kml_path)
        root = tree.getroot()
        
        # Try to determine the namespace
        root_tag = root.tag
        print(f"Root tag: {root_tag}")
        
        # Define possible namespaces
        namespaces = {
            'default': {},
            'kml': {'kml': 'http://www.opengis.net/kml/2.2'},
            'kml21': {'kml': 'http://earth.google.com/kml/2.1'},
            'kml22': {'kml': 'http://www.opengis.net/kml/2.2'},
            'atom': {'atom': 'http://www.w3.org/2005/Atom'}
        }
        
        # Open CSV file for writing
        with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['Stadium Name', 'Team', 'Address', 'City', 'State', 'Zip Code', 'Full Address']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            
            # Try each namespace to find placemarks
            placemarks = []
            used_ns = None
            
            for ns_name, ns in namespaces.items():
                # Try to find Placemarks at different levels of the hierarchy
                for xpath in [
                    './/Placemark',
                    './/kml:Placemark',
                    './/Document/Placemark',
                    './/kml:Document/kml:Placemark',
                    './/Folder/Placemark',
                    './/kml:Folder/kml:Placemark'
                ]:
                    try:
                        found = root.findall(xpath, ns)
                        if found:
                            placemarks = found
                            used_ns = ns
                            print(f"Found {len(placemarks)} Placemarks using XPath '{xpath}' with namespace {ns_name}")
                            break
                    except Exception as e:
                        pass  # Ignore errors for invalid XPaths
                
                if placemarks:
                    break
            
            if not placemarks:
                print("No Placemarks found in the KML file using any standard method.")
                
                # Last resort: try to find all elements that might be Placemarks
                print("Trying to manually traverse the XML structure...")
                
                def find_placemarks(element, path=""):
                    found = []
                    new_path = path + "/" + element.tag.split("}")[-1]
                    
                    # Check if this element looks like a Placemark
                    children_tags = [child.tag.split("}")[-1] for child in element]
                    if "name" in children_tags and ("address" in children_tags or "ExtendedData" in children_tags):
                        print(f"Found potential Placemark-like element at {new_path}")
                        found.append(element)
                    
                    # Recursively check children
                    for child in element:
                        found.extend(find_placemarks(child, new_path))
                    
                    return found
                
                placemarks = find_placemarks(root)
                print(f"Found {len(placemarks)} potential Placemark-like elements")
            
            if not placemarks:
                print("Could not find any Placemark elements in the file.")
                return False
            
            # Process each Placemark
            processed_count = 0
            for placemark in placemarks:
                stadium_data = {
                    'Stadium Name': '',
                    'Team': '',
                    'Address': '',
                    'City': '',
                    'State': '',
                    'Zip Code': '',
                    'Full Address': ''
                }
                
                # Debug info
                print(f"\nProcessing Placemark {processed_count + 1}:")
                
                # Try different ways to get the name
                for name_path in ['name', 'kml:name', './/name', './/kml:name']:
                    try:
                        name_elem = placemark.find(name_path, used_ns)
                        if name_elem is not None and name_elem.text:
                            stadium_data['Stadium Name'] = name_elem.text
                            print(f"Found name: {name_elem.text}")
                            break
                    except:
                        continue
                
                # Try different ways to get the address
                for addr_path in ['address', 'kml:address', './/address', './/kml:address']:
                    try:
                        addr_elem = placemark.find(addr_path, used_ns)
                        if addr_elem is not None and addr_elem.text:
                            stadium_data['Full Address'] = addr_elem.text
                            print(f"Found address: {addr_elem.text}")
                            break
                    except:
                        continue
                
                # Try to get ExtendedData
                extended_data = None
                for ed_path in ['ExtendedData', 'kml:ExtendedData', './/ExtendedData', './/kml:ExtendedData']:
                    try:
                        ed = placemark.find(ed_path, used_ns)
                        if ed is not None:
                            extended_data = ed
                            print("Found ExtendedData")
                            break
                    except:
                        continue
                
                if extended_data is not None:
                    # Try different ways to get Data elements
                    data_elements = []
                    for data_path in ['Data', 'kml:Data', './/Data', './/kml:Data']:
                        try:
                            elements = extended_data.findall(data_path, used_ns)
                            if elements:
                                data_elements = elements
                                print(f"Found {len(elements)} Data elements")
                                break
                        except:
                            continue
                    
                    # Process each Data element
                    for data in data_elements:
                        try:
                            data_name = data.get('name')
                            
                            # Try different ways to get the value
                            value_text = None
                            for value_path in ['value', 'kml:value', './/value', './/kml:value']:
                                try:
                                    value_elem = data.find(value_path, used_ns)
                                    if value_elem is not None and value_elem.text is not None:
                                        value_text = value_elem.text
                                        break
                                except:
                                    continue
                            
                            if data_name and value_text:
                                print(f"Data: {data_name} = {value_text}")
                                
                                if data_name == 'Team':
                                    stadium_data['Team'] = value_text
                                elif data_name == 'addresss' or data_name == 'address' or data_name == 'Address':
                                    stadium_data['Address'] = value_text
                                elif data_name == 'City':
                                    stadium_data['City'] = value_text
                                elif data_name == 'State':
                                    stadium_data['State'] = value_text.strip()
                                elif data_name == 'Zip Code' or data_name == 'Zip' or data_name == 'ZipCode':
                                    stadium_data['Zip Code'] = value_text
                        except Exception as e:
                            print(f"Error processing Data element: {e}")
                
                # If we found any data, write it to the CSV
                if any(stadium_data.values()):
                    writer.writerow(stadium_data)
                    processed_count += 1
                    print(f"Wrote data for {stadium_data['Stadium Name']}")
                else:
                    print("No data found for this Placemark")
            
            print(f"\nSuccessfully processed {processed_count} out of {len(placemarks)} Placemarks")
            print(f"Data written to {os.path.abspath(csv_path)}")
            
            if processed_count == 0:
                print("WARNING: No data was written to the CSV file!")
                return False
                
            return True
    
    except ET.ParseError as pe:
        print(f"Error parsing XML/KML file: {pe}")
        print("This might not be a valid KML file or it has syntax errors.")
        return False
    except Exception as e:
        print(f"Unexpected error processing KML file: {e}")
        import traceback
        traceback.print_exc()
        return False

def main():
    # Try different possible paths for the KML file
    possible_paths = [
        "merged/data/temp_unzipped/doc.kml",
        "doc.kml",
        "data/doc.kml",
        "temp_unzipped/doc.kml",
        # Add more potential paths if needed
    ]
    
    csv_file = "49ers_fan_chapters.csv"
    
    # Allow command line argument for the KML path
    if len(sys.argv) > 1:
        possible_paths.insert(0, sys.argv[1])
    
    success = False
    for path in possible_paths:
        print(f"Trying path: {path}")
        if os.path.exists(path):
            success = extract_kml_to_csv(path, csv_file)
            if success:
                break
    
    if not success:
        print("\nCould not successfully process any of the tried KML paths.")
        print("Please provide the correct path to the KML file as a command line argument:")
        print(f"python {sys.argv[0]} path/to/your/kml/file.kml")

if __name__ == "__main__":
    main()

Trying path: --f=/Users/alex.liss/Library/Jupyter/runtime/kernel-v3452aeb21f01dc22f1ba4a44eaf93aa20d3e82e83.json
Trying path: merged/data/temp_unzipped/doc.kml
Trying path: doc.kml
Trying path: data/doc.kml
Trying path: temp_unzipped/doc.kml
Attempting to read KML file from: /Users/alex.liss/Documents/1_DS AI LIVE /NEW MERGED PROJECT/merged/data/temp_unzipped/doc.kml
Analyzing KML file structure...
File size: 1996505 bytes
First 500 characters of file: <?xml version="1.0" encoding="UTF-8"?>
<kml xmlns="http://www.opengis.net/kml/2.2">
  <Document>
    <name>49ers Fan Chapters</name>
    <description/>
    <Style id="icon-1698-BDBDBD-normal">
      <IconStyle>
        <scale>1</scale>
        <Icon>
          <href>images/icon-1.png</href>
        </Icon>
      </IconStyle>
      <LabelStyle>
        <scale>0</scale>
      </LabelStyle>
    </Style>
    <Style id="icon-1698-BDBDBD-highlight">
      <IconStyle>
        <scale>1</scale>
        <
Has KML tag: True
Has Placemark tags: True