# Leveling Arrays

Data source: UCSB geodesy level-line regions index (URL below).

https://geodesy.projects.geol.ucsb.edu/level_lines/level_regions.html 

In [12]:
import csv
import html as html_module  # To decode entities like &deg;
import math
import re
import time
import urllib.request
from html.parser import HTMLParser

# Directory that holds the index page and every per-site subpage.
BASE_URL = "https://geodesy.projects.geol.ucsb.edu/level_lines/"
# Index page whose links point at the individual subpages.
MAIN_PAGE = f"{BASE_URL}level_regions.html"

# --- Fetch URL content ---
def fetch_url(url, timeout=30):
    """Download *url* and return its body as text.

    Args:
        url: Address to fetch (anything ``urllib.request.urlopen`` accepts).
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot hang the whole run.

    Returns:
        The response body decoded to ``str``. The charset declared in the
        Content-Type header is honoured; UTF-8 is used when none is declared
        or the declared name is unknown. Undecodable bytes are dropped.

    Raises:
        urllib.error.URLError: On network/HTTP failures (including timeouts).
    """
    with urllib.request.urlopen(url, timeout=timeout) as resp:
        raw = resp.read()
        # Decoding e.g. Latin-1 pages as UTF-8 would silently drop bytes such
        # as the degree sign (0xB0), so prefer the server-declared charset.
        charset = resp.headers.get_content_charset() or "utf-8"
    try:
        return raw.decode(charset, errors="ignore")
    except LookupError:
        # The server declared a charset name Python does not know.
        return raw.decode("utf-8", errors="ignore")

# --- Parse subpage links from main page ---
class SubpageLinkParser(HTMLParser):
    """Collect the ``href`` of every ``<a>`` tag that looks like a subpage.

    A link is kept when its target starts with ``"X"`` and ends with
    ``".html"`` (case-insensitive suffix). Matching targets are appended to
    ``self.links`` in document order; duplicates are not removed here.
    """

    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        """Record the link target of an anchor tag if it matches the pattern."""
        if tag.lower() != 'a':
            return
        # A valueless attribute (``<a href>``) is reported as ('href', None);
        # normalise it to '' so the string checks below cannot raise.
        href = (dict(attrs).get('href') or '').strip()
        if href.startswith("X") and href.lower().endswith(".html"):
            self.links.append(href)

def extract_subpage_links(html):
    """Return the unique subpage links found in *html*, in document order.

    Args:
        html: Markup of the index page.

    Returns:
        A list of link targets accepted by ``SubpageLinkParser``, each
        appearing once, ordered by first appearance.
    """
    parser = SubpageLinkParser()
    parser.feed(html)
    # dict.fromkeys de-duplicates while keeping first-seen order; a set would
    # reorder the links differently on every run (string hash randomisation),
    # making the processing order and the CSV row order non-reproducible.
    return list(dict.fromkeys(parser.links))

# --- Clean HTML line (strip tags, decode entities) ---
class TextExtractor(HTMLParser):
    """HTML parser that keeps only the text found between tags."""

    def __init__(self):
        super().__init__()
        # Text fragments in the order the parser reports them.
        self.result = []

    def handle_data(self, data):
        """Store one fragment of text reported by the parser."""
        self.result += [data]

    def get_text(self):
        """Return all stored fragments concatenated into one string."""
        text = ''
        return text.join(self.result)

def clean_html_line(line):
    """Return the text of one HTML line with tags removed.

    The line is run through ``TextExtractor``; the collected text is trimmed
    of surrounding whitespace and then passed through ``html.unescape``.
    """
    extractor = TextExtractor()
    extractor.feed(line)
    visible_text = extractor.get_text()
    trimmed = visible_text.strip()
    return html_module.unescape(trimmed)

# --- DMS to Decimal Degrees ---
def dms_to_dd(deg, minutes, seconds, direction=None):
    """Convert degrees/minutes/seconds to signed decimal degrees.

    Args:
        deg: Degrees (number or numeric string). May be negative, in which
            case the whole angle is treated as negative.
        minutes: Arc-minutes (number or numeric string).
        seconds: Arc-seconds (number or numeric string).
        direction: Optional hemisphere letter; ``'S'`` or ``'W'`` (any case)
            makes the result negative.

    Returns:
        The angle in decimal degrees, rounded to 6 decimal places. The result
        is negative when *deg* is negative or *direction* is S/W.

    Raises:
        ValueError: If a component cannot be converted to ``float``.
    """
    degrees = float(deg)
    # Minutes and seconds extend the magnitude of the angle. Adding them to a
    # negative degree value directly would move the result toward zero
    # (-117 deg 30 min would become -116.5 instead of -117.5).
    magnitude = abs(degrees) + float(minutes) / 60 + float(seconds) / 3600
    # copysign also detects "-0", which a plain `< 0` comparison would miss.
    negative = math.copysign(1.0, degrees) < 0
    if direction and direction.upper() in ('S', 'W'):
        negative = True
    return round(-magnitude if negative else magnitude, 6)

# --- Parse a block of text for DMS coordinates ---
# A hemisphere marker: a single letter or the full word, never a letter that
# is merely part of a longer word (the "e" ending "Longitude", the "S" of
# "Station", the "E" of "Elevation", ...).
_HEMISPHERE = r"(?<![A-Za-z])(?:north|south|east|west|[nsew])(?![A-Za-z])"

# One degrees/minutes/seconds coordinate with an optional hemisphere marker
# either before or after it.
_DMS_PATTERN = re.compile(
    r"(?:(?P<lead>" + _HEMISPHERE + r")\s*)?"
    # The lookbehind stops the degrees from starting inside a longer number
    # (e.g. reading "17" out of "117").
    r"(?<![0-9.])(?P<deg>[0-9]{1,3})\s*°\s*"
    r"(?P<min>[0-9]{1,2})\s*'\s*"
    r'(?P<sec>[0-9]{1,2}(?:\.[0-9]+)?)\s*"?'
    # Only look for a trailing marker when there was no leading one, so that
    # in 'N 34° 12\' 45" W 117° ...' the W is left for the next coordinate.
    r"(?(lead)|(?:\s*(?P<trail>" + _HEMISPHERE + r"))?)",
    re.IGNORECASE,
)


def _pick_coordinate(hits, hemispheres, max_degrees, avoid=None):
    """Return the best (match, hemisphere) pair for one axis, or None.

    A pair is usable when its hemisphere letter is one of *hemispheres* or is
    missing, and its degree value does not exceed *max_degrees*. Explicitly
    marked pairs win over unmarked ones; among unmarked pairs, *avoid* (the
    pair already used for the other axis) is chosen only as a last resort.
    """
    usable = [
        hit for hit in hits
        if hit[1] in ("", *hemispheres) and int(hit[0].group("deg")) <= max_degrees
    ]
    marked = [hit for hit in usable if hit[1]]
    if marked:
        return marked[0]
    unused = [hit for hit in usable if hit is not avoid]
    candidates = unused or usable
    return candidates[0] if candidates else None


def parse_lat_lon_block(text):
    """Extract a latitude and a longitude in DMS notation from *text*.

    Coordinates look like ``34° 12' 45"`` (decimal seconds allowed) with an
    optional hemisphere (``N``/``S``/``E``/``W`` or the full word) before or
    after. A coordinate marked N/S is only ever used as latitude and one
    marked E/W only as longitude. An unmarked coordinate may serve as either,
    so a text holding a single unmarked coordinate yields both values; with
    two unmarked coordinates the first usable one is the latitude and the
    other the longitude. Latitudes above 90° and longitudes above 180° are
    rejected.

    Args:
        text: Plain text (tags already stripped, entities decoded).

    Returns:
        ``(lat_dd, lon_dd)`` in decimal degrees; either element is ``None``
        when no suitable coordinate was found. A latitude without a marker is
        taken as North and a longitude without a marker as West.
    """
    hits = []
    for match in _DMS_PATTERN.finditer(text):
        word = match.group("lead") or match.group("trail") or ""
        hits.append((match, word[:1].upper()))

    lat_hit = _pick_coordinate(hits, "NS", 90)
    lon_hit = _pick_coordinate(hits, "EW", 180, avoid=lat_hit)

    lat_dd = lon_dd = None
    if lat_hit is not None:
        match, hemisphere = lat_hit
        lat_dd = dms_to_dd(
            match.group("deg"), match.group("min"), match.group("sec"),
            hemisphere or 'N',
        )
    if lon_hit is not None:
        match, hemisphere = lon_hit
        lon_dd = dms_to_dd(
            match.group("deg"), match.group("min"), match.group("sec"),
            hemisphere or 'W',
        )

    return lat_dd, lon_dd

# --- Extract lat/lon from messy HTML ---
def extract_lat_lon_from_html(html):
    """Scan a page line by line for its latitude and longitude.

    A line whose cleaned text contains 'Latitude' (resp. 'Longitude') is
    parsed for a coordinate; if that line holds none, the following line is
    tried. The first value found for each axis is kept, and scanning stops as
    soon as both are known.

    Returns:
        ``(lat_dd, lon_dd)``; an element is ``None`` if it was never found.
    """
    raw_lines = html.splitlines()
    total = len(raw_lines)
    latitude = longitude = None

    for idx, raw in enumerate(raw_lines):
        current = clean_html_line(raw)
        # The value often sits on the line after its label.
        following = clean_html_line(raw_lines[idx + 1]) if idx + 1 < total else ""

        if latitude is None and 'Latitude' in current:
            latitude = parse_lat_lon_block(current)[0]
            if latitude is None and following:
                latitude = parse_lat_lon_block(following)[0]

        if longitude is None and 'Longitude' in current:
            longitude = parse_lat_lon_block(current)[1]
            if longitude is None and following:
                longitude = parse_lat_lon_block(following)[1]

        if latitude is not None and longitude is not None:
            break

    return latitude, longitude

# --- Main script ---
def main():
    """Scrape every subpage's coordinates and write them to a CSV file.

    Fetches the index page, visits each linked subpage, extracts its
    latitude/longitude and writes one row per successfully processed page to
    ``level_lines_lat_lon_dd.csv``. A page that raises is reported and
    skipped; a coordinate that is not found is written as "N/A".
    """
    print("Fetching main page…")
    subpages = extract_subpage_links(fetch_url(MAIN_PAGE))
    print(f"Found {len(subpages)} subpages.")

    rows = []
    for page in subpages:
        page_url = BASE_URL + page
        print(f"Processing: {page}")
        try:
            latitude, longitude = extract_lat_lon_from_html(fetch_url(page_url))
            rows.append({
                "name": page,
                "url": page_url,
                "latitude_dd": "N/A" if latitude is None else latitude,
                "longitude_dd": "N/A" if longitude is None else longitude,
            })
        except Exception as exc:
            # One bad page must not abort the whole run.
            print(f"  Error processing {page}: {exc}")
        time.sleep(0.2)  # Be nice to the server

    # Write to CSV
    columns = ["name", "url", "latitude_dd", "longitude_dd"]
    with open("level_lines_lat_lon_dd.csv", "w", newline='', encoding="utf-8") as out:
        writer = csv.DictWriter(out, fieldnames=columns)
        writer.writeheader()
        writer.writerows(rows)

    print("Done! Output written to level_lines_lat_lon_dd.csv")

# Entry point: run the scraper only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()



Fetching main page…
Found 63 subpages.
Processing: X0004_ANZA.html
Processing: X0027_JUNCAL.html
Processing: X0031_LEWIS_CREEK.html
Processing: X0041_MUSTANG_GRADE.html
Processing: X0005_ARTISTS_DRIVE.html
Processing: X0008_BIG_TUJUNGA.html
Processing: X0016_DAY_CANYON.html
Processing: X0030_KOEHN_LAKE.html
Processing: X0002_47TH_STREET_EAST.html
Processing: X0037_MESA_VALLEY.html
Processing: X0064_YMCA.html
Processing: X0012_CAMP_DIX.html
Processing: X0011_CAMERON.html
Processing: X0045_PAINTED_CANYON.html
Processing: X0042_NEBO.html
Processing: X0054_SB_VALLEY_COLLEGE.html
Processing: X0053_SAN_JUAN_BAUTISTA.html
Processing: X0025_HECTOR.html
Processing: X0050_PORTESUELO.html
Processing: X0038_MINA.html
Processing: X0058_THOUSAND_PALMS.html
Processing: X0026_JPL.html
Processing: X0051_SANTA_ANITA_CANYON.html
Processing: X0019_FISH_LAKE_VALLEY.html
Processing: X0010_CABALLO.html
Processing: X0036_McGEE_CREEK.html
Processing: X0022_GRAND_TETON.html
Processing: X0062_WALLACE_CREEK.html


In [13]:
# Convert csv to geojson
import csv
import json

def csv_to_geojson(csv_path, geojson_path):
    """Convert a CSV of named points into a GeoJSON FeatureCollection.

    Args:
        csv_path: Input CSV with ``latitude_dd`` and ``longitude_dd`` columns
            (decimal degrees) and optional ``name`` / ``url`` columns.
        geojson_path: Output file; overwritten if it exists.

    Rows are skipped when a coordinate is missing, not numeric (e.g. "N/A"),
    not finite, or outside the valid range (latitude -90..90, longitude
    -180..180). Every remaining row becomes a Point feature carrying ``name``
    and ``url`` as properties.
    """
    features = []

    with open(csv_path, newline='', encoding='utf-8') as csvfile:
        for row in csv.DictReader(csvfile):
            try:
                lat = float(row["latitude_dd"])
                lon = float(row["longitude_dd"])
            except (ValueError, KeyError, TypeError):
                # TypeError: a short row makes DictReader fill the missing
                # fields with None, and float(None) raises TypeError.
                continue  # Skip rows with invalid coordinates

            # Chained comparisons are False for NaN and fail for +/-inf, so
            # this also rejects values that cannot be written as valid JSON.
            if not (-90 <= lat <= 90 and -180 <= lon <= 180):
                continue

            features.append({
                "type": "Feature",
                "geometry": {
                    "type": "Point",
                    "coordinates": [lon, lat],  # GeoJSON uses [lon, lat]
                },
                "properties": {
                    "name": row.get("name", ""),
                    "url": row.get("url", ""),
                },
            })

    geojson = {
        "type": "FeatureCollection",
        "features": features,
    }

    with open(geojson_path, "w", encoding='utf-8') as f:
        json.dump(geojson, f, indent=2)

    print(f"✅ GeoJSON written to: {geojson_path}")

# Example usage: convert level_lines_lat_lon_dd.csv (read from the current
# working directory) into level_lines.geojson.
csv_to_geojson("level_lines_lat_lon_dd.csv", "level_lines.geojson")


✅ GeoJSON written to: level_lines.geojson
