## Parse the lat/lon coordinates from the Utah Avalanche Center's web site

In [1]:
import os
from glob import glob
import numpy as np
import matplotlib.pyplot as plt
from urllib.request import Request, urlopen
from shapely.geometry import Point, Polygon
import re
import pandas as pd
import geopandas as gpd
import earthpy as et

# set working dir
os.chdir(os.path.join(et.io.HOME, "earth-analytics"))

## Define Functions

In [2]:
def getHTML(link):
    """
    Captures the HTML source code from a web site.

    Parameters
    ----------
    link : string
        A url to a web site

    Returns
    -------
    r_text : string
        The source code for the HTML page, decoded as UTF-8
    """

    # Send a browser-like User-Agent header along with the request
    q = Request(link)
    q.add_header('User-Agent', 'Mozilla/5.0')

    # Use a context manager so the response (and its underlying connection)
    # is closed even if reading fails; this function is called once per URL
    with urlopen(q) as response:
        r = response.read()

    r_text = r.decode("utf-8")
    return r_text


def getCoords(text):
    """
    Parse the lat/lon coordinates from an avalanche's web page on the Utah Avalanche Center website.
    Sets the coordinates to a missing value of -999 if none are available from the web page.

    The page is searched for a WKT "POINT (first second)" entry. The second
    value is returned as the latitude and the first as the longitude.

    Parameters
    ----------
    text : string
        HTML source code

    Returns
    -------
    lat : string
        Latitude as it appears in the page, or the integer -999 if missing
    lon : string
        Longitude as it appears in the page, or the integer -999 if missing
    """

    missing = (-999, -999)

    # Use regex to pull out lat and lon from HTML (raw string so the escaped
    # parentheses are not treated as invalid string escapes)
    m = re.search(r'wkt":"POINT \((.+?)\)","projection', text)
    if m is None:
        return missing

    # Split the matched text on whitespace into its coordinate values
    coords = m.group(1).split()
    if len(coords) < 2:
        # Malformed point: treat it as missing instead of raising IndexError
        return missing

    # Return the latitude and longitude
    return coords[1], coords[0]


def parseHTML(html, term1, term2):
    """
    Extracts the shortest run of text found between two search terms.
    When nothing matches, the missing value "-999" is returned instead.

    Both terms are concatenated directly into a regular expression, so any
    regex metacharacters they contain are interpreted as such.

    Parameters
    ----------
    html : string
        HTML source
    term1 : string
        Opening search term
    term2 : string
        Closing search term

    Returns
    -------
    text : string
        The text that is between the two search terms, or "-999"
    """

    pattern = term1 + "(.+?)" + term2
    found = re.search(pattern, html)

    # Guard clause: no match means the value is missing
    if found is None:
        return "-999"

    return found.group(1)


def getAviData(url):
    """
    Downloads an avalanche's web page from the Utah Avalanche Center (UAC)
    website and pulls the lat/lon, elevation, slope angle, vertical, and
    aspect values out of its HTML.

    Parameters
    ----------
    url : string
        URL from the UAC website of an avalanche occurrence

    Returns
    -------
    data_list : list
        Contains lat, lon, elevation, slope_angle, vertical, and aspect in that order
    """

    # Fetch the page once and reuse its source for every lookup
    page_source = getHTML(url)

    # Coordinates come from the point entry in the page
    lat, lon = getCoords(page_source)

    # Each stat is searched for as a label div followed by a value div;
    # only the label text differs between them
    value_div_open = '</div>\n        <div class="text_02 mb2">'
    value_div_close = '</div>'
    stat_labels = ('Elevation', 'Slope Angle', 'Vertical', 'Aspect')

    stats = []
    for label in stat_labels:
        stats.append(parseHTML(page_source, label + value_div_open, value_div_close))

    # Order: lat, lon, elevation, slope angle, vertical, aspect
    return [lat, lon] + stats


def containsPoint(latitude, longitude, boundary):
    """
    Determines if a point is within a shapefile boundary

    The point is compared directly against the boundary geometry, so the
    boundary's coordinates must be in the same system as the given
    latitude/longitude (the calling script reprojects it to EPSG 4326).

    Parameters
    ----------
    latitude : float
        Latitude of the point you want to check
    longitude : float
        Longitude of the point you want to check
    boundary : geopandas object
        Dataframe with one shapefile boundary

    Returns
    -------
    boolean : boolean
        Returns True if the point falls within the boundary and returns False if it does not
    """

    # Shapely points are ordered (x, y), i.e. (longitude, latitude)
    lat_lon_point = Point(longitude, latitude)

    # Test the point against the boundary directly. This avoids building a
    # one-row GeoDataFrame with the deprecated {'init': ...} CRS syntax and
    # relying on index alignment between the two frames.
    is_inside = boundary.contains(lat_lon_point)

    # Take the first boundary row by position so the result does not depend
    # on the boundary's index labels
    boolean = bool(is_inside.iloc[0])

    return boolean

In [3]:
# Create data and output file paths
data_fp = os.path.join('data', 'final-project',
                       'cottonwood-heights-utah', 'avalanche-data')
output_fp = "output"

# Get sorted list of final output files if they exist
final_csv_fn = sorted(
    glob(os.path.join(data_fp, output_fp, 'site-cleaned*.csv')))

# Get sorted list of paths for each input csv file
input_csv_fn = sorted(glob(os.path.join(data_fp, '*search*.csv')))

# Open site boundary and reproject to EPSG 4326
site_boundary_path = os.path.join(
    'data', 'final-project', 'cottonwood-heights-utah', 'vector-clip', 'utah-avalanche-clip.shp')
avalanche_boundary = gpd.read_file(site_boundary_path)
avalanche_boundary_4326 = avalanche_boundary.to_crs(epsg=4326)

if len(input_csv_fn) == len(final_csv_fn):
    # Nothing to do. All output files exist
    print("All output files exist")

else:
    # Need to process data to create some or all output files

    # Make sure the output directory exists up front, so the write at the end
    # of each year cannot fail after all of that year's pages were downloaded
    os.makedirs(os.path.join(data_fp, output_fp), exist_ok=True)

    # Columns parsed from each avalanche's web page
    parsed_columns = ['lat', 'lon', 'elevation',
                      'slope_angle', 'vertical', 'aspect']

    # Columns kept from the combined data; 'lat' and 'lon' sit at
    # positions 7 and 8, which the boundary check below relies on
    source_columns = ['Date', 'Specific Region', 'General Region', 'Trigger',
                      'Depth (ft)', 'Width (ft)', 'URL', 'lat', 'lon', 'elevation',
                      'slope_angle', 'vertical', 'aspect']

    # Column names used in the output csv, in the same order as source_columns
    labels = ['date', 'specific_region', 'general_region', 'trigger', 'depth_ft',
              'width_ft', 'url', 'lat', 'lon', 'elevation_ft', 'slope_angle_deg', 'vertical_ft', 'aspect']

    # Loop through each input csv file
    for file in input_csv_fn:
        # Get year from the file name
        # NOTE(review): this fixed character slice only works for one exact
        # path length/layout; confirm against the real input file names
        year = file[93:97]

        # Build final csv output file name that will contain data within the site boundary only
        final_output_fn = os.path.join(
            data_fp, output_fp, "site-cleaned-utah-avalanche-data-" + year + ".csv")

        # Skip years whose final file already exists
        if os.path.exists(final_output_fn):
            print("Final data for " + year +
                  " alreday exists. No processing needed.")
            continue

        # Open input file and get the URLs from it
        avalanche_df = pd.read_csv(file)
        urls_np = avalanche_df.loc[:, 'URL'].values

        # Parse the lat/lon, elevation, slope angle, vertical, and aspect from
        # each URL's HTML source code and build the dataframe in one step
        # rather than growing it one row at a time
        avi_data_df = pd.DataFrame(
            [getAviData(url) for url in urls_np], columns=parsed_columns)

        # Concatenate the avalanche_df with the avi_data_df
        all_avi_data_df = pd.concat([avalanche_df, avi_data_df], axis=1)

        all_avi_data_np = all_avi_data_df[source_columns].values

        # Keep only the rows whose lat/lon pair falls within the site boundary
        avi_in_site_list = [
            list(avi_info) for avi_info in all_avi_data_np
            if containsPoint(float(avi_info[7]), float(avi_info[8]),
                             avalanche_boundary_4326)
        ]

        # Create pandas dataframe from list
        avi_in_site_df = pd.DataFrame.from_records(
            avi_in_site_list, columns=labels)

        # Write the avalanche data that is within the site to a csv file
        avi_in_site_df.to_csv(final_output_fn, index=False)

All output files exist
