This notebook shows an example of running drip_pipeline.

Currently, this process grabs the latest releases of both the American Rivers Dam Removal Database and the USGS Dam Removal Science Database. It then combines the two datasets into a complete list of dam removals, which supports the Dam Removal Information Portal (DRIP) web application.

In [1]:
import pandas as pd
import numpy as np
from pydrip import drip_dam
from pydrip import drip_sources

def get_data():
    '''
    Description
    -----------
    Fetch the latest American Rivers data and the latest Dam Removal Science data.
    For each source the data url is looked up through drip_sources and then
    read with the matching drip_sources reader.

    Output
    -----------
    Tuple of (american_rivers_df, dam_removal_science_df), in that order
    '''
    #American Rivers: look up the url of the latest data, then read it
    american_rivers_df = drip_sources.read_american_rivers(
        drip_sources.get_american_rivers_data_url()
    )

    #Dam Removal Science: look up the url of the latest data, then read it
    dam_removal_science_df = drip_sources.read_science_data(
        drip_sources.get_science_data_url()
    )

    return american_rivers_df, dam_removal_science_df

def build_drip_dams_table(dam_removal_science_df, american_rivers_df):
    '''
    Description
    -----------
    Build one record per dam removal by combining the Dam Removal Science
    data with the American Rivers data.

    Dams in the science database are described from the science data first,
    with missing values filled from American Rivers. Dams that only exist in
    American Rivers are described from American Rivers alone.

    Parameters
    -----------
    dam_removal_science_df: Dam Removal Science data (as returned by get_data)
    american_rivers_df: American Rivers data (as returned by get_data)

    Output
    -----------
    List of dictionaries, one per dam, each holding the attributes of the
    drip_dam.Dam object built for that dam. Records are returned whether or
    not a geometry is present.
    '''
    #Select fields that contain dam information or american rivers id
    dam_science_df = drip_sources.get_science_subset(dam_removal_science_df, target='Dam')

    all_dam_info = []

    #For each dam in science database find best available data for the dam, first looking in science database and if null look in American Rivers
    for dam in dam_science_df.itertuples():
        removal_data = drip_dam.Dam(dam_id=dam.DamAccessionNumber)
        removal_data.science_data(dam)
        removal_data.update_missing_data(american_rivers_df)
        removal_data.add_geometry()
        all_dam_info.append(vars(removal_data))

    #For each dam only in American Rivers database, get AR data
    ar_only_dams = drip_sources.get_ar_only_dams(american_rivers_df, dam_science_df)
    for dam in ar_only_dams.itertuples():
        removal_data = drip_dam.Dam(dam_id=dam.AR_ID)
        removal_data.ar_dam_data(dam)
        removal_data.add_geometry()
        all_dam_info.append(vars(removal_data))

    #NOTE: an earlier draft also built a DataFrame limited to records with a
    #geometry (intended for a GeoDataFrame / csv export) but never returned it.
    #That unused step raised a KeyError when no dams were produced, so it was
    #dropped; callers receive the full, unfiltered list exactly as before.
    return all_dam_info


def main():
    '''
    Description
    -----------
    Run the pipeline end to end: load both source datasets, subset the
    science data to dam fields, and build the combined dam records.

    Output
    -----------
    Tuple of (drip_dams, dam_science_df, american_rivers_df):
    the combined dam records, the dam-only subset of the science data,
    and the American Rivers data
    '''
    #Load American Rivers and Dam Removal Science data
    ar_df, science_df = get_data()

    #Dam-only view of the science data, returned for follow-up checks
    dam_only_df = drip_sources.get_science_subset(science_df, target='Dam')

    #Combined representation of the DRIP dams
    dam_records = build_drip_dams_table(science_df, ar_df)

    return dam_records, dam_only_df, ar_df


In [2]:
from IPython.display import display
# Lift the display limits so every column and every row is shown
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
# Run the pipeline: combined dam records plus the two dataframes used by the checks below
drip_dams, dam_science_df, american_rivers_df = main()

In [4]:
# Display the combined dam records (a list of dictionaries, one per dam)
drip_dams

[{'_id': '1',
  'dam_source': 'Dam Removal Science',
  'ar_id': 'CA-055',
  'latitude': 38.234,
  'longitude': -121.0266,
  'dam_built_year': 1975,
  'dam_removed_year': 2003,
  'dam_height_ft': 9.84252,
  'dam_name': 'murphy creek dam',
  'stream_name': 'murphy creek',
  'dam_alt_name': ['sparrowk dam'],
  'stream_alt_name': [],
  'from_ar': [],
  'in_drd': 1,
  'science_id': '1',
  'geometry': <shapely.geometry.point.Point at 0x18bf7854488>},
 {'_id': '2',
  'dam_source': 'Dam Removal Science',
  'ar_id': None,
  'latitude': 42.8203,
  'longitude': -80.5081,
  'dam_built_year': None,
  'dam_removed_year': 1938,
  'dam_height_ft': None,
  'dam_name': 'croton dam',
  'stream_name': 'big creek',
  'dam_alt_name': [],
  'stream_alt_name': [],
  'from_ar': [],
  'in_drd': 1,
  'science_id': '2',
  'geometry': <shapely.geometry.point.Point at 0x18bf7868088>},
 {'_id': '3',
  'dam_source': 'Dam Removal Science',
  'ar_id': None,
  'latitude': 43.5676,
  'longitude': -80.6365,
  'dam_built_y

TESTS to CONSIDER

In [5]:
#tests if duplicates are occurring in dam_science_df; this might occur if incorrect columns are selected
def test_dup_dams(dam_science_df):
    '''
    Description
    -----------
    Test if duplicates are occuring in dam_science_df.
    This might occur if incorrect columns are selected in get_science_subset function
    
    Output
    -----------
    True: Duplicates exist, issue needs to be resolved
    False: No Duplicates, this test passes
    '''
    to_list = dam_science_df['DamAccessionNumber'].to_list()
    if len(to_list)==(len(list(set(to_list)))):
        return False
    else:
        return True
    
# Run the duplicate check on the science subset; False means no duplicate accession numbers
test_dup_dams(dam_science_df)

False

In [6]:
#tests if ar_ids are listed in science that are no longer in ar
ar_in_science = dam_science_df['AR_ID'].dropna().to_list()
ar_ids = american_rivers_df['AR_ID'].to_list()
#build the lookup set once so each membership check does not rescan the whole ar_ids list
known_ar_ids = set(ar_ids)
[i for i in ar_in_science if i not in known_ar_ids]

['OR-027']

In [None]:
# tests if science dam without AR_ID is near an AR dam

In [7]:
# Dam Removal Science records that received extra data from American Rivers
# (their 'from_ar' list is not empty, i.e. the science record had missing data)
selection = [
    record
    for record in drip_dams
    if record['dam_source'] == 'Dam Removal Science' and len(record['from_ar']) > 0
]

# Write the selected records to csv for review
df = pd.DataFrame(selection)
df.to_csv('dams_missing_info.csv', index=False, sep=',')