# Lake Levels ETL
## Data Scraping, Munging
---

In [2]:
import pandas as pd

from sqlalchemy import create_engine
from sqlalchemy import Column, Date, Integer, String, Float, Table

# import schema from python file
from schema_lake_levels import Base
from schema_lake_levels import Lake_names, Lake_levels

In [3]:
# load the lake metadata table (one row per lake) from the local CSV
lake_id_df = pd.read_csv(filepath_or_buffer="data/water-level/lakes/MCWD_Lake_ID.csv")

# show the first five rows as a quick sanity check
lake_id_df.head(5)

Unnamed: 0,LAKE_NAME,LAKE_ID,latitude,longitude
0,Hiawatha,27001800,44.921034,-93.236141
1,Mother,27002300,44.893298,-93.241013
2,Nokomis,27001900,44.908634,-93.242187
3,Taft,27068300,44.892951,-93.249752
4,Legion,27002400,44.88576,-93.26224


In [4]:
# Guard so this cell can be re-run safely: once 'id' has been moved into the
# index the frame only has three columns left, and assigning four column names
# again would raise a length-mismatch ValueError.
if lake_id_df.index.name != 'id':
    # rename the four CSV columns (positionally) to short lowercase names
    lake_id_df.columns = ['name', 'id', 'lat', 'lng']

    # keep the first row for each lake id, then use 'id' as the index
    lake_id_df = lake_id_df.drop_duplicates(subset='id').set_index('id')

# preview
lake_id_df.head()

Unnamed: 0_level_0,name,lat,lng
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
27001800,Hiawatha,44.921034,-93.236141
27002300,Mother,44.893298,-93.241013
27001900,Nokomis,44.908634,-93.242187
27068300,Taft,44.892951,-93.249752
27002400,Legion,44.88576,-93.26224


In [35]:
# function to convert all lake data (df) for a given lake id into dict format
def lake_to_dict(lake_id, df):
    """Build a nested, JSON-style dict describing one lake.

    Parameters
    ----------
    lake_id
        Lake identifier. It is matched against the ``id`` column of ``df`` and
        used as a row label into the module-level ``lake_id_df``.
    df : pandas.DataFrame
        Measurement table with the columns ``id``, ``elevation``,
        ``read_date`` and ``datum_adj``.

    Returns
    -------
    dict
        ``{'name', 'id', 'location': {'lat', 'lng'}, 'measurements': [...]}``
        where ``measurements`` holds one dict per matching row of ``df`` (in
        row order) with the keys ``elevation``, ``read_date`` and
        ``datum_adj``. The list is empty when no row matches ``lake_id``.

    Notes
    -----
    Name and coordinates are read from the global ``lake_id_df`` (indexed by
    lake id), so that frame must exist before this function is called.
    """
    # filter once and reuse the result instead of re-evaluating the mask per column
    lake_rows = df.loc[df['id'] == lake_id]

    # one dict per measurement row
    measurements_json = [
        {'elevation': elevation, 'read_date': read_date, 'datum_adj': datum_adj}
        for elevation, read_date, datum_adj in zip(
            lake_rows['elevation'], lake_rows['read_date'], lake_rows['datum_adj']
        )
    ]

    # explicit label-based lookups into the lake metadata table
    return {
        'name': lake_id_df.loc[lake_id, 'name'],
        'id': lake_id,
        'location': {
            'lat': lake_id_df.loc[lake_id, 'lat'],
            'lng': lake_id_df.loc[lake_id, 'lng'],
        },
        'measurements': measurements_json,
    }

In [6]:
# base url for scraping lake-level data; the lake id is appended to it
base_url = "https://files.dnr.state.mn.us/cgi-bin/lk_levels_dump.pl?format=csv&id="

# variable to track progress of loop
lake_count = 0

# ids of lakes whose download/parse raised an error, and of those that did not
failed = []
successful = []

# per-lake frames, concatenated in a later cell
df_list = []

# hoisted: the total does not change while looping
total_lakes = len(lake_id_df.index)

# loop through index of lake_id_df (lake ids are the index)
for lake_count, lake_id in enumerate(lake_id_df.index, start=1):

    print(f"Getting data for lake {lake_count} of {total_lakes}.......")
    print(f"Lake id: {lake_id}.........")

    try:
        # read data from url into df
        df = pd.read_csv(f"{base_url}{lake_id}")

        # change column names to lowercase
        df.columns = [str.lower(column) for column in df.columns]

        # rename 'chr_id' to 'id', then keep the first row per read date
        df = (
            df
            .rename(columns={"chr_id": "id"})
            .drop_duplicates(subset='read_date')
        )

        # only lakes with at least 10 readings are kept for concatenation
        if len(df['id']) >= 10:
            df_list.append(df.copy())

        print("........................Success")
        print("------------------------------------------------------------------\n\n")
        # NOTE: "successful" means the download and parse worked; lakes with
        # fewer than 10 readings are recorded here too but not stored in df_list.
        successful.append(lake_id)

    except Exception as exc:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt /
        # SystemExit still stop the loop; the reason is printed instead of
        # being silently discarded.
        print(".......................Process failed")
        print(f"Reason: {exc!r}")
        print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n\n")
        failed.append(lake_id)

Getting data for lake 1 of 141.......
Lake id: 27001800.........
........................Success
------------------------------------------------------------------


Getting data for lake 2 of 141.......
Lake id: 27002300.........
........................Success
------------------------------------------------------------------


Getting data for lake 3 of 141.......
Lake id: 27001900.........
........................Success
------------------------------------------------------------------


Getting data for lake 4 of 141.......
Lake id: 27068300.........
........................Success
------------------------------------------------------------------


Getting data for lake 5 of 141.......
Lake id: 27002400.........
........................Success
------------------------------------------------------------------


Getting data for lake 6 of 141.......
Lake id: 27002200.........
........................Success
------------------------------------------------------------------


Gett

........................Success
------------------------------------------------------------------


Getting data for lake 51 of 141.......
Lake id: 27010900.........
........................Success
------------------------------------------------------------------


Getting data for lake 52 of 141.......
Lake id: 27008700.........
........................Success
------------------------------------------------------------------


Getting data for lake 53 of 141.......
Lake id: 27087000.........
........................Success
------------------------------------------------------------------


Getting data for lake 54 of 141.......
Lake id: 27013400.........
........................Success
------------------------------------------------------------------


Getting data for lake 55 of 141.......
Lake id: 27013303.........
........................Success
------------------------------------------------------------------


Getting data for lake 56 of 141.......
Lake id: 27087600........

........................Success
------------------------------------------------------------------


Getting data for lake 101 of 141.......
Lake id: 27094800.........
........................Success
------------------------------------------------------------------


Getting data for lake 102 of 141.......
Lake id: 27013315.........
........................Success
------------------------------------------------------------------


Getting data for lake 103 of 141.......
Lake id: 27013314.........
........................Success
------------------------------------------------------------------


Getting data for lake 104 of 141.......
Lake id: 27094700.........
........................Success
------------------------------------------------------------------


Getting data for lake 105 of 141.......
Lake id: 10004500.........
........................Success
------------------------------------------------------------------


Getting data for lake 106 of 141.......
Lake id: 10004100..

In [7]:
# concatenate data retrieved into new df
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# every per-lake frame keeps its own row labels, so labels repeat across lakes
# and label-based operations (e.g. DataFrame.drop) would hit several lakes at once
lake_levels_df = pd.concat(df_list, ignore_index=True)
lake_levels_df.head()

Unnamed: 0,id,elevation,read_date,datum_adj
0,27001800,811.4,1926-04-15,NGVD 29
1,27001800,815.35,1926-08-05,NGVD 29
2,27001800,812.72,1927-03-29,NGVD 29
3,27001800,813.04,1927-11-30,NGVD 29
4,27001800,814.5,1928-04-12,NGVD 29


In [51]:
# remove all data for lakes with less than 10 records

print("Dropping lakes with fewer then 10 measurements....")

# number of measurement rows per lake id (ids with no rows are simply absent)
measurement_counts = lake_levels_df['id'].value_counts()

# ids of lakes from lake_id_df with fewer than 10 rows (including none at all)
dropped = [
    lake_id
    for lake_id in lake_id_df.index
    if measurement_counts.get(lake_id, 0) < 10
]

# Filter with a boolean mask on the 'id' column rather than dropping by row
# label: the concatenated frame can carry repeated row labels, and a
# label-based drop would also remove rows of other lakes sharing those labels.
lake_levels_df = lake_levels_df.loc[~lake_levels_df['id'].isin(dropped)]

print(f"Number of laked dropped: {len(dropped)}.")

# reset index
lake_levels_df = lake_levels_df.reset_index(drop=True)
lake_levels_df.head()

Dropping lakes with fewer then 10 measurements....
Number of laked dropped: 100.


Unnamed: 0,id,elevation,read_date,datum_adj
0,27001800,811.4,1926-04-15,NGVD 29
1,27001800,815.35,1926-08-05,NGVD 29
2,27001800,812.72,1927-03-29,NGVD 29
3,27001800,813.04,1927-11-30,NGVD 29
4,27001800,814.5,1928-04-12,NGVD 29


In [70]:
# persist both tables as CSV files
# measurements: the row index is not written
lake_levels_df.to_csv(path_or_buf="resources/scraped_lake_measurements.csv", index=False)
# lake info: the index (lake id) is written as the first column
lake_id_df.to_csv(path_or_buf="resources/lake_info_transformed.csv", index=True)

In [72]:
# format all lake data as JSON

# sorted, unique lake ids present in the measurement table;
# .size() yields the group keys without reducing every column the way .all() does
lake_id_list = lake_levels_df.groupby('id').size().index

# one nested dict per included lake, in the same order as lake_id_list
lakes_json = [lake_to_dict(lake_id, lake_levels_df) for lake_id in lake_id_list]

In [76]:
# create list of lake names, in the same order as lake_id_list
# (a single label-based lookup; avoids a loop variable named `id`, which
# would shadow the builtin id() for the rest of the session)
lake_list = lake_id_df.loc[lake_id_list, 'name'].tolist()

In [77]:
# map each lake name to its record; lake_list and lakes_json are paired by position
# (if two lakes share a name, the later record replaces the earlier one)
lake_dict = dict(zip(lake_list, lakes_json))

In [78]:
# spot-check: display the record stored under the name 'Nokomis'
lake_dict['Nokomis']

{'name': 'Nokomis',
 'id': 27001900,
 'location': {'lat': 44.90863394, 'lng': -93.24218679},
 'measurements': [{'elevation': 816.7,
   'read_date': '1906-01-01',
   'datum_adj': 'NGVD 29'},
  {'elevation': 815.4, 'read_date': '1907-01-01', 'datum_adj': 'NGVD 29'},
  {'elevation': 815.4, 'read_date': '1914-01-01', 'datum_adj': 'NGVD 29'},
  {'elevation': 815.1, 'read_date': '1915-01-01', 'datum_adj': 'NGVD 29'},
  {'elevation': 813.6, 'read_date': '1918-01-01', 'datum_adj': 'NGVD 29'},
  {'elevation': 813.4, 'read_date': '1919-01-01', 'datum_adj': 'NGVD 29'},
  {'elevation': 814.0, 'read_date': '1921-01-01', 'datum_adj': 'NGVD 29'},
  {'elevation': 815.1, 'read_date': '1922-01-01', 'datum_adj': 'NGVD 29'},
  {'elevation': 815.1, 'read_date': '1923-01-01', 'datum_adj': 'NGVD 29'},
  {'elevation': 814.9, 'read_date': '1924-01-01', 'datum_adj': 'NGVD 29'},
  {'elevation': 814.96, 'read_date': '1925-03-26', 'datum_adj': 'NGVD 29'},
  {'elevation': 813.0, 'read_date': '1926-04-15', 'datum_ad