# Lake Levels ETL
## Data Scraping, Munging
---
### Outputs CSV files to be read-in by an app running flask

In [53]:
import pandas as pd
from flask import jsonify

In [35]:
# read in csv file
lake_id_df = pd.read_csv("data/water-level/lakes/MCWD_Lake_ID.csv")

lake_id_df.head()

Unnamed: 0,LAKE_NAME,LAKE_ID,latitude,longitude
0,Hiawatha,27001800,44.921034,-93.236141
1,Mother,27002300,44.893298,-93.241013
2,Nokomis,27001900,44.908634,-93.242187
3,Taft,27068300,44.892951,-93.249752
4,Legion,27002400,44.88576,-93.26224


In [36]:
# make col names lowercase
lake_id_df.columns = ['name', 'id', 'lat', 'lng']

# drop duplicate ids
lake_id_df.drop_duplicates(subset='id', inplace=True)

# set 'lake_id' as index
lake_id_df.set_index('id', drop=False, inplace=True)

# preview
lake_id_df.head()

Unnamed: 0_level_0,name,id,lat,lng
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
27001800,Hiawatha,27001800,44.921034,-93.236141
27002300,Mother,27002300,44.893298,-93.241013
27001900,Nokomis,27001900,44.908634,-93.242187
27068300,Taft,27068300,44.892951,-93.249752
27002400,Legion,27002400,44.88576,-93.26224


In [37]:
# function to convert all lake data (df) for a given lake id into dict format
def lake_to_dict(lake_id, df):

    # grab all measurements for lake
    elevations = list(df.loc[df.id == lake_id].elevation)
    read_dates = list(df.loc[df.id == lake_id].read_date)
    datum_adjs = list(df.loc[df.id == lake_id].datum_adj)
    
    measurements = list(zip(elevations, read_dates, datum_adjs))

    measurement_keys = ['elevation', 'read_date', 'datum_adj']
    
    # make measurements json format
    measurements_json = []
    for i in range(len(measurements)):
        measurement_dict = dict(zip(measurement_keys, measurements[i]))
        measurements_json.append(measurement_dict)

    # create lake_dict using measurements_json
    lake_dict = {
        'name': lake_id_df['name'][lake_id],
        'id': lake_id,
        'location': {
            'lat': lake_id_df['lat'][lake_id],
            'lng': lake_id_df['lng'][lake_id]
        },
        'measurements': measurements_json
    }
    
    return lake_dict

In [38]:
# base url for scraping lake-level data
base_url = "https://files.dnr.state.mn.us/cgi-bin/lk_levels_dump.pl?format=csv&id="

# variable to track progress of loop
lake_count = 0

# store ids for lakes that failed and succeeded to be inserted
failed = []
successful = []

# list to store all dfs read-in to concatenate later
df_list = []

# loop through index of lake_id_df (lake ids are the index)
for lake_id in lake_id_df.index:
    
    lake_count += 1
    print(f"Getting data for lake {lake_count} of {len(lake_id_df.index)}.......")
    print(f"Lake id: {lake_id}.........")
    
    
    try:
        # read data from url into df
        df = pd.read_csv(f"{base_url}{lake_id}")

        # change column names to lowercase
        df.columns = map(str.lower, df.columns)

        # rename 'chr_id' to 'id'
        df.rename(columns={"chr_id": "id"}, inplace=True)

        # drop duplicate date entries
        df.drop_duplicates(subset='read_date', inplace=True)
        
        if len(df['id']) >= 10:
            
            df_list.append(df.copy())

        print("........................Success")
        print("------------------------------------------------------------------\n\n")
        successful.append(lake_id)
        
    except:
        print(".......................Process failed")
        print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n\n")
        failed.append(lake_id)

Getting data for lake 1 of 141.......
Lake id: 27001800.........
........................Success
------------------------------------------------------------------


Getting data for lake 2 of 141.......
Lake id: 27002300.........
........................Success
------------------------------------------------------------------


Getting data for lake 3 of 141.......
Lake id: 27001900.........
........................Success
------------------------------------------------------------------


Getting data for lake 4 of 141.......
Lake id: 27068300.........
........................Success
------------------------------------------------------------------


Getting data for lake 5 of 141.......
Lake id: 27002400.........
........................Success
------------------------------------------------------------------


Getting data for lake 6 of 141.......
Lake id: 27002200.........
........................Success
------------------------------------------------------------------


Gett

........................Success
------------------------------------------------------------------


Getting data for lake 51 of 141.......
Lake id: 27010900.........
........................Success
------------------------------------------------------------------


Getting data for lake 52 of 141.......
Lake id: 27008700.........
........................Success
------------------------------------------------------------------


Getting data for lake 53 of 141.......
Lake id: 27087000.........
........................Success
------------------------------------------------------------------


Getting data for lake 54 of 141.......
Lake id: 27013400.........
........................Success
------------------------------------------------------------------


Getting data for lake 55 of 141.......
Lake id: 27013303.........
........................Success
------------------------------------------------------------------


Getting data for lake 56 of 141.......
Lake id: 27087600........

........................Success
------------------------------------------------------------------


Getting data for lake 101 of 141.......
Lake id: 27094800.........
........................Success
------------------------------------------------------------------


Getting data for lake 102 of 141.......
Lake id: 27013315.........
........................Success
------------------------------------------------------------------


Getting data for lake 103 of 141.......
Lake id: 27013314.........
........................Success
------------------------------------------------------------------


Getting data for lake 104 of 141.......
Lake id: 27094700.........
........................Success
------------------------------------------------------------------


Getting data for lake 105 of 141.......
Lake id: 10004500.........
........................Success
------------------------------------------------------------------


Getting data for lake 106 of 141.......
Lake id: 10004100..

In [39]:
# concatenate data retrieved into new df
lake_levels_df = pd.concat(df_list)
lake_levels_df.head()

Unnamed: 0,id,elevation,read_date,datum_adj
0,27001800,811.4,1926-04-15,NGVD 29
1,27001800,815.35,1926-08-05,NGVD 29
2,27001800,812.72,1927-03-29,NGVD 29
3,27001800,813.04,1927-11-30,NGVD 29
4,27001800,814.5,1928-04-12,NGVD 29


In [40]:
# remove all data for lakes with less than 10 records

# empty list for storing ids of dropped lakes
dropped = []

print("Dropping lakes with fewer then 10 measurements....")
for lake_id in lake_id_df.index:
    
    # drop lakes with less than 10 measurements
    if len(lake_levels_df.loc[lake_levels_df['id'] == lake_id]['id']) < 10:
        lake_levels_df.drop(lake_levels_df[lake_levels_df['id'] == lake_id].index, inplace=True)
        dropped.append(lake_id)

print(f"Number of laked dropped: {len(dropped)}.")

# reset index
lake_levels_df = lake_levels_df.reset_index(drop=True)
lake_levels_df.head()

Dropping lakes with fewer then 10 measurements....
Number of laked dropped: 100.


Unnamed: 0,id,elevation,read_date,datum_adj
0,27001800,811.4,1926-04-15,NGVD 29
1,27001800,815.35,1926-08-05,NGVD 29
2,27001800,812.72,1927-03-29,NGVD 29
3,27001800,813.04,1927-11-30,NGVD 29
4,27001800,814.5,1928-04-12,NGVD 29


In [41]:
# remove same lakes from lake_id_df
for lake_id in dropped:
    lake_id_df.drop(labels=lake_id, inplace=True)

In [42]:
lake_id_df.count()

name    41
id      41
lat     41
lng     41
dtype: int64

In [33]:
# write dataframes to csv's
lake_levels_df.to_csv("resources/scraped_lake_measurements.csv", index=False)
lake_id_df.to_csv("resources/lake_info_transformed.csv", index=True)

In [72]:
# format all lake data as JSON

# list of included lake_ids to iterate through
lake_id_list = lake_levels_df.groupby('id').all().index

# empty list for json format
lakes_json = []

# loop through included lake_ids
for lake_id in lake_id_list:
    lake_dict = lake_to_dict(lake_id, lake_levels_df)
    lakes_json.append(lake_dict)

In [76]:
# create list of lake names
lake_list = []

for id in lake_id_list:
    lake_list.append(lake_id_df['name'][id])

In [77]:
lake_dict = {}

for i in range(len(lake_list)):
    lake_dict[lake_list[i]] = lakes_json[i]

In [75]:
df = pd.read_csv("data/water-level/streams/CLO01_2017-2018_TrollData.csv")
df['id'] = 'CLO01'
df.head()

Unnamed: 0,Date,Time,Stage [ft],Quality,id
0,4/5/2017,9:24:00 AM,---,missing,CLO01
1,4/5/2017,9:40:00 AM,944.73,120 (Suspect),CLO01
2,4/5/2017,9:55:00 AM,944.73,"120 (Suspect), ed",CLO01
3,4/5/2017,10:10:00 AM,944.74,"120 (Suspect), ed",CLO01
4,4/5/2017,10:25:00 AM,944.74,"120 (Suspect), ed",CLO01


In [84]:
creek_ids = ['CGL01', 'CLO01', 'CLO08', 'CLO09', 'CMH01', 'CPA01']

creek_df_list = []

for creek in creek_ids:
    df = pd.read_csv(f"data/water-level/streams/{creek}_2017-2018_TrollData.csv")
    df['id'] = creek
    
    creek_df_list.append(df)
    
creek_data_df = pd.concat(creek_df_list)
creek_data_df.dropna(how='any', inplace=True)
creek_data_df.head()

Unnamed: 0,Date,Time,Stage [ft],Quality,id
0,4/7/2017,8:51:00 AM,943.9,200 (Unknown),CGL01
1,4/7/2017,9:06:00 AM,943.91,200 (Unknown),CGL01
2,4/7/2017,9:21:00 AM,943.92,200 (Unknown),CGL01
3,4/7/2017,9:36:00 AM,943.9,200 (Unknown),CGL01
4,4/7/2017,9:51:00 AM,943.91,200 (Unknown),CGL01


In [85]:
creek_data_df = creek_data_df[['Date', 'Time', 'id', 'Stage [ft]']]

creek_data_df.columns = ['date', 'time', 'id', 'elevation']
creek_data_df.head()

Unnamed: 0,date,time,id,elevation
0,4/7/2017,8:51:00 AM,CGL01,943.9
1,4/7/2017,9:06:00 AM,CGL01,943.91
2,4/7/2017,9:21:00 AM,CGL01,943.92
3,4/7/2017,9:36:00 AM,CGL01,943.9
4,4/7/2017,9:51:00 AM,CGL01,943.91


In [100]:
cl_df = creek_data_df[creek_data_df['elevation'] != ' ---']
cl_df.reset_index(drop=True, inplace=True)

In [101]:
cl_df.elevation = cl_df.elevation.astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [114]:
# write out csv of dataframe with all creek levels
cl_df.to_csv("resources/all_creek_levels.csv", index=False)

In [115]:
test_df = pd.read_csv("resources/all_creek_levels.csv")

this_df = test_df[test_df['id'] == 'CGL01']

In [121]:
new_df = this_df.groupby('date').mean().reset_index(drop=False)
new_df['id'] = 'id'

In [122]:
new_df

Unnamed: 0,date,elevation,id
0,10/1/2017,944.014792,id
1,10/1/2018,944.260312,id
2,10/10/2017,944.720625,id
3,10/10/2018,944.364896,id
4,10/11/2017,944.641562,id
5,10/11/2018,944.405208,id
6,10/12/2017,944.564375,id
7,10/12/2018,944.394792,id
8,10/13/2017,944.494792,id
9,10/13/2018,944.369583,id
