# load_kingcounty

Extracts, transforms, and loads various King County marine observations.

* [CTD Data](https://green2.kingcounty.gov/marine/Download)
* Lab data: available by request from King County
* [Mooring Data](https://green2.kingcounty.gov/marine-buoy/Data.aspx) (not implemented yet)
* stations file: The CTD download page makes a POST HTTP request to https://green2.kingcounty.gov/marine/Download/GetStations. The file here is the response JSON object from that request, and has been included in the repository.

In [1]:
# Input locations. The two *_files values are glob patterns that are expanded
# with glob.glob() further down; stations_file is a single JSON document.
# CTD exports (CSV), one or more files
kingcounty_ctd_files = "data/kingcounty/ctd/*.csv"
# Lab nutrient workbooks (Excel), one or more files
kingcounty_nut_files = "data/kingcounty/nutrients/*.xlsx"
# Station metadata, parsed with json.load() and turned into a GeoDataFrame
stations_file = "data/kingcounty/stations.json"

import glob
import json
import uuid
from multiprocessing import Pool
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import Point
import db

Start by parsing the stations.json file

In [2]:
def to_decdeg(raw):
    """Convert packed degree/minute/second values to decimal degrees.

    The lat/longs in the stations JSON file appear to be encoded as
    DDDMMSS.SSS with no sign or hemisphere indicator, so the caller is
    responsible for negating western longitudes.

    Parameters
    ----------
    raw : pandas.Series (or any array-like with ``.astype``)
        Packed coordinates, e.g. ``1223037.224`` for 122 deg 30' 37.224".

    Returns
    -------
    Same type as ``raw``, holding decimal degrees.
    """
    # Everything above the ten-thousands place is whole degrees.
    whole_degrees = (raw / 1e4).astype(int)
    # What is left is MMSS.SSS; the hundreds place and up is whole minutes.
    minutes_seconds = raw - whole_degrees * 1e4
    whole_minutes = (minutes_seconds / 100).astype(int)
    seconds = minutes_seconds - whole_minutes * 100
    return whole_degrees + whole_minutes / 60 + seconds / 3600

# Parse the station metadata that was captured from the CTD download page
with open(stations_file) as f:
    station_data = json.load(f)
station_df = pd.DataFrame(station_data)

# Decode the packed coordinates. The longitudes carry no sign, so negate
# them to place the stations in the western hemisphere.
station_lons = -to_decdeg(station_df.long)
station_lats = to_decdeg(station_df.lat)

# Make a GeoDataFrame for the stations, replacing the raw lat/long columns
# with point geometries
station_gdf = gpd.GeoDataFrame(
    station_df.drop(columns=['lat', 'long']),
    geometry=[Point(lon, lat) for lon, lat in zip(station_lons, station_lats)],
    crs='epsg:6318',
)
station_gdf.head()

Unnamed: 0,Locator,stationName,stationDesc,depth,sediment,offshore,ctd,ysi,beach,shellfish,active,Data,geometry
0,Adm Inlet-1,Admiralty Inlet Transect,Admiralty Inlet Transect Station 1,117.5,False,False,True,False,False,False,False,[],POINT (-122.51034 47.91255)
1,Adm Inlet-2,Admiralty Inlet Transect,Admiralty Inlet Transect Station 2,193.0,False,False,True,False,False,False,False,[],POINT (-122.49484 47.91435)
2,Adm Inlet-3,Admiralty Inlet Transect,Admiralty Inlet Transect Station 3,198.5,False,False,True,False,False,False,False,[],POINT (-122.48089 47.91586)
3,Adm Inlet-4-C14,Admiralty Inlet,Admiralty Inlet Transect Station 4-C-14,131.5,False,True,True,False,False,False,False,[],POINT (-122.46619 47.91741)
4,Adm Inlet-5,Admiralty Inlet Transect,Admiralty Inlet Transect Station 5,60.0,False,False,True,False,False,False,False,[],POINT (-122.45183 47.91915)


Clean up the stations GeoDataFrame to match the DB schema

In [3]:
# Source column -> DB column
schema_columns = {
    "Locator": "name",
    "stationDesc": "description"
}
# Keep only the columns the DB table needs, then rename them
station_gdf = station_gdf[list(schema_columns) + ['geometry']]
station_gdf = station_gdf.rename(columns=schema_columns)
# The geometry column is called "geom" in the DB, stored in EPSG:32610
station_gdf = station_gdf.rename_geometry('geom')
station_gdf = station_gdf.to_crs(epsg=32610)
# Stations are keyed by name
station_gdf = station_gdf.set_index('name')
station_gdf.head()

Unnamed: 0_level_0,description,geom
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Adm Inlet-1,Admiralty Inlet Transect Station 1,POINT (536587.743 5306696.929)
Adm Inlet-2,Admiralty Inlet Transect Station 2,POINT (537744.617 5306903.832)
Adm Inlet-3,Admiralty Inlet Transect Station 3,POINT (538785.814 5307078.388)
Adm Inlet-4-C14,Admiralty Inlet Transect Station 4-C-14,POINT (539883.574 5307258.464)
Adm Inlet-5,Admiralty Inlet Transect Station 5,POINT (540954.973 5307459.249)


Save any stations that aren't already in the DB table

In [4]:
engine = db.connect()
stations_sql = 'SELECT * FROM obsdata.stations'
locations_existing = gpd.read_postgis(stations_sql, con=engine, index_col='name')

# Only stations whose name is not already present need to be inserted
already_loaded = station_gdf.index.isin(locations_existing.index)
station_gdf = station_gdf.loc[~already_loaded]
if len(station_gdf) == 0:
    # Nothing new: carry on with what the DB already has
    station_gdf = locations_existing
else:
    station_gdf.to_postgis('stations', con=engine, schema='obsdata', index=True, index_label='name', if_exists='append')
    print(f'Loaded {len(station_gdf)} locations')

    # Re-read so station_gdf reflects the full table, including the new rows
    station_gdf = gpd.read_postgis(stations_sql, con=engine, index_col='name')
station_gdf.head()

Unnamed: 0_level_0,description,geom
name,Unnamed: 1_level_1,Unnamed: 2_level_1
RSR837,Strait of Georgia: Rosario Strait - Peapod Rock,POINT (517470.419 5384851.335)
CMB006,Commencement Bay - Mouth of City WW,POINT (542614.497 5234397.690)
CSE002,Case Inlet - Off Rocky Point,POINT (514099.002 5244447.120)
DYE004,Dyes Inlet - NE of Chico Bay,POINT (523419.029 5274483.975)
ELD001,Eld Inlet - Flapjack Point,POINT (503921.564 5217019.981)


Create a King County source if it doesn't already exist

In [5]:
def _read_kc_source():
    """Read obsdata.sources and return (all sources, King County rows)."""
    sources = pd.read_sql_table("sources", con=engine, schema='obsdata', index_col='id')
    is_kc = (sources['agency'] == "King County") & (sources['study'] == "Marine Monitoring")
    return sources, sources.loc[is_kc]

df, kc_source_row = _read_kc_source()
if not len(kc_source_row):
    # No King County source yet, so insert one
    pd.DataFrame({
        "agency": ["King County"],
        "study": ["Marine Monitoring"]
    }).to_sql('sources', con=engine, schema='obsdata', index=False, if_exists='append')

    # Refresh the sources so we can fetch the primary key
    df, kc_source_row = _read_kc_source()

kc_source_id = kc_source_row.index[0]
print(kc_source_id)
df

58


Unnamed: 0_level_0,agency,study
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,WA Ecology,MarineWater
3,Salish Cruise,TN322
4,Salish Cruise,CB1023
5,Salish Cruise,TN315
6,Salish Cruise,CB1050
7,Salish Cruise,TN105
8,Salish Cruise,RBTSN201805
9,Salish Cruise,TN333
10,Salish Cruise,TN079
11,Salish Cruise,RC0007


Read the downloaded CTD data

In [6]:
# Maps CTD CSV columns onto DB parameters. Each entry is
# (DB parameter_id, CSV column name, multiplier applied to the CSV value
# before it is stored).
# NOTE(review): 1000/14.01 presumably converts mg/L of nitrogen to umol/L
# (14.01 g/mol) - confirm against the King County export units.
column_map = (
    ("temp","Sample_Temperature_field",1),
    ("o2","DO_field",1),
    ("chla","Chla_field",1),
    ("salt","Salinity_field",1),
    ("no23","NO23_field",1000/14.01)
)

# Per-worker database connection. This is passed as the Pool initializer, so
# every worker process opens its own connection instead of sharing one.
def db_init():
    """Open a DB connection and store it in the module-level ``thread_con``."""
    global thread_con
    thread_con = db.connect()

def extract_ctds(f):
    """Load one King County CTD CSV export into ``obsdata.observations``.

    The file is read, QA-masked, de-duplicated, grouped into casts, and then
    written one parameter at a time (see ``column_map``) using the
    module-level ``thread_con`` connection and ``kc_source_id``.

    Parameters
    ----------
    f : str or path-like
        Path of a UTF-16 encoded CTD CSV file whose first line is skipped.

    Returns
    -------
    None. The function is used for its database side effect.
    """
    # Setting dtypes on the _Qual cols avoids DTypeWarnings
    df = pd.read_csv(f, skiprows=1, parse_dates=[1], encoding='utf-16', dtype={
        'ST_Qual': str, 'DN_Qual': str, 'DO_Qual': str, 'CH_Qual': str,
        'SA_Qual': str, 'LT_Qual': str, 'NO23_Qual': str
    })
    # Apply the timezone
    df["Sample_Date"] = df["Sample_Date"].dt.tz_localize('US/Pacific')

    # For each _Qual column, mask any data in the previous column if the
    # _Qual value contains a bad QA flag (R or E)
    for i, isqual in enumerate(df.columns.str.endswith("_Qual")):
        if not isqual:
            continue
        data_col = df.columns[i - 1]
        qual_col = df.columns[i]
        # na=False: a missing qualifier means "no flag", and it keeps the
        # mask strictly boolean so it is always a valid indexer.
        flagged = df[qual_col].str.contains("[RE]", regex=True, na=False)
        df.loc[flagged, data_col] = np.nan

    df.rename(columns={
        "Locator": "location_id",
        "Sample_Date": "datetime",
        "Sample_Depth": "depth"
    }, inplace=True)

    # Eliminate any duplicate depth observations by taking the mean of all the
    # duplicated measurements. numeric_only=True drops the string _Qual
    # columns explicitly; without it pandas >= 2.0 raises a TypeError when it
    # tries to average them.
    df = df.groupby(['UpDown','location_id','datetime','depth']).mean(numeric_only=True).reset_index()

    # Cast detection so observations can be interpolated: every
    # (location, 30 minute window) group gets its own UUID.
    # See https://stackoverflow.com/a/48975426
    df['cast_id'] = 1
    # Replace the placeholder column outright rather than writing into it with
    # .loc, which would try to store UUID objects in an int64 column in place.
    df['cast_id'] = df.groupby(
        ['location_id', pd.Grouper(key='datetime', freq='30min')]
    )['cast_id'].transform(lambda g: uuid.uuid4())

    # Only the downcast is loaded
    downcast = df.loc[df['UpDown'] == 'Down']

    for dbparam, csvparam, convert in column_map:
        if csvparam not in df.columns:
            continue
        # For each data column, make a view that drops the NaNs, then store
        # that view's station, time, depth, and (converted) column value
        # together with the matching parameter_id.
        view = downcast.dropna(subset=csvparam)
        if view.empty:
            # Nothing to insert for this parameter
            continue

        processed_data = pd.DataFrame({
            'datetime': view['datetime'],
            'depth': view['depth'],
            'value': view[csvparam] * convert,
            'location_id': view['location_id'],
            'cast_id': view['cast_id']
        })
        processed_data['source_id'] = kc_source_id
        processed_data['parameter_id'] = dbparam
        # Remove any cast IDs for parameters that were measured fewer than 5 times
        # (these cannot be interpolated reliably)
        per_cast = processed_data.groupby('cast_id', sort=False)['value'].transform('count')
        processed_data.loc[per_cast < 5, 'cast_id'] = np.nan

        processed_data.to_sql('observations', con=thread_con, schema='obsdata', index=False, if_exists='append')

# Fan the CTD files out over a pool of workers; db_init gives each worker
# its own database connection before it starts processing files.
ctd_paths = glob.glob(kingcounty_ctd_files)
with Pool(initializer=db_init) as worker_pool:
    worker_pool.map(extract_ctds, ctd_paths)

Now read the nutrient data

In [7]:
# Maps King County lab parameter names onto DB parameters. "kcparam" is
# matched against the workbook's "Parameter" column, "dbparam" becomes the
# stored parameter_id, and "conv" is the multiplier applied to the value.
# NOTE(review): the divisors look like molar masses of Si, N and P, i.e.
# presumably mg/L -> umol/L - confirm against the lab's reporting units.
parameter_map = pd.DataFrame((
    ("sioh4","Silica",1000/28.09),
    ("nh4","Ammonia Nitrogen",1000/14.01),
    ("no23","Nitrite + Nitrate Nitrogen",1000/14.01),
    ("orthp","Orthophosphate Phosphorus", 1000/30.97)
), columns=("dbparam","kcparam","conv"))

def extract_nutrients(f):
    """Load one King County lab nutrient workbook into ``obsdata.observations``.

    Rows are time-corrected, QA-filtered, matched against ``parameter_map``,
    de-duplicated, unit-converted, and appended using the module-level
    ``thread_con`` connection and ``kc_source_id``.

    Parameters
    ----------
    f : str or path-like
        Path of the Excel workbook to read.

    Returns
    -------
    None. The function is used for its database side effect.
    """
    df = pd.read_excel(f, parse_dates=[1])
    # Apply the timezone
    df["Collect DateTime"] = df["Collect DateTime"].dt.tz_localize('US/Pacific')

    # Not all entries have times filled in on the dates. Need to apply a
    # non-midnight time to the midnight rows for matching dates.
    # From examining the data I've been given, it looks like there's usually at least one row with
    # an actual time for a given location and date. So group by location/date and find a non-midnight
    # time (using max()), merge that time back into the DataFrame, and substitute it for all rows that
    # have a midnight time.
    df['date'] = df["Collect DateTime"].dt.date
    date_times = df.groupby(["Locator","date"])["Collect DateTime"].max()
    df = df.merge(date_times, left_on=["Locator","date"], right_index=True)
    collected = df["Collect DateTime_x"]
    # Only an exact 00:00:00 counts as "time missing"; checking the hour alone
    # would also overwrite real samples taken between 00:01 and 00:59.
    is_midnight = (
        (collected.dt.hour == 0)
        & (collected.dt.minute == 0)
        & (collected.dt.second == 0)
    )
    df.loc[is_midnight, "Collect DateTime_x"] = df["Collect DateTime_y"]

    # Eliminate data that does not pass QA
    qa_fails = ('<MDL','E','H','SH','R')
    # Split the comma-delimited qualifier list into one column per flag
    # (tolerating whitespace around the commas) and reject every row in which
    # any of those flags is a QA failure.
    qa_vals = (
        df['Lab Qualifier'].astype(str).str.strip()
        .str.split(r'\s*,\s*', regex=True, expand=True)
    )
    passes_qa = ~qa_vals.isin(list(qa_fails)).any(axis=1)
    df = df.loc[passes_qa].dropna(subset='Value')

    # Merge with the parameter map to get the parameter_id
    df = df.merge(parameter_map, how='inner', left_on='Parameter', right_on='kcparam').rename(
        columns={'Collect DateTime_x': 'datetime', 'Depth (m)': 'depth',
                 'Locator': 'location_id', 'Value': 'value', 'dbparam': 'parameter_id' }
    )[['datetime','depth','location_id','value','parameter_id']]

    # Eliminate any duplicate depth observations by taking the mean of all the
    # duplicated measurements
    df = df.groupby(['location_id','datetime','depth','parameter_id'])['value'].mean().reset_index()
    # Set the source after averaging so the key is not itself averaged into
    # a float.
    df['source_id'] = kc_source_id

    # Prevent unique constraint violations from lab vs CTD NO23 values by adding a minute
    # to the dates.
    df['datetime'] += pd.to_timedelta(1, 'min')

    # Perform unit conversions. Every parameter_id came from the inner merge
    # with parameter_map, so each one has a conversion factor.
    conversions = parameter_map.set_index('dbparam')['conv']
    df['value'] *= df['parameter_id'].map(conversions)

    df.to_sql('observations', con=thread_con, schema='obsdata', index=False, if_exists='append')

# The nutrient workbooks are loaded serially in this process, so reuse the
# notebook's own engine as the connection extract_nutrients() writes to.
thread_con = engine
nutrient_paths = glob.glob(kingcounty_nut_files)
for nutrient_path in nutrient_paths:
    extract_nutrients(nutrient_path)