* `nwem_files`: A pattern matching each Matlab file of raw data from the NWEM FTP site
* `nwem_stations`: This structure is copy-pasted from a JS file loaded for the map on https://nwem.apl.washington.edu/prod_PugetSound.shtml, then IDs were manually inserted to match the location prefixes on the Matlab file names

In [1]:
# Glob pattern (relative path) selecting the raw NWEM Matlab files to load
nwem_files = "data/NWEM/*.mat"
# One entry per buoy, in the order:
#   [station ID, map popup HTML, latitude, longitude, short name]
# The station ID is the location prefix used in the Matlab file names. The popup
# HTML comes along from the copied JS structure and is not used for loading.
nwem_stations = [
    ['HC_TW','<div class="map-window"><h5>Twanoh</h5><img src="http://orcabase.ocean.washington.edu/lynchcove_orca.png"><br>47.375 N, -123.0083 W<br>35 meters depth<br><a href="prod_PS_Twanoh.shtml">&raquo; view data</a></div>', 47.375, -123.0083 , 'Twanoh'],
    ['HC_HP','<div class="map-window"><h5>Hoodsport</h5><img src="http://orcabase.ocean.washington.edu/hoodsport_orca.jpg"><br>47.4218 N, -123.1126 W<br>120 meters depth<br><a href="prod_PS_Hoodsport.shtml">&raquo; view data</a></div>', 47.4218, -123.1126, 'Hoodsport'],
    ['HC_DB','<div class="map-window"><h5>Dabob Bay</h5><img src="http://orcabase.ocean.washington.edu/lynchcove_orca.png"><br>47.8034 N, -122.8029 W<br>100 meters depth<br><a href="prod_PS_Dabob.shtml">&raquo; view data</a></div>', 47.8034, -122.8029, 'Dabob Bay'],
    ['HC_NB','<div class="map-window"><h5>Hansville</h5><img src="http://orcabase.ocean.washington.edu/northbuoy_orca.png"><br>47.907 N, -122.627 W<br>100 meters depth<br><a href="prod_PS_NorthB.shtml">&raquo; view data</a></div>', 47.907, -122.627, 'Hansville'],
    ['PW','<div class="map-window"><h5>Point Wells</h5><img src="http://orcabase.ocean.washington.edu/northbuoy_orca.png"><br>47.761 N, -122.3972 W<br>100 meters depth<br><a href="prod_PS_PWells.shtml">&raquo; view data</a></div>', 47.761167, -122.397167, 'Point Wells'],
    ['CI','<div class="map-window"><h5>Carr Inlet</h5><img src="http://orcabase.ocean.washington.edu/northbuoy_orca.png"><br>47.28 N, -122.728 W<br>47 meters depth<br><a href="prod_PS_Carr.shtml">&raquo; view data</a></div>', 47.27965, -122.72763, 'Carr Inlet']
]
# I don't know where to get the ChaBa data at this time
#    [,'<div class="map-window"><h5>NEMO - Ćháʔba·</h5>47&deg; 57.997\' N, 124&deg; 56.987\' W<br>100 meters depth<br><a href="prod_CS_ChaBa.shtml">&raquo; view data</a></div>', 47.9666167, -124.9497833, 'ChaBa'],
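# The NEMO Sub-Surface/ESP entry below is also left out (it has no station ID assigned).
# Bellingham Bay (the last entry) data is in a different format on the data portal that I haven't parsed yet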
#    ['<div class="map-window"><h5>NEMO - Sub-Surface/ESP</h5>47&deg; 58\' N, 124&deg; 57\' W<br>100 meters depth<br><a href="prod_CS_SubSurface.shtml">&raquo; view data</a></div>', 47.9766667, -124.9500000, 'SubSurf'],
#    ['<div class="map-window"><h5>Bellingham Bay</h5>48&deg; N, 122&deg; W<br>100 meters depth<br><a href="prod_BB_BellBay.shtml">&raquo; view data</a></div>', 48.737601, -122.563964, 'BellBay']

import glob
import json
import uuid
import os
from scipy.io import loadmat
from multiprocessing import Pool
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import Point
import db

In [2]:
# Tabulate the station entries. The popup HTML is dropped straight away and the
# short name is expanded into the description stored with each station.
station_df = (
    pd.DataFrame(nwem_stations, columns=('name', 'html', 'lat', 'lon', 'description'))
    .drop(columns='html')
    .assign(description=lambda stations: stations['description'] + ' ORCA Buoy')
)
# Turn each lon/lat pair into a point (EPSG:6318), then index by station ID and
# reproject to EPSG:32610 with the geometry column named 'geom'
station_gdf = (
    gpd.GeoDataFrame(
        station_df.drop(columns=['lat', 'lon']),
        geometry=[Point(lon, lat) for lon, lat in zip(station_df['lon'], station_df['lat'])],
        crs='epsg:6318'
    )
    .rename_geometry('geom')
    .set_index('name')
    .to_crs(epsg=32610)
)
station_gdf

Unnamed: 0_level_0,description,geom
name,Unnamed: 1_level_1,Unnamed: 2_level_1
HC_TW,Twanoh ORCA Buoy,POINT (499373.416 5246837.904)
HC_HP,Hoodsport ORCA Buoy,POINT (491507.124 5252045.093)
HC_DB,Dabob Bay ORCA Buoy,POINT (514758.545 5294468.133)
HC_NB,Hansville ORCA Buoy,POINT (527874.074 5306031.073)
PW,Point Wells ORCA Buoy,POINT (545175.714 5289931.435)
CI,Carr Inlet ORCA Buoy,POINT (520598.789 5236277.333)


In [3]:
engine = db.connect()
# Stations already stored in the database, indexed by station name
locations_existing = gpd.read_postgis(
    'SELECT * FROM obsdata.stations', con=engine, index_col='name'
)
# Narrow the buoy list down to the ones the stations table does not have yet
already_loaded = station_gdf.index.isin(locations_existing.index)
station_gdf = station_gdf.loc[~already_loaded]
if len(station_gdf) == 0:
    # Nothing to insert; carry on with the table as it stands
    station_gdf = locations_existing
else:
    station_gdf.to_postgis(
        'stations', con=engine, schema='obsdata',
        index=True, index_label='name', if_exists='append'
    )
    print(f'Loaded {len(station_gdf)} locations')

    # Re-read so station_gdf holds the whole table, new rows included
    station_gdf = gpd.read_postgis(
        'SELECT * FROM obsdata.stations', con=engine, index_col='name'
    )
station_gdf.head()

Unnamed: 0_level_0,description,geom
name,Unnamed: 1_level_1,Unnamed: 2_level_1
RSR837,Strait of Georgia: Rosario Strait - Peapod Rock,POINT (517470.419 5384851.335)
CMB006,Commencement Bay - Mouth of City WW,POINT (542614.497 5234397.690)
CSE002,Case Inlet - Off Rocky Point,POINT (514099.002 5244447.120)
DYE004,Dyes Inlet - NE of Chico Bay,POINT (523419.029 5274483.975)
ELD001,Eld Inlet - Flapjack Point,POINT (503921.564 5217019.981)


In [4]:
def _nwem_rows(sources):
    """Return the rows of ``sources`` belonging to the UW NWEM ORCA Moorings study."""
    is_nwem = (sources['agency'] == "UW NWEM") & (sources['study'] == "ORCA Moorings")
    return sources.loc[is_nwem]


df = pd.read_sql_table("sources", con=engine, schema='obsdata', index_col='id')
nwem_source_row = _nwem_rows(df)
if len(nwem_source_row) == 0:
    # No matching source yet: insert one
    new_source = pd.DataFrame({
        "agency": ["UW NWEM"],
        "study": ["ORCA Moorings"]
    })
    new_source.to_sql('sources', con=engine, schema='obsdata', index=False, if_exists='append')

    # Read the table back so the newly assigned primary key is available
    df = pd.read_sql_table("sources", con=engine, schema='obsdata', index_col='id')
    nwem_source_row = _nwem_rows(df)

# The primary key of the matching source row is the table index
nwem_source_id = nwem_source_row.index[0]
print(nwem_source_id)
df.head()

224


Unnamed: 0_level_0,agency,study
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,WA Ecology,MarineWater
224,UW NWEM,ORCA Moorings
169,Salish Cruise,TN322
58,King County,Marine Monitoring
170,Salish Cruise,CB1023


In [5]:
# Per-worker database connection. Despite the name `thread_con`, this is passed
# as the initializer of a multiprocessing.Pool, whose workers are processes.
def db_init():
    """Open a database connection and store it in the module-level `thread_con`."""
    global thread_con
    thread_con = db.connect()

In [6]:
def extract_nwem(f):
    """Load the binned CTD casts of one NWEM Matlab file into obsdata.observations.

    The buoy is identified by matching the file's base name against the station
    IDs in the index of ``station_gdf``. A file that matches no station, or more
    than one, is reported and skipped. A file that is not named
    ``<station>_CTD...`` is skipped silently. For every parameter, one row per
    (cast, depth bin) is appended through the per-worker connection
    ``thread_con``; rows with any missing field are dropped first.

    Parameters
    ----------
    f : str
        Path to a ``.mat`` file.

    Returns
    -------
    None
    """
    base = os.path.basename(f)
    # Parse the buoy ID from the filename. A plain comprehension is used here:
    # np.vectorize without otypes raises ValueError on an empty station index.
    matches = [s for s in station_gdf.index if base.startswith(s + '_')]
    if len(matches) != 1:
        print(f'Could not find location for file {f}')
        return
    location_id = matches[0]

    if not base.startswith(f'{location_id}_CTD'):
        # Skip this file, it's a different format (like pH)
        return

    matdata = loadmat(f)
    # There's a weird cast in the Dabob Bay file that has identical date/time
    # to a later one. I'm removing the earlier one by nan'ing the dates (all
    # the other data gets dropped with dropna() later).
    if base == 'HC_DB_CTD_data_bin_web.mat':
        matdata['Btime'][:,1333] = np.nan
    # Arrays are two-dimensional, with axis 0 corresponding to pressure bins
    # and axis 1 corresponding to cast.
    n_bins, n_casts = matdata['Btime'].shape
    # Column-major flattening keeps all bins of one cast together
    days = matdata['Btime'].flatten('F')
    # Btime is added as a number of days to 1999-12-31 at a fixed UTC-8 offset
    epoch = pd.Timestamp('1999-12-31').tz_localize(-8*3600)
    casts = pd.DataFrame({
        "datetime": epoch + pd.to_timedelta(days, unit='D'),
        "depth": matdata['Bdepth'].flatten('F')
    })
    casts['location_id'] = location_id
    casts['source_id'] = nwem_source_id
    # Assign cast id's based on source array shape. Generate one UUID per cast,
    # then repeat it for the number of pressure levels
    casts['cast_id'] = np.repeat([uuid.uuid4() for _ in range(n_casts)], n_bins)
    data = {
        'temp': matdata['Btemp'].flatten('F'),
        'salt': matdata['Bsal'].flatten('F'),
        'o2': matdata['Boxy_mgL_cal'].flatten('F'),
        'chla': matdata['Bfluor_cal'].flatten('F'),
        'no3': matdata['Bnitrate_cal'].flatten('F')
    }

    for parameter_id, values in data.items():
        # Build a fresh frame per parameter instead of overwriting columns on
        # the shared one, then drop every row that has a missing field
        observations = casts.assign(value=values, parameter_id=parameter_id).dropna()
        observations.to_sql('observations', con=thread_con, schema='obsdata', index=False, if_exists='append')

# Spread the matching files over a pool of worker processes. db_init runs in each
# worker first, so every worker has its own database connection for extract_nwem.
nwem_paths = glob.glob(nwem_files)
with Pool(initializer=db_init) as pool:
    pool.map(extract_nwem, nwem_paths)

Unnamed: 0,datetime,depth,location_id,source_id,cast_id
0,2010-04-12 12:15:52.000041599-08:00,1.175078,PW,224,1d41442d-84e5-42c3-af11-dfbdd70c2445
1,2010-04-12 12:16:06.749990400-08:00,2.46825,PW,224,1d41442d-84e5-42c3-af11-dfbdd70c2445
2,2010-04-12 12:16:11.000006400-08:00,3.449111,PW,224,1d41442d-84e5-42c3-af11-dfbdd70c2445
3,2010-04-12 12:16:15.499977599-08:00,4.515778,PW,224,1d41442d-84e5-42c3-af11-dfbdd70c2445
4,2010-04-12 12:16:19.749993600-08:00,5.467125,PW,224,1d41442d-84e5-42c3-af11-dfbdd70c2445
