# load_ecology_ctd

This notebook extracts observation data for temperature, salinity, dissolved oxygen, pH, and chlorophyll fluorescence from Washington Department of Ecology CTD data.

To acquire the data, visit [EIM](https://apps.ecology.wa.gov/eim/search/Eim/EIMSearch.aspx?) and search for the study ID "MarineWater". Restrict the date range of each search to a few years at a time: I found that download requests were less likely to succeed once the number of records requested at once exceeded about 2 million.

Each set of results can be downloaded by submitting a download request. If successful, within a few hours, you are emailed a link to download a .zip file containing several .csv files. Put all the downloaded .zip files from every request in the directory `data/ecology/ctd` and this notebook should handle the rest.

In [1]:
# Glob pattern matching every EIM download archive (.zip) this notebook should process
ecology_ctd_files = "data/ecology/ctd/*.zip"

import glob
import zipfile
import re
import uuid
from multiprocessing import Pool
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import Point
import db

First, read all the location details from all the zip files.

In [2]:
# Member-name patterns for the two CSV files of interest inside each EIM archive
loc_re = re.compile(r"^EIMLocationDetails_.*\.csv$")
ctd_re = re.compile(r"^EIMContinuousDepthSeriesData_.*\.csv$")

# EIM column header -> our short column name. The rename is done by name
# because read_csv ignores the order of `usecols` and returns the columns in
# whatever order the file has them; a positional rename could silently swap
# latitude and longitude.
location_columns = {
    'Location_ID': 'name',
    'Location_Description': 'description',
    'Calculated_Latitude_Decimal_Degrees_NAD83HARN': 'y',
    'Calculated_Longitude_Decimal_Degrees_NAD83HARN': 'x',
}

location_gdfs = []
for zip_path in glob.glob(ecology_ctd_files):
    with zipfile.ZipFile(zip_path) as zf:
        # Find the file in this zip that matches the location filename pattern.
        # A default is supplied so that a missing file is reported instead of
        # surfacing as a bare StopIteration.
        loc_file = next((n for n in zf.namelist() if loc_re.match(n)), None)
        if loc_file is None:
            print(f'No location details file found in {zip_path}; skipping')
            continue
        with zf.open(loc_file) as csv_file:
            df = pd.read_csv(csv_file, usecols=list(location_columns))
    df = df.rename(columns=location_columns)
    # See https://anitagraser.com/2019/01/23/from-csv-to-geodataframe-in-two-lines/
    gdf = gpd.GeoDataFrame(
        df.drop(['x', 'y'], axis=1),
        crs='epsg:6318',
        geometry=[Point(xy) for xy in zip(df.x, df.y)])
    location_gdfs.append(gdf)

if not location_gdfs:
    # pd.concat would otherwise fail with an opaque "No objects to concatenate"
    raise FileNotFoundError(
        f'No location details found in any archive matching {ecology_ctd_files}')

# One row per station name, indexed by name, reprojected from the source CRS
# (EPSG:6318) to EPSG:32610 with the geometry column named 'geom'
locations = gpd.GeoDataFrame(
    pd.concat(location_gdfs, ignore_index=True).drop_duplicates(subset=['name'])
).set_index('name').rename_geometry('geom').to_crs(epsg=32610)
locations.head()

Unnamed: 0_level_0,description,geom
name,Unnamed: 1_level_1,Unnamed: 2_level_1
RSR837,Strait of Georgia: Rosario Strait - Peapod Rock,POINT (517470.419 5384851.335)
CMB006,Commencement Bay - Mouth of City WW,POINT (542614.497 5234397.690)
CSE002,Case Inlet - Off Rocky Point,POINT (514099.002 5244447.120)
DYE004,Dyes Inlet - NE of Chico Bay,POINT (523419.029 5274483.975)
ELD001,Eld Inlet - Flapjack Point,POINT (503921.564 5217019.981)


Create an Ecology source if it doesn't already exist

In [3]:
engine = db.connect()


def _ecology_rows(sources):
    """Return the rows of *sources* for the WA Ecology "MarineWater" study."""
    is_ecology = sources['agency'] == "WA Ecology"
    is_marine_water = sources['study'] == "MarineWater"
    return sources.loc[is_ecology & is_marine_water]


df = pd.read_sql_table("sources", con=engine, schema='obsdata', index_col='id')
ecology_source_row = _ecology_rows(df)
if ecology_source_row.shape[0] == 0:
    # Not registered yet: append the source row to obsdata.sources
    new_source = pd.DataFrame({"agency": ["WA Ecology"], "study": ["MarineWater"]})
    new_source.to_sql('sources', con=engine, schema='obsdata', index=False, if_exists='append')

    # Re-read the table so the newly assigned primary key is available
    df = pd.read_sql_table("sources", con=engine, schema='obsdata', index_col='id')
    ecology_source_row = _ecology_rows(df)

# The table is indexed by 'id', so the first index label is the source's key
ecology_source_id = ecology_source_row.index[0]
print(ecology_source_id)
df

1


Unnamed: 0_level_0,agency,study
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,WA Ecology,MarineWater


Save any new locations to the DB, then load the entire stations table to ensure we have everything

In [4]:
stations_query = 'SELECT * FROM obsdata.stations'
locations_existing = gpd.read_postgis(stations_query, con=engine, index_col='name')

# Keep only the stations whose name is not already present in the database
already_stored = locations.index.isin(locations_existing.index)
locations = locations.loc[~already_stored]

if len(locations) == 0:
    # Nothing new to insert; the table as read above is already complete
    locations = locations_existing
else:
    locations.to_postgis('stations', con=engine, schema='obsdata', index=True, index_label='name', if_exists='append')
    print(f'Loaded {len(locations)} locations')

    # Reload so `locations` holds the full stations table, new rows included
    locations = gpd.read_postgis(stations_query, con=engine, index_col='name')
locations.head()

Unnamed: 0_level_0,description,geom
name,Unnamed: 1_level_1,Unnamed: 2_level_1
RSR837,Strait of Georgia: Rosario Strait - Peapod Rock,POINT (517470.419 5384851.335)
CMB006,Commencement Bay - Mouth of City WW,POINT (542614.497 5234397.690)
CSE002,Case Inlet - Off Rocky Point,POINT (514099.002 5244447.120)
DYE004,Dyes Inlet - NE of Chico Bay,POINT (523419.029 5274483.975)
ELD001,Eld Inlet - Flapjack Point,POINT (503921.564 5217019.981)


Read the WQ parameters table

In [5]:
# Water quality parameter lookup table from obsdata.parameters, indexed by 'key'
parameters = pd.read_sql_table(
    'parameters',
    con=engine,
    schema='obsdata',
    index_col='key',
)
parameters

Unnamed: 0_level_0,name,unit
key,Unnamed: 1_level_1,Unnamed: 2_level_1
temp,Temperature,deg_c
salt,Salinity,psu
o2,Dissolved Oxygen,mgl
nh4,Ammonia,umol
no3,Nitrate,umol
no2,Nitrite,umol
no23,Nitrate+Nitrite,umol
chla,Chlorophyll-A,ugl
ph,pH,ph
po4,"Phosphate, dissolved",umol


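Map our database's parameter names to the names Ecology uses for the same water quality parameters. The `name` column of the in-memory parameters table is rewritten with the Ecology names so that Ecology results can later be joined to our parameter keys.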

In [6]:
# Our parameter name -> the name Ecology uses for that parameter
parameter_map = {
    "Temperature": "Temperature, water",
    "Salinity": "Salinity",
    "Dissolved Oxygen": "Dissolved Oxygen",
    "pH": "pH",
    "Chlorophyll-A": "Fluorescence",
}
# Swap in the Ecology name wherever one is defined; other names are left as they are
parameters['name'] = parameters['name'].map(lambda own_name: parameter_map.get(own_name, own_name))
parameters

Unnamed: 0_level_0,name,unit
key,Unnamed: 1_level_1,Unnamed: 2_level_1
temp,"Temperature, water",deg_c
salt,Salinity,psu
o2,Dissolved Oxygen,mgl
nh4,Ammonia,umol
no3,Nitrate,umol
no2,Nitrite,umol
no23,Nitrate+Nitrite,umol
chla,Fluorescence,ugl
ph,pH,ph
po4,"Phosphate, dissolved",umol


Finally, do the work of extracting the CTD data from each downloaded archive. Spawn a pool of worker processes to speed up processing, as the connection
latency to a remote database can significantly slow things down when processing millions of observations.

In [7]:
def db_init() -> None:
    """Pool initializer: open a database connection for the current worker.

    The connection returned by ``db.connect()`` is stored in the module-level
    global ``thread_con``, which ``extract_ctds`` passes to ``to_sql``.
    """
    global thread_con
    thread_con = db.connect()

def extract_ctds(f):
    """Load the CTD observations from one EIM download archive into the database.

    Reads the continuous depth series CSV inside the zip file *f*, keeps the
    results whose parameter name appears in the module-level ``parameters``
    table, assigns a cast ID to each group of observations, and appends the
    rows to ``obsdata.observations`` over the per-worker connection
    ``thread_con`` (set up by ``db_init``).

    Archives that contain no depth series file, or no usable rows, are
    reported and skipped.

    :param f: path to a .zip file downloaded from EIM
    """
    # EIM column header -> our column name. Renaming is done by name because
    # read_csv ignores the order of `usecols` and returns columns in file order.
    ctd_columns = {
        'Location_ID': 'location_id',
        'Field_Collection_Date_Time': 'datetime',
        'Depth_Value': 'depth',
        'Result_Parameter_Name': 'parameter_name',
        'Result_Value': 'value',
        'Result_Value_Units': 'unit',
        'Result_Data_Qualifier': 'qa',
    }

    with zipfile.ZipFile(f) as zf:
        # next() gets a default on purpose: a StopIteration escaping from this
        # function inside a Pool.map worker is taken by the worker's
        # list(map(...)) as the normal end of its chunk, which would silently
        # skip the remaining archives in that chunk.
        ctd_file = next((name for name in zf.namelist() if ctd_re.match(name)), None)
        if ctd_file is None:
            print(f'No CTD depth series file found in {f}; skipping')
            return
        with zf.open(ctd_file) as csv_file:
            # The date column is referenced by name rather than by position
            df = pd.read_csv(csv_file, usecols=list(ctd_columns),
                             parse_dates=['Field_Collection_Date_Time'])

    df = df.rename(columns=ctd_columns)
    # TODO process the qa column?
    # Set all collection date/times to UTC, as this is what Ecology uses
    df['datetime'] = df['datetime'].dt.tz_localize('UTC')
    # Merge the parameters DF so we can get the FK. This is an inner merge, so
    # results for parameters we do not track are dropped here.
    df = df.merge(parameters.reset_index(), left_on='parameter_name', right_on='name')
    df['source_id'] = ecology_source_id
    # Clean up columns/names, and drop any NaNs
    df = df.rename(columns={'key': 'parameter_id'})
    df = df[['parameter_id', 'source_id', 'datetime', 'depth', 'value', 'location_id']].dropna()
    if df.empty:
        print(f'No usable CTD observations found in {f}; skipping')
        return

    # Cast detection so observations can be interpolated: every
    # (location, 30-minute window) group gets one fresh UUID.
    # See https://stackoverflow.com/a/48975426
    df['cast_id'] = 1  # placeholder column for transform() to broadcast over
    cast_groups = df.groupby(['location_id', pd.Grouper(key='datetime', freq='30min')])
    # Replace the whole column rather than writing through .loc, so the UUID
    # objects are not forced into the placeholder's integer dtype
    df['cast_id'] = cast_groups['cast_id'].transform(lambda g: uuid.uuid4())

    # Remove any cast IDs for parameters that were measured fewer than 5 times
    # (these cannot be interpolated reliably)
    per_parameter_counts = df.groupby(['cast_id', 'parameter_id'])['value'].transform('count')
    df.loc[per_parameter_counts < 5, 'cast_id'] = np.nan

    # Save to the database
    df.to_sql('observations', con=thread_con, schema='obsdata', index=False, if_exists='append')

# Fan the archives out over a process pool; db_init gives each worker its own connection
ctd_archives = glob.glob(ecology_ctd_files)
with Pool(initializer=db_init) as worker_pool:
    worker_pool.map(extract_ctds, ctd_archives)