# load_cruises

This notebook extracts, transforms, and loads data from NANOOS-hosted Salish cruises (such as the UW PRISM cruises). Most of this data can be downloaded from http://nvs.nanoos.org/CruiseSalish.

Each downloaded file is a .zip file which contains at least one .csv file; typically, there is one downcast file and one file with both upcast and lab data. The format of this data changed around 2017, and this notebook is able to detect and handle both formats. Some of the cruises from the mid-2010s do not have the lab/bottle data included in the downloads on the website; I was able to request this missing data from NANOOS and it was supplied in NetCDF format. If you receive the same files, they can be placed in the directory `data/prism/netcdf/` and this notebook will process them to look for any data from cruises where it was unable to find lab data in the .csv's.

While there are common station names for most of the Salish cruises, they are not always in perfectly consistent locations. This may matter depending on your application of the data, so the locations are renamed with alphabetic letter suffixes (for instance, there may be P11, P11a, P11b, ..., P11z, P11aa, P11bb, ...) to ensure uniqueness even if the coordinates for these stations are very close together.

Each cruise is assigned to the database as an independent entry in the `sources` table.

In [1]:
# Glob pattern for the downloaded cruise archives; expanded with glob.glob()
# wherever the zip files are read
zip_files = "data/prism/Salish_Cruise-*.zip"
# Glob pattern for the separately supplied NetCDF bottle files; expanded with
# glob.glob() when the NetCDF data is processed
bottle_cdfs = "data/prism/netcdf/*_bottle.nc"

import glob
import zipfile
import re
import uuid
import csv
import string
from io import TextIOWrapper
from multiprocessing import Pool
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import Point
from netCDF4 import Dataset
import db

Station locations are interwoven into the CSVs, and their processing cannot be parallelized because many (but not all) of them are duplicated across files. Read through all the zip files first to collect the station names/locations and cruise IDs.

In [2]:
# Pattern that identifies the downcast CSV among the members of a cruise zip
downcast_re = re.compile('SalishCruise_.*_downcast\\.csv$')

def stripper(s):
    """Converter helper: return ``s`` with whitespace removed from both ends."""
    return s.strip()

def to_decdeg(degmin):
    """Convert a "degrees minutes direction" string into signed decimal degrees.

    The input is split on whitespace; the first three tokens are read as whole
    degrees, decimal minutes and a cardinal direction. 'S' and 'W' produce a
    negative result, any other direction a positive one.
    """
    deg_text, minute_text, direction = degmin.split()[:3]
    magnitude = int(deg_text) + float(minute_text) / 60
    if direction in ('S','W'):
        return magnitude * -1
    return magnitude * 1

def _station_suffix(occurrence):
    """Return the uniqueness suffix for the n-th (0-based) location sharing a station name.

    The first location gets no suffix, the next 26 get a-z, the next 26 get
    aa-zz, then aaa-zzz and so on, so the sequence cannot run out no matter how
    many distinct positions a station has.
    """
    if occurrence == 0:
        return ""
    repeats, letter = divmod(occurrence - 1, len(string.ascii_lowercase))
    return string.ascii_lowercase[letter] * (repeats + 1)

location_gdfs = []
source_dfs = []
for zip_path in glob.glob(zip_files):
    with zipfile.ZipFile(zip_path) as zf:
        # Find the file in this zip that matches the downcast filename pattern
        downcast_file = next((name for name in zf.namelist() if downcast_re.search(name)), None)
        if downcast_file is None:
            raise FileNotFoundError(f"No downcast CSV found in {zip_path}")
        with zf.open(downcast_file) as csv_file:
            # See https://anitagraser.com/2019/01/23/from-csv-to-geodataframe-in-two-lines/
            try:
                df = pd.read_csv(csv_file, skiprows = lambda x: x in [1,2], skipinitialspace=True,
                                 encoding="ISO-8859-1", converters={ 'Cruise ID': stripper, 'Station': stripper },
                                 usecols=['Cruise ID','Station','Latitude Deg','Longitude Deg'], na_values='None')
                df = df.loc[df['Station'] != 'None']
                # Rename by column name: read_csv returns the columns in file order
                # regardless of the order given in usecols, so relabelling by
                # position would silently mislabel them if the file layout differed
                df = df.rename(columns={
                    'Cruise ID': 'cruise',
                    'Latitude Deg': 'y',
                    'Longitude Deg': 'x',
                    'Station': 'name'
                })
            except ValueError:
                # read_csv raises ValueError when the old-format columns are absent
                csv_file.seek(0)
                # New format from ~2017 onwards. Pandas has a hard time with the header so ignore
                # it
                # NOTE(review): skiprows=2 combined with header=0 discards three rows in
                # total; confirm the new-format files really carry three header rows
                df = pd.read_csv(csv_file, skiprows=2, sep=',', encoding="ISO-8859-1", index_col=False, na_values='None',
                                 header=0, names=['Upload time','lat', 'lon', 'time', 'station','cruise'],
                                 usecols=['station','lat','lon','cruise'],
                                 converters={ 'lat': to_decdeg, 'lon': to_decdeg, 'station': stripper, 'cruise': stripper })
                df = df.rename(columns={'lat': 'y', 'lon': 'x', 'station': 'name'})
            df = df.dropna(subset=['name'])
            # Make a GeoDataFrame for the stations
            gdf = gpd.GeoDataFrame(
                df.drop(['cruise','x','y'],axis=1),
                crs='epsg:6318',
                geometry=[Point(xy) for xy in zip(df.x, df.y)])
            location_gdfs.append(gdf)
            source_dfs.append(pd.DataFrame(df['cruise'].drop_duplicates()))

if not location_gdfs:
    raise FileNotFoundError(f"No cruise archives matched {zip_files!r}")

# Preserve the different locations across the cruises. Drop duplicates on geometry, then append letter(s)
# to the duplicate names so they're unique. The first one doesn't get any suffix, then the next 26 get a-z, then
# the next 26 get aa-zz, and so on
locations = gpd.GeoDataFrame(pd.concat(location_gdfs, ignore_index=True).drop_duplicates(subset=['geometry']))
# cumcount() numbers the rows of each station name 0, 1, 2, ... in order of appearance
occurrence = locations.groupby("name").cumcount()
locations["name"] = locations["name"] + occurrence.map(_station_suffix)
if not locations["name"].is_unique:
    # Happens only if a raw station name already looks like a suffixed one (e.g. both "P1" x2 and "P1a")
    raise ValueError("Suffixed station names collide with existing station names")
locations = locations.set_index('name').rename_geometry('geom').to_crs(epsg=32610)
locations.head()

Unnamed: 0_level_0,geom
name,Unnamed: 1_level_1
P1,POINT (551839.304 5318404.955)
P28,POINT (540908.380 5283990.478)
P3,POINT (537914.065 5328503.637)
P4,POINT (533200.284 5343294.734)
P5,POINT (547312.002 5303619.781)


Assemble a DataFrame of all the cruise sources

In [3]:
# Combine the per-file cruise IDs into one de-duplicated table with a single
# 'study' column, and tag every cruise with the same agency
sources = (
    pd.concat(source_dfs, ignore_index=True)
    .drop_duplicates()
    .set_axis(['study'], axis=1)
    .assign(agency="Salish Cruise")
)
sources.head()

Unnamed: 0,study,agency
0,RC0007,Salish Cruise
1,AQ201610,Salish Cruise
2,CB1019,Salish Cruise
3,TN087,Salish Cruise
4,AQ201710,Salish Cruise


Save the cruise sources and locations

In [5]:
engine = db.connect()
# Both tables are appended to in the obsdata schema over the same connection
append_to_obsdata = dict(con=engine, schema='obsdata', if_exists='append')
locations.to_postgis('stations', index=True, index_label='name', **append_to_obsdata)
sources.to_sql('sources', index=False, **append_to_obsdata)
# Read the cruise rows back, indexed by 'id', so observations can reference
# the right foreign key values
cruise_query = "SELECT * FROM obsdata.sources WHERE agency='Salish Cruise'"
sources = pd.read_sql(cruise_query, con=engine, index_col='id')
sources.head()

Unnamed: 0_level_0,agency,study
id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,Salish Cruise,RC0007
3,Salish Cruise,AQ201610
4,Salish Cruise,CB1019
5,Salish Cruise,TN087
6,Salish Cruise,AQ201710


Process all the .zip files containing downcast or lab data.

In [7]:
# Patterns for the lab/upcast member of a cruise zip, which may be a CSV or an
# Excel workbook; process_file() searches the archive's file list with both
labupcast_csv_re = re.compile('SalishCruise_.*_labupcast\\.csv$')
labupcast_excel_re = re.compile('SalishCruise_.*_labupcast\\.xlsx$')

# Per-worker database connection (multiprocessing.Pool workers are processes)
def db_init():
    """Pool initializer: open a database connection for this worker.

    The connection is stored in the module-global ``thread_con``, which the
    worker functions pass as ``con=`` when writing observations.
    """
    global thread_con
    thread_con = db.connect()

# Casts with fewer observations of a parameter than this lose their cast ID
# for that parameter (they cannot be interpolated reliably)
MIN_CAST_OBSERVATIONS = 5


def _find_member(names, pattern):
    """Return the first name in ``names`` that ``pattern`` matches, or None."""
    return next((name for name in names if pattern.search(name)), None)


def _read_downcast(fileobj):
    """Read a downcast CSV in either format into a frame with normalized column names."""
    # Read the header with a csv.reader to figure out which format it is.
    # Detach the text wrapper afterwards so that garbage-collecting it cannot
    # close the underlying zip member before pandas reads it.
    wrapper = TextIOWrapper(fileobj, encoding="ISO-8859-1")
    header = next(csv.reader(wrapper))
    wrapper.detach()
    fileobj.seek(0)
    if "Cruise ID" in header:
        df = pd.read_csv(fileobj, skiprows = lambda x: x in [1,2], skipinitialspace=True,
                         encoding="ISO-8859-1", parse_dates=['UTC Time'],
                         converters={ 'Cruise ID': stripper, 'Station': stripper })
        df = df.loc[df['Station'] != 'None']
        df = df.rename(columns={
            'Cruise ID': 'cruise',
            'UTC Time': 'time',
            'Station': 'station',
            'Depth': 'depth',
            'Temperature': 'temp',
            'Salinity': 'salt',
            'Oxygen Concentration MG': 'o2',
            'Chlorophyll Fluorescence ': 'chla'
        })
    else:
        # New format from ~2017 onwards
        df = pd.read_csv(fileobj, skiprows=lambda r: r == 1, sep=',', encoding="ISO-8859-1", index_col=False,
                         parse_dates=['NMEAtimeUTC'], skipinitialspace=True,
                         converters={ 'Station': stripper, 'CruiseID': stripper })
        df = df.rename(columns={
            'NMEAtimeUTC': 'time',
            'Station': 'station',
            'CruiseID': 'cruise',
            'Cast': 'cast_id',
            'flECO-AFL: Fluorescence  WET Labs ECO-AFL/FL ': 'chla',
            'ph: pH': 'ph',
            't090C: Temperature ': 'temp',
            'depSM: Depth ': 'depth',
            'sbeox0Mg/L: Oxygen  SBE 43 ': 'o2',
            'sal00: Salinity  Practical ': 'salt'
        })
    return df.dropna(subset=['station'])


def _assign_cast_ids(df):
    """Replace (or create) the ``cast_id`` column with one UUID per cast."""
    if 'cast_id' in df.columns:
        # We already have cast identifiers, replace them with UUID's
        keys = ['station', 'cast_id']
    else:
        # No casts have been defined, so estimate with Grouper: rows at the same
        # station within the same 30-minute bin are treated as one cast
        df['cast_id'] = 1
        keys = ['station', pd.Grouper(key='time', freq='30min')]
    # Assign the whole column (rather than df.loc[:, ...]) so the UUID objects
    # replace the old column instead of being written into its numeric dtype
    df['cast_id'] = df.groupby(keys)['cast_id'].transform(lambda g: uuid.uuid4())
    return df


def _read_labupcast(fileobj, fmt):
    """Read a lab/upcast file ('csv' or 'excel') into a frame with normalized columns and UTC times."""
    if fmt == 'csv':
        lu_df = pd.read_csv(fileobj, skiprows = lambda x: x in [1,2], skipinitialspace=True,
                            encoding="ISO-8859-1", parse_dates=['UTC Time'],
                            converters={ 'Cruise ID': stripper, 'Station': stripper })
        lu_df = lu_df.loc[lu_df['Station'] != 'None']
        lu_df = lu_df.rename(columns={
            'Cruise ID': 'cruise',
            'UTC Time': 'time',
            'Station': 'station',
            'Depth': 'depth',
            'Oxygen Concentration MG Titration': 'o2',
            'Chlorophyll Concetration': 'chla',
            'Nitrate': 'no3',
            'Nitrite': 'no2',
            'Ammonium': 'nh4',
            'Phosphate': 'po4',
            'Silicate': 'sioh4'
        })
        # Set timezone (assign the column; assigning to `.dt` has no effect)
        lu_df['time'] = lu_df['time'].dt.tz_localize('UTC')
        return lu_df

    lu_df = pd.read_excel(fileobj)
    # Merge date and time into a single column
    # At least one of the files has invalid UTC time values because an offset was
    # naively added to produce "times" that look like 24:56:00. So work from the
    # local time columns instead
    lu_df['time'] = pd.to_datetime(lu_df["DATE_LOCAL"].astype(str) + " " + lu_df["TIME_LOCAL"].astype(str))
    lu_df['time'] = lu_df['time'].dt.tz_localize("US/Pacific")
    lu_df['time'] = lu_df['time'].dt.tz_convert('UTC')

    # Put stations in consistent format
    lu_df['station'] = "P" + lu_df['STATION_NO'].astype(str)
    lu_df = lu_df.rename(columns={
        'CRUISE_ID': 'cruise',
        'DEPTH (M)': 'depth',
        'OXYGEN_avg_mg_L': 'o2',
        'NITRATE_UMOL_L': 'no3',
        'NITRITE_UMOL_L': 'no2',
        'AMMONIUM_UMOL_L': 'nh4',
        'PHOSPHATE_UMOL_L': 'po4',
        'SILICATE_UMOL_L': 'sioh4'
    })
    # set o2 values to NaN if OXYGEN_FLAG_W is not 2
    lu_df.loc[lu_df['OXYGEN_FLAG_W'] != 2, 'o2'] = np.nan
    # set all nutrient values to NaN if NUTRIENTS_FLAG_W is not 2
    lu_df.loc[lu_df['NUTRIENTS_FLAG_W'] != 2, ['no3','no2','nh4','po4','sioh4']] = np.nan
    return lu_df


def _write_observations(frame, params, with_casts=False):
    """Append one long-format batch per parameter to obsdata.observations.

    ``frame`` must have the columns id, time, depth and station (plus cast_id
    when ``with_casts`` is true). Parameters missing from the frame are skipped
    because each spreadsheet has different columns depending on the data given.
    Writes go through the worker's ``thread_con`` connection.
    """
    for param in params:
        if param not in frame.columns:
            continue
        # Keep only the rows that actually have a value for this parameter
        view = frame.dropna(subset=[param])
        columns = {
            'source_id': view['id'],
            'datetime': view['time'],
            'depth': view['depth'],
            'value': view[param],
            'location_id': view['station']
        }
        if with_casts:
            columns['cast_id'] = view['cast_id']
        processed_data = pd.DataFrame(columns)
        processed_data['parameter_id'] = param
        if with_casts:
            # Remove any cast IDs for parameters that were measured too few times
            per_cast = processed_data.groupby('cast_id')['value'].transform('count')
            processed_data.loc[per_cast < MIN_CAST_OBSERVATIONS, 'cast_id'] = np.nan
        processed_data.to_sql('observations', con=thread_con, schema='obsdata',
                              index=False, if_exists='append')


# Process a single cruise zip file
def process_file(zip_filename):
    """Load the downcast and (if present) lab/upcast data of one cruise zip into the database.

    Parameters:
        zip_filename: path of a downloaded cruise .zip archive.

    Returns:
        A ``(cruise_id, bottle_found)`` tuple, where ``bottle_found`` tells
        whether the archive contained a lab/upcast file.

    Raises:
        FileNotFoundError: the archive has no downcast CSV.
        ValueError: no downcast row matched a known cruise in ``sources``.

    Relies on the module globals ``sources`` (cruise table indexed by id) and
    ``thread_con`` (set up by ``db_init``).
    """
    with zipfile.ZipFile(zip_filename) as zf:
        files = zf.namelist()
        # Process the downcast data
        downcast_file = _find_member(files, downcast_re)
        if downcast_file is None:
            raise FileNotFoundError(f"No downcast CSV found in {zip_filename}")
        with zf.open(downcast_file) as f:
            df = _read_downcast(f)

        df = _assign_cast_ids(df)

        # Eliminate any duplicate depth observations by taking the mean of all the
        # duplicated measurements
        df = df.groupby(['station','cruise','time','cast_id','depth']).mean(numeric_only=True).reset_index()

        # Set timezone. The column itself must be reassigned; assigning to
        # `df['time'].dt` only sets an attribute on a temporary Series
        df['time'] = df['time'].dt.tz_localize('UTC')

        # Assign source IDs
        df = df.merge(sources.reset_index(), left_on='cruise', right_on='study')
        if df.empty:
            raise ValueError(f"No downcast rows in {zip_filename} match a known cruise")

        _write_observations(df, ['temp','salt','o2','chla','ph'], with_casts=True)

        # If there is a lab/upcast file, extract the lab data. Only the first
        # match per pattern is used since there isn't supposed to be more than
        # one; if both a CSV and an Excel file exist, the Excel file is used
        labupcast_file = None
        found_format = None
        for fmt,rxp in (('csv', labupcast_csv_re), ('excel', labupcast_excel_re)):
            candidate = _find_member(files, rxp)
            if candidate is not None:
                labupcast_file = candidate
                found_format = fmt
        bottle_found = found_format is not None
        if bottle_found:
            with zf.open(labupcast_file) as f:
                lu_df = _read_labupcast(f, found_format)

            # Eliminate any duplicate depth observations by taking the mean of all the
            # duplicated measurements
            lu_df = lu_df.groupby(['station','cruise','time','depth']).mean(numeric_only=True).reset_index()

            # Prevent unique constraint violations from lab vs CTD oxygen/chla values by adding a minute
            # to the dates. At least some labupcast files use identical collection times for the bottles
            # as for the downcast.
            lu_df['time'] += pd.to_timedelta(1, 'min')

            # Assign source IDs
            lu_df = lu_df.merge(sources.reset_index(), left_on='cruise', right_on='study')

            _write_observations(lu_df, ['o2','chla','no3','no2','nh4','po4','sioh4'])
        # If there is no lab/upcast file, we'll take care of it when reading the netcdf files
    # Positional access: label 0 is not guaranteed to exist in general
    return (df['cruise'].iloc[0], bottle_found)

# Fan the archives out over a pool of workers, each with its own DB connection
cruise_zip_paths = glob.glob(zip_files)
with Pool(initializer=db_init) as workers:
    results = workers.map(process_file, cruise_zip_paths)
# Split the (cruise, bottle_found) pairs into a boolean Series indexed by cruise ID
cruises, bottles = zip(*results)
results_series = pd.Series(data=bottles, index=cruises)
results_series.head()

RC0007       True
AQ201610    False
CB1019      False
TN087        True
AQ201710     True
dtype: bool

Assemble a list of all the cruise IDs for which the worker processes did not find bottle data

In [8]:
# Cruise IDs whose zip archive had no lab/upcast file (bottle_found is False)
no_bottle_mask = (~results_series).to_numpy()
missing_bottle_cruises = results_series.index[no_bottle_mask]
missing_bottle_cruises

Index(['AQ201610', 'CB1019', 'CB1023', 'TN315', 'TN301', 'TN296', 'CB1050',
       'CB1041', 'CB1045', 'TN322', 'CB1075', 'MV1403', 'RBTSN2017', 'CB1065',
       'TN281', 'SH1604', 'CB1028', 'CAB771', 'CB1041', 'CB1034', 'TN333'],
      dtype='object')

Process NetCDF files which match the cruise IDs

In [9]:
def cdftime2pdtime(times):
    """Convert times given in days since year 0 into pandas timestamps.

    Day number 730486 corresponds to 2000-01-01, so the input is first turned
    into an offset from that date; converting the raw day counts directly
    would raise an OverflowError.
    """
    days_since_2000 = times - 730486
    offsets = pd.to_timedelta(days_since_2000, unit='D')
    return pd.Timestamp('1/1/2000') + offsets

# Now process NetCDF bottle data for missing years
def process_cdf(cdf):
    """Load the bottle data of one NetCDF file if its cruise still lacks lab data.

    Parameters:
        cdf: path of a ``*_bottle.nc`` file.

    Returns:
        None. Files whose cruise ID is not in ``missing_bottle_cruises`` are
        skipped; otherwise one batch per parameter is appended to
        obsdata.observations.

    Relies on the module globals ``missing_bottle_cruises``, ``sources`` and
    ``thread_con`` (set up by ``db_init``).
    """
    # The context manager closes the file on every exit path, including the
    # early return for cruises that already have bottle data
    with Dataset(cdf, "r") as ds:
        cruiseid = ds.cruise.strip()
        if cruiseid not in missing_bottle_cruises:
            # We already have this cruise
            return
        # Stations are integers. Leave them be for now...
        # Rows with fewer than four non-null values are discarded
        # (presumably station/time/depth plus at least one measurement)
        data = pd.DataFrame({
            "location_id": ds['station'][:],
            "datetime": cdftime2pdtime(ds['time_utc'][:]),
            "depth": ds['depth'][:],
            "o2": ds['oxygen_bottle'][:],
            "nh4": ds['ammonium_bottle'][:],
            "no3": ds['nitrate_bottle'][:],
            "no2": ds['nitrite_bottle'][:],
            "po4": ds['phosphate_bottle'][:],
            "sioh4": ds['silicate_bottle'][:],
            "chla": ds['chlorophyll_bottle'][:]
        }).dropna(thresh=4)
    # drop data with invalid station numbers; copy so the column assignments
    # below act on an independent frame rather than a slice
    data = data.loc[data['location_id'] > 0].copy()
    # Now we can fix the station names
    data["location_id"] = "P" + data["location_id"].astype(str)
    # Add source ID for the cruise
    data['source_id'] = sources.loc[sources['study'] == cruiseid].index[0]
    # NOTE(review): these timestamps stay timezone-naive even though they are read
    # from 'time_utc'; confirm the database interprets naive values as UTC
    # Prevent unique constraint violations from lab vs CTD oxygen/chla values by adding a minute
    # to the dates. At least some labupcast files use identical collection times for the bottles
    # as for the downcast.
    data['datetime'] += pd.to_timedelta(1, 'min')

    for param in ['o2','chla','no3','no2','nh4','po4','sioh4']:
        # Keep only the rows that have a value for this parameter and write
        # them in long format, tagged with the parameter name
        view = data.dropna(subset=[param])

        processed_data = pd.DataFrame({
            'source_id': view['source_id'],
            'datetime': view['datetime'],
            'depth': view['depth'],
            'value': view[param],
            'location_id': view['location_id']
        })
        processed_data['parameter_id'] = param
        processed_data.to_sql('observations', con=thread_con, schema='obsdata',
                              index=False, if_exists='append')

# Process every NetCDF bottle file in parallel, one DB connection per worker
bottle_cdf_paths = glob.glob(bottle_cdfs)
with Pool(initializer=db_init) as workers:
    workers.map(process_cdf, bottle_cdf_paths)