### Description
This notebook contains all code used in the creation of the MWRA_TA_DIC_2017_to_2022.csv file.

# Concatenating MWRA and SeaGrant data

In [1]:
import pandas as pd
import datetime
import dateparser
from pytz import timezone

In [2]:
# Relative path prefix of the data directory; every input and output path in this
# notebook is built by appending a file path to it.
data_dir = '../../data/'

In [17]:
# Load both input tables; each file's timestamp column is parsed into datetimes on read.
mwra = pd.read_csv(
    data_dir + 'MassBay/concat/MassBay_2017_to_2022_upcast_update.csv',
    parse_dates=['STAT_ARRIV'],
)
seagrant = pd.read_csv(
    data_dir + 'SeaGrant-TA_DIC-MassBayData/concat/ToRies_Labprocessed2017_to_2021.csv',
    parse_dates=['SampleDateTime'],
)

#### Create column mappings

In [18]:
# Column mappings, written as {output column name: column name in the input file}.
# Insertion order matters: renamed columns are appended to each frame in this order.

# Output names -> column names of the MWRA upcast file
mwra_to_upcast_mwra = {
    'PROF_DATE_TIME_LOCAL': 'STAT_ARRIV',
    'SAMPLE_DEPTH_CODE': 'DEPTH_MARKER',
    'DEPTH (m)': 'PROFILE_DEPTH',
    'CONDTVY (mS/cm)': 'CONDTVY',
    'DISS_OXYGEN (mg/L)': 'DISS_OXYGEN',
    'FLUORESCENCE (ug/L)': 'FLUORESCENCE',
    'FLU_RAW (ug/L)': 'FLU_RAW',
    'PCT_SAT (PCT)': 'PCT_SAT',
    'pH ()': 'PH',
    'SAL (PSU)': 'SAL',
    'SIGMA_T ()': 'SIGMA_T',
    'TEMP (C)': 'TEMP',
    'TRANS (m-1)': 'TRANS',
}

# Output names -> column names of the SeaGrant file
mwra_to_seagrant = {
    'EVENT_ID': 'SampleID',
    'STAT_ID': 'StationID',
    'PROF_DATE_TIME_LOCAL': 'SampleDateTime',
    'LATITUDE': 'Latitude',
    'LONGITUDE': 'Longitude',
    'DEPTH (m)': 'Depth',
    'CONDTVY (mS/cm)': 'Conductivity',
    'DISS_OXYGEN (mg/L)': 'Dissolved Oxygen (Model 43)',
    'FLU_RAW (ug/L)': 'Chla Fluor',
    'PCT_SAT (PCT)': 'DO % Saturation',
    'pH ()': 'pH <2>',
    'SAL (PSU)': 'Salinity',
    'SIGMA_T ()': 'Sigma-T',
    'TEMP (C)': 'Temperature',
    'TRANS (m-1)': 'Beam Attenuation',
}

In [19]:
# Apply column mappings: move each input column under its output name.
# pop() removes the old column and the assignment re-adds its values under the new
# name, so a renamed column lands at the end of the frame (or overwrites an existing
# column of that name in place).
for target_name, source_name in mwra_to_upcast_mwra.items():
    mwra[target_name] = mwra.pop(source_name)

for target_name, source_name in mwra_to_seagrant.items():
    seagrant[target_name] = seagrant.pop(source_name)

#### Format timestamps

In [20]:
# Convert MWRA timestamps to timezone-aware Python datetimes, truncated to whole seconds
eastern = timezone('US/Eastern')
date_arr = mwra['PROF_DATE_TIME_LOCAL'].dt.to_pydatetime()

# A pytz zone has to be attached with localize(): datetime.replace(tzinfo=<pytz zone>)
# uses the zone's first historical offset (local mean time, -04:56 for US/Eastern)
# instead of the EST/EDT offset that applies on the given date.
# Missing timestamps (NaT) are passed through unchanged.
formatted = [
    date if pd.isnull(date) else eastern.localize(date.replace(microsecond=0))
    for date in date_arr
]

# Stored as an object column so the values stay plain datetime objects.
mwra["PROF_DATE_TIME_LOCAL"] = pd.Series(formatted, dtype="object")

In [21]:
# Snapshot of seagrant taken before the matching cell below writes MWRA values into it
seagrant_copy = seagrant.copy()

# Number of seagrant rows without a longitude. Kept as the last expression of the
# cell so the notebook actually displays it.
len(seagrant[seagrant['LONGITUDE'].isnull()])

#### Left-join MWRA data onto the SeaGrant rows for easier concatenation

In [22]:
# DIC TO UPCAST MATCHING
# For every seagrant row, look for the MWRA row recorded at the same position and
# (approximately) the same depth. When exactly one MWRA row qualifies, its non-null
# values are copied onto the seagrant row (sample codes, dates, and the MWRA
# latitude/longitude/depth themselves).
DEPTH_TOLERANCE = 3            # largest accepted |difference| in 'DEPTH (m)'
COORD_TOLERANCE = 0.000001     # largest accepted |difference| in LATITUDE / LONGITUDE

matches = 0
no_match = 0
overlaps = 0
for ind, row in seagrant.iterrows():
    depth = row['DEPTH (m)']
    lat = row['LATITUDE']
    lon = row['LONGITUDE']

    # A seagrant row with a missing depth/latitude/longitude never matches:
    # every comparison against NaN evaluates to False.
    match_row = mwra[
        ((mwra['DEPTH (m)'] - depth).abs() <= DEPTH_TOLERANCE)
        & ((mwra['LATITUDE'] - lat).abs() <= COORD_TOLERANCE)
        & ((mwra['LONGITUDE'] - lon).abs() <= COORD_TOLERANCE)
    ]

    if len(match_row) == 0:
        no_match += 1
    elif len(match_row) > 1:
        # Ambiguous: several MWRA rows qualify, so nothing is copied.
        overlaps += 1
    else:
        matches += 1
        match_ind = match_row.index[0]
        for mwra_col in mwra.columns:
            # 'Data Source' is skipped so the seagrant row keeps its own value.
            if mwra_col == 'Data Source':
                continue
            value = mwra.loc[match_ind, mwra_col]
            # Only non-null MWRA values overwrite what seagrant already holds.
            if not pd.isnull(value):
                seagrant.loc[ind, mwra_col] = value

print("Matches:", matches)
print("No Matches:", no_match)
print("Overlaps:", overlaps)

# Collect the EVENT_ID values that contain "WN"; non-string entries (e.g. NaN) are ignored
wn_samples = [
    event_id
    for event_id in seagrant['EVENT_ID']
    if isinstance(event_id, str) and "WN" in event_id
]

print(f'WN Samples in TA_DIC: {len(wn_samples)}')

Matches: 318
No Matches: 172
Overlaps: 0
WN Samples in TA_DIC: 318


In [27]:
# Latitude, after matching, of the rows whose latitude was missing in the pre-matching snapshot
seagrant.loc[seagrant_copy['LATITUDE'].isnull(), 'LATITUDE']

437   NaN
438   NaN
439   NaN
440   NaN
441   NaN
442   NaN
443   NaN
444   NaN
445   NaN
446   NaN
447   NaN
448   NaN
449   NaN
450   NaN
451   NaN
452   NaN
480   NaN
481   NaN
482   NaN
483   NaN
484   NaN
485   NaN
486   NaN
487   NaN
488   NaN
489   NaN
Name: LATITUDE, dtype: float64

#### Retrieve rows exclusive to MWRA file

In [123]:
# Outer Join (Concatenate) MWRA and Seagrant
combined = pd.concat([mwra, seagrant], ignore_index=True, sort=False)

# Drop MWRA samples that exist in seagrant data: rows sharing latitude, longitude and
# depth are collapsed to the last occurrence, which is the seagrant row because
# seagrant was concatenated last.
no_dupes = combined.drop_duplicates(
    subset=['LATITUDE', 'LONGITUDE', 'DEPTH (m)'],
    keep='last',
)

# Isolate remaining MWRA samples, identified by their 'Data Source' value
mwra_no_dupes = no_dupes[no_dupes['Data Source'] == 'MWRA_MassBay_upcast_2017-202205']

#### Concatenate the MWRA-only rows with the SeaGrant rows

In [124]:
# Stack the MWRA-only rows on top of the DIC (seagrant) rows and renumber the index
concat = pd.concat(
    [mwra_no_dupes, seagrant],
    ignore_index=True,
    sort=False,
)

#### Post-process concatenation

In [125]:
# Strip leading/trailing spaces from station IDs; values that are not str are left as they are
concat['STAT_ID'] = concat['STAT_ID'].map(
    lambda stat_id: stat_id.strip(" ") if type(stat_id) == str else stat_id
)

In [126]:
# Format timestamps: order rows chronologically, then render each timestamp as text
concat.sort_values(by='PROF_DATE_TIME_LOCAL', inplace = True)

# Output looks like "7/04/19 13:05": month without zero padding, zero-padded day,
# two-digit year. The month is taken from date.month because the "%-m" directive is a
# platform-specific strftime extension that is rejected on Windows.
concat["PROF_DATE_TIME_LOCAL"] = [
    f"{date.month}/{date.strftime('%d/%y %H:%M')}"
    for date in concat["PROF_DATE_TIME_LOCAL"]
]

In [127]:
# Reorder columns: VAL_QUAL goes directly after 'pH ()', 'Data Source' goes last
cols = list(concat.columns)

# Take VAL_QUAL out first so the position of 'pH ()' is looked up in the list without it
val_qual = cols.pop(cols.index('VAL_QUAL'))
cols.insert(cols.index('pH ()') + 1, val_qual)

# Move 'Data Source' from wherever it is to the end
cols.append(cols.pop(cols.index('Data Source')))

concat = concat[cols]

#### Export as csv

In [128]:
# Write the combined table to CSV without the index column.
# NOTE(review): this writes MWRA_TA_DIC_2017_to_2022_test.csv, while the notebook
# description and the metadata cell refer to MWRA_TA_DIC_2017_to_2022.csv — confirm
# which file name is intended.
concat.to_csv(data_dir + 'concat/MWRA_TA_DIC_2017_to_2022_test.csv', index = False)

In [129]:
# Concat metadata file
def print_column_descriptions(df):
    """Print a numbered description placeholder for every column of ``df``.

    One line is printed per entry of ``df.columns``, numbered from 1 in column
    order, in the form ``Column <n>: <name> is COLUMN DESCRIPTION``.

    Parameters
    ----------
    df : object exposing an iterable ``columns`` attribute (e.g. a DataFrame)

    Returns
    -------
    None
    """
    for position, col in enumerate(df.columns, start=1):
        print(f'Column {position}: {col} is COLUMN DESCRIPTION')
# Print the description template for the columns of the CSV read back from disk.
# NOTE(review): this reads MWRA_TA_DIC_2017_to_2022.csv, not the `_test` file written
# by the export cell — confirm this is the intended file.
print_column_descriptions(pd.read_csv(data_dir + 'concat/MWRA_TA_DIC_2017_to_2022.csv'))

Column 1: STUDY_ID is COLUMN DESCRIPTION
Column 2: EVENT_ID is COLUMN DESCRIPTION
Column 3: STAT_ID is COLUMN DESCRIPTION
Column 4: ORDERED_DEPTH_CODE is COLUMN DESCRIPTION
Column 5: SAMPLE_ID is COLUMN DESCRIPTION
Column 6: SAMPLE_DEPTH_TOP is COLUMN DESCRIPTION
Column 7: SAMPLE_DEPTH_BOTTOM is COLUMN DESCRIPTION
Column 8: LATITUDE is COLUMN DESCRIPTION
Column 9: LONGITUDE is COLUMN DESCRIPTION
Column 10: VAL_QUAL is COLUMN DESCRIPTION
Column 11: Data Source is COLUMN DESCRIPTION
Column 12: PROF_DATE_TIME_LOCAL is COLUMN DESCRIPTION
Column 13: SAMPLE_DEPTH_CODE is COLUMN DESCRIPTION
Column 14: DEPTH (m) is COLUMN DESCRIPTION
Column 15: CONDTVY (mS/cm) is COLUMN DESCRIPTION
Column 16: DISS_OXYGEN (mg/L) is COLUMN DESCRIPTION
Column 17: FLUORESCENCE (ug/L) is COLUMN DESCRIPTION
Column 18: FLU_RAW (ug/L) is COLUMN DESCRIPTION
Column 19: PCT_SAT (PCT) is COLUMN DESCRIPTION
Column 20: pH () is COLUMN DESCRIPTION
Column 21: SAL (PSU) is COLUMN DESCRIPTION
Column 22: SIGMA_T () is COLUMN DES