### Description
This notebook contains all code used in the creation of the ToRies_Labprocessed2017_to_2021.csv file.

In [25]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import dateparser
from pytz import timezone
import shutil
import datetime

In [26]:
# Root of the data tree, relative to this notebook's directory
data_dir = '../../data/'

In [27]:
# Output locations for the concatenated CSV and the per-workbook metadata text files
concat_dir = data_dir + 'SeaGrant-TA_DIC-MassBayData/concat/'
metadata_dir = concat_dir + 'metadata/'

# exist_ok=True makes the cell safe to re-run and avoids the race between
# checking for a directory and creating it.
os.makedirs(concat_dir, exist_ok=True)
os.makedirs(metadata_dir, exist_ok=True)

# SeaGrant-TA_DIC-MassBayData/ Concatenation

In [28]:
# Locations of the four lab-processed workbooks
path2017 = data_dir + 'SeaGrant-TA_DIC-MassBayData/ToRies_Labprocessed2017sharedSeaGlass.xlsx'
path2018 = data_dir + 'SeaGrant-TA_DIC-MassBayData/ToRies_Labprocessed2018_sharedSeaGlass.xlsx'
path2019 = data_dir + 'SeaGrant-TA_DIC-MassBayData/ToRies_Labprocessed2019_sharedSeaGlass.xlsx'
path20202021 = data_dir + 'SeaGrant-TA_DIC-MassBayData/ToRies_Labprocessed2020and2021_sharedSeaGlass.xlsx'

# sheet_name=None loads every worksheet, returning a dict keyed by sheet name
sheet2017 = pd.read_excel(path2017, sheet_name=None)
sheet2018 = pd.read_excel(path2018, sheet_name=None)
sheet2019 = pd.read_excel(path2019, sheet_name=None)
sheet20202021 = pd.read_excel(path20202021, sheet_name=None)

In [29]:
# Pull the data sheet and the readme (metadata) sheet out of each workbook.
# Sheet names differ between workbooks, so each pair is spelled out.
data2017, metadata2017 = sheet2017['batch_all2017p'], sheet2017['readme']
data2018, metadata2018 = sheet2018['batch_all2018p'], sheet2018['readme']
data2019, metadata2019 = sheet2019['batch_all2019p'], sheet2019['readme2019']
data20202021, metadata20202021 = sheet20202021['batch12_13_14'], sheet20202021['readme']

### Fix datetime issues 2017

In [30]:
# Parse string-valued 2017 sample times into datetime objects.
# Non-string cells are left untouched.
for ind, date_val in list(data2017['SampleDateTime'].items()):
    if not isinstance(date_val, str):
        continue
    date = dateparser.parse(date_val)
    # dateparser.parse returns None when it cannot interpret the string.
    # Print the raw value when parsing failed or the year is not 2017 so
    # the entry can be inspected by hand.
    if date is None or date.year != 2017:
        print(date_val)
    # An unparsable string is stored as None.
    data2017.loc[ind, 'SampleDateTime'] = date

  date_obj = stz.localize(date_obj)


### Fix datetime issues 2018

In [31]:
# Reformat unrecognized strings and convert them to datetime (2018), in one pass.
# Non-string cells are left untouched.
for ind, raw in list(data2018['SampleDateTime'].items()):
    if not isinstance(raw, str):
        continue
    text = raw
    # Strings with no space at position 2 or 4 get spaces inserted after the
    # 4th and 6th characters before parsing. The length guard keeps short
    # strings from raising IndexError; they are passed to the parser as-is.
    if len(text) > 4 and text[4] != " " and text[2] != " ":
        text = text[0:4] + " " + text[4:6] + " " + text[6:]
    # dateparser.parse returns None for strings it cannot interpret
    data2018.loc[ind, 'SampleDateTime'] = dateparser.parse(text)

### Fix datetime issues 2019

In [32]:
# Convert each 2019 sample time with to_pydatetime() and write it back.
# The column is snapshotted first so the loop never reads a cell it has rewritten.
for ind, stamp in list(data2019['SampleDateTime'].items()):
    data2019.loc[ind, 'SampleDateTime'] = stamp.to_pydatetime()

### Fix datetime issues 2020-2021

In [33]:
# Normalize 2020-2021 sample times cell by cell
for ind, row in data20202021.iterrows():
    stamp = row['PROF_DATE_TIME_LOCAL']
    if isinstance(stamp, pd.Timestamp):
        # pandas timestamps are converted with to_pydatetime()
        data20202021.loc[ind, 'PROF_DATE_TIME_LOCAL'] = stamp.to_pydatetime()
    elif isinstance(stamp, str):
        # free-text dates go through dateparser
        data20202021.loc[ind, 'PROF_DATE_TIME_LOCAL'] = dateparser.parse(stamp)

### Fix column inconsistencies

In [34]:
# 2020-2021 column name -> column name it is stored under in the combined data
column_conversion = {
    'BottleLabel': 'Station_D',
    'EVENT_ID': 'SampleID',
    'STAT_ID': 'StationID',
    'PROF_DATE_TIME_LOCAL': 'SampleDateTime',
    'LATITUDE': 'Latitude',
    'LONGITUDE': 'Longitude',
    'DEPTH (m)': 'Depth',
    'CONDTVY (mS/cm)': 'Conductivity',
    'DO_RAW (mg/L)': 'Dissolved Oxygen (Model 43)',
    'FLU_RAW (ug/L)': 'Chla Fluor',
    'PCT_SAT_RAW (PCT)': 'DO % Saturation',
    'pH ()': 'pH <2>',
    'SAL (PSU)': 'Salinity',
    'SIGMA_T ()': 'Sigma-T',
    'TEMP (C)': 'Temperature',
    'TRANS (m-1)': 'Beam Attenuation',
    'AT': 'TA in (mmol/kgSW)',
    'CT': 'TCO2 in (mmol/kgSW)',
}

# Take each old column out of the frame and store it under its new name
for old_name, new_name in column_conversion.items():
    data20202021[new_name] = data20202021.pop(old_name)

# Remove sampling code column
data20202021.drop(columns='Unnamed: 7', inplace=True)

In [35]:
# Tag every 2020-2021 row with the name of the workbook it came from
source_label = 'ToRies_Labprocessed2020and2021_sharedSeaGlass'
data20202021['Data Source'] = source_label

### 

### Validate columns and concatenate

In [36]:
# Column-name sets for each year's frame
column_sets = [set(frame.columns) for frame in (data2017, data2018, data2019, data20202021)]

# Columns present in every frame, and columns present in at least one
intersection = set.intersection(*column_sets)
union = set.union(*column_sets)

In [37]:
# Columns that at least one frame has but not all frames share
not_in_common = [col for col in union if col not in intersection]
print(f'Columns not in common: {not_in_common}')

Columns not in common: ['Air Irradiance', 'Pressure', 'Field Replicate', 'Salinity - Carolina', 'Water Irradiance', 'Sampling Depth', 'Sampling Date', 'Salinity - Ries ', 'Sampling T', 'Comments', 'Sampled Bottle Name', 'VINDTA Sample Name', 'Run T']


In [38]:
# Stack all years into one frame; ignore_index=True discards the per-frame indexes
yearly_frames = [data2017, data2018, data2019, data20202021]
concat = pd.concat(yearly_frames, sort=False, ignore_index=True)

### Add CB Update to StationID and SampleID

In [39]:
cb_update = pd.read_csv(data_dir + 'SeaGrant-TA_DIC-MassBayData/ToRies_Labprocessed_CB_Update.csv')

# Columns whose values must all agree (within 0.0001) for two rows to match
match_cols = ['t(oC) out', 'P (dbars) out', 'pH out', 'fCO2 out (matm)']
# Identifier columns copied over from the matching CB update row
id_cols = ['StationID', 'SampleID', 'Station_D']

for ind, row in concat.iterrows():
    # Build a boolean mask over cb_update, narrowing it one column at a time
    is_match = pd.Series(True, index=cb_update.index)
    for col in match_cols:
        is_match &= abs(cb_update[col] - row[col]) < 0.0001
    match_row = cb_update[is_match]

    # Rows with zero or several candidates are left unchanged
    if len(match_row) != 1:
        continue
    for col in id_cols:
        concat.loc[ind, col] = match_row[col].values[0]

### Format Timestamps

In [40]:
# Drop microseconds, attach the US/Eastern time zone, and format each
# timestamp as an Excel-parsable string.
#
# A pytz zone must be attached with localize(). Passing it through
# datetime.replace(tzinfo=...) picks the zone's earliest recorded offset
# (local mean time, -04:56 for US/Eastern) rather than EST/EDT, which would
# write the wrong UTC offset into every exported timestamp.
eastern = timezone('US/Eastern')
timestamp_format = "%Y-%m-%d %H:%M:%S%z"

# tzinfo=None clears any zone already present, because localize() only
# accepts naive datetimes; the wall-clock time is kept as-is.
concat['SampleDateTime'] = [
    eastern.localize(date.replace(microsecond=0, tzinfo=None)).strftime(timestamp_format)
    for date in concat['SampleDateTime']
]

### Handle StationIDs

In [41]:
# Trim leading/trailing spaces from every string-valued station ID
for ind, station in list(concat['StationID'].items()):
    if type(station) is str:
        concat.loc[ind, 'StationID'] = station.strip(" ")

# Prefix the station ID with 'S' on rows whose SampleID is 'SBNMS'
for ind, row in concat.iterrows():
    if row['SampleID'] != 'SBNMS':
        continue
    concat.loc[ind, 'StationID'] = 'S' + row['StationID']

### Handle Field Replicates

In [42]:
# Flag field replicates: when exactly two rows share the same Depth and
# SampleDateTime, both are marked 'Y'. More than two sharing rows is
# reported as an error; a row that only matches itself is left alone.
for ind, row in concat.iterrows():
    same_sample = (concat['Depth'] == row['Depth']) & (concat['SampleDateTime'] == row['SampleDateTime'])
    replicate_inds = list(concat.index[same_sample])

    if len(replicate_inds) == 2:
        for rep_ind in replicate_inds:
            concat.loc[rep_ind, 'Field Replicate'] = 'Y'
    elif len(replicate_inds) > 2:
        print("ERROR")

### Export as csv

In [43]:
# Write the combined table without the row index column
output_csv = data_dir + 'SeaGrant-TA_DIC-MassBayData/concat/ToRies_Labprocessed2017_to_2021.csv'
concat.to_csv(output_csv, index=False)

#### Convert metadata to text files

In [44]:
def df_to_txt(df, txt_file):
    """Write a DataFrame to a plain-text file, one line per row.

    Every cell is converted with ``str``. Cells whose string form is
    ``'nan'`` are written as a single space. Cells within a row are joined
    with single spaces and rows are joined with newlines; no trailing
    newline is written.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to serialize. Column names and the index are not written.
    txt_file : str
        Path of the text file to create; an existing file is overwritten.
    """
    row_arr = []
    for _, row in df.iterrows():
        cell_arr = [" " if str(cell) == 'nan' else str(cell) for cell in row]
        row_arr.append(" ".join(cell_arr))

    # The context manager closes the handle even if the write raises
    with open(txt_file, "w") as text_file:
        text_file.write("\n".join(row_arr))

In [45]:
"""
df_to_txt(metadata2017,
          data_dir + 'SeaGrant-TA_DIC-MassBayData/concat/metadata/ToRies_Labprocessed2017sharedSeaGlass_metadata.txt')
df_to_txt(metadata2018,
          data_dir + 'SeaGrant-TA_DIC-MassBayData/concat/metadata/ToRies_Labprocessed2018_sharedSeaGlass_metadata.txt')
df_to_txt(metadata2019,
         data_dir + 'SeaGrant-TA_DIC-MassBayData/concat/metadata/ToRies_Labprocessed2019_sharedSeaGlass_metadata.txt')
df_to_txt(metadata20202021,
data_dir + 'SeaGrant-TA_DIC-MassBayData/concat/metadata/ToRies_Labprocessed2020and2021_sharedSeaGlass_metadata.txt')
"""

"\ndf_to_txt(metadata2017,\n          data_dir + 'SeaGrant-TA_DIC-MassBayData/concat/metadata/ToRies_Labprocessed2017sharedSeaGlass_metadata.txt')\ndf_to_txt(metadata2018,\n          data_dir + 'SeaGrant-TA_DIC-MassBayData/concat/metadata/ToRies_Labprocessed2018_sharedSeaGlass_metadata.txt')\ndf_to_txt(metadata2019,\n         data_dir + 'SeaGrant-TA_DIC-MassBayData/concat/metadata/ToRies_Labprocessed2019_sharedSeaGlass_metadata.txt')\ndf_to_txt(metadata20202021,\ndata_dir + 'SeaGrant-TA_DIC-MassBayData/concat/metadata/ToRies_Labprocessed2020and2021_sharedSeaGlass_metadata.txt')\n"