<a href="https://colab.research.google.com/github/adamggibbs/marine-carbonate-system-ml-prediction/blob/master/data_cleaning_glider.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# SET UP ENVIRONMENT
# This cell only works inside Google Colab: it relies on the google.colab
# package to make Google Drive visible under /content/drive.

# mount google drive for data storage and access
from google.colab import drive
# NOTE(review): force_remount=True presumably re-mounts the drive even when it
# is already mounted - confirm against the Colab documentation
drive.mount('/content/drive', force_remount=True)

#lib install (disabled; the matching PyCO2SYS import is commented out as well)
#!pip install PYCO2SYS

Mounted at /content/drive


In [None]:
# IMPORTS

import os
#import sys

import pandas as pd
import numpy as np

#import PyCO2SYS as pyco2


In [None]:
# LOCATION OF THE DATA ON THE MOUNTED DRIVE

# absolute path of the data folder; keep the trailing '/' because
# file and folder names are appended to this string directly
data_dir = '/content/drive/MyDrive/Adam Gibbs/data/'
# the glider files are read from the 'glider/' subfolder of the data folder
glider_dir = '{0}glider/'.format(data_dir)

# Glider Data
The following cells parse the glider data files. This section is split up into the following four code cells:

1.   Reading Glider Data File Function
2.   Display Raw Data
3.   Parsing Function
4.   Parse Data and Create Input Array



In [None]:
# FUNCTION TO READ GLIDER FILE IN A PANDAS DATAFRAME
def read_glider_file(file):
  '''
  read_glider_file()
    description:
      Reads one glider data file in csv format into a pandas
      dataframe, taking the column names from the first row of the
      file. Every row that has a missing value in any column is
      dropped and the remaining rows are renumbered starting at 0.

    args:
      file: path of the csv file (anything pd.read_csv accepts)

    returns:
      pandas dataframe holding only the complete rows of the file
  '''
  # the header row of the file supplies the column names
  df = pd.read_csv(file, header=0, sep=',')
  # how='any' removes a row as soon as a single one of its values is missing
  df = df.dropna(axis=0, how='any').reset_index(drop=True)

  return df


In [None]:
# READ DATA INTO PANDAS DATAFRAMES TO DISPLAY

# list the folder once so the names shown are exactly the names that get read
glider_files = os.listdir(glider_dir)

print('Files in glider data folder:')
display(glider_files)

# the entry called 'raw' is skipped by name; every other entry is read
# as a glider csv file
dfs_glider = []
for file in glider_files:
  if file == 'raw':
    continue
  dfs_glider.append(read_glider_file(glider_dir + file))

print()
print("There are {0} dataframes from {0} glider data files".format(len(dfs_glider)))
print("Adjust the index below to toggle which one is displayed.")
print()
display(dfs_glider[0])

Files in glider data folder:


['raw',
 '19502902.csv',
 '19702901.csv',
 '19A02901.csv',
 '20202901.csv',
 '20A02901.csv',
 '21202901.csv']


There are 6 dataframes from 6 shipboard data files
Adjust the index below to toggle which one is displayed.



Unnamed: 0,Cruise,Station,Type,DATE,TIME,LONGITUDE,LATITUDE,QF,PRS,PRS_QF,TMP,TMP_QF,SAL,SAL_QF,Sigma_theta,ST_QF,DEPTH,DEPTH_QF,OXYGEN,OXYGEN_QF,SATOXY,SATOXY_QF,NITRATE,NITRATE_QF,CHL_A,CHL_A_QF,BBP700,BBP700_QF,PH_INSITU,PH_INSITU_QF,BBP532,BBP_532_QF,CDOM,CDOM_QF,TALK_CANYONB,TALK_QF,DIC_CANYONB,DIC_QF,pCO2_CANYONB,pCO2_QF,SAT_AR_CANYONB,SAT_AR_QF,pH25C_1atm,pH25C_1atm_QF
0,19502902,1,C,05/28/2019,18:28,-121.842,36.793,0,56.36,0.0,8.997,0.0,33.917,0.0,26.527,0.0,55.937,0.0,152.25,8.0,53.8,8.0,-1.000000e+10,1.0,0.573,0.0,-1.000000e+10,1.0,7.7374,8.0,-1.000000e+10,1.0,-1.000000e+10,1.0,2260.0,8.0,2198.0,8.0,854.2,8.0,0.991,8.0,7.5181,8.0
1,19502902,1,C,05/28/2019,18:28,-121.842,36.793,0,56.08,0.0,8.997,0.0,33.916,0.0,26.525,0.0,55.659,0.0,136.35,8.0,48.2,8.0,-1.000000e+10,1.0,0.535,0.0,-1.000000e+10,1.0,7.7375,8.0,-1.000000e+10,1.0,-1.000000e+10,1.0,2260.0,8.0,2197.0,8.0,853.8,8.0,0.991,8.0,7.5182,8.0
2,19502902,1,C,05/28/2019,18:28,-121.842,36.793,0,55.52,0.0,9.000,0.0,33.913,0.0,26.520,0.0,55.103,0.0,127.93,8.0,45.2,8.0,-1.000000e+10,1.0,0.562,0.0,-1.000000e+10,1.0,7.7326,8.0,-1.000000e+10,1.0,-1.000000e+10,1.0,2259.0,8.0,2198.0,8.0,864.0,8.0,0.980,8.0,7.5137,8.0
3,19502902,1,C,05/28/2019,18:28,-121.842,36.793,0,55.00,0.0,9.006,0.0,33.912,0.0,26.515,0.0,54.587,0.0,122.19,8.0,43.2,8.0,-1.000000e+10,1.0,0.628,0.0,-1.000000e+10,1.0,7.7311,0.0,-1.000000e+10,1.0,-1.000000e+10,1.0,2259.0,8.0,2199.0,8.0,867.3,8.0,0.977,8.0,7.5124,8.0
4,19502902,1,C,05/28/2019,18:28,-121.842,36.793,0,54.52,0.0,9.010,0.0,33.910,0.0,26.511,0.0,54.111,0.0,118.14,8.0,41.8,8.0,-1.000000e+10,1.0,0.586,0.0,-1.000000e+10,1.0,7.7311,0.0,-1.000000e+10,1.0,-1.000000e+10,1.0,2259.0,8.0,2198.0,8.0,867.3,8.0,0.977,8.0,7.5124,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54047,19502902,190,C,06/10/2019,15:31,-121.828,36.792,0,2.60,0.0,10.617,0.0,33.711,0.0,25.851,0.0,2.581,0.0,162.81,0.0,59.5,0.0,-1.000000e+10,1.0,0.484,0.0,-1.000000e+10,1.0,7.8251,0.0,-1.000000e+10,1.0,-1.000000e+10,1.0,2248.0,0.0,2152.0,0.0,692.3,0.0,1.264,0.0,7.6199,0.0
54048,19502902,190,C,06/10/2019,15:31,-121.828,36.792,0,1.64,0.0,11.407,0.0,33.674,0.0,25.677,0.0,1.628,0.0,169.07,0.0,62.8,0.0,-1.000000e+10,1.0,0.570,0.0,-1.000000e+10,1.0,7.8451,0.0,-1.000000e+10,1.0,-1.000000e+10,1.0,2248.0,0.0,2141.0,0.0,660.3,0.0,1.357,0.0,7.6499,0.0
54049,19502902,190,C,06/10/2019,15:31,-121.828,36.792,0,0.80,0.0,11.758,0.0,33.651,0.0,25.590,0.0,0.794,0.0,177.19,0.0,66.3,0.0,-1.000000e+10,1.0,0.594,0.0,-1.000000e+10,1.0,7.8947,0.0,-1.000000e+10,1.0,-1.000000e+10,1.0,2248.0,0.0,2122.0,0.0,583.0,0.0,1.522,0.0,7.7019,0.0
54050,19502902,190,C,06/10/2019,15:31,-121.828,36.792,0,-0.08,0.0,12.479,0.0,33.590,0.0,25.402,0.0,-0.079,0.0,192.58,0.0,73.2,0.0,-1.000000e+10,1.0,0.885,0.0,-1.000000e+10,1.0,7.9310,0.0,-1.000000e+10,1.0,-1.000000e+10,1.0,2246.0,0.0,2103.0,0.0,532.0,0.0,1.681,0.0,7.7471,0.0


In [None]:
# CREATE FUNCTION TO CREATE A NUMPY ARRAY OF INPUTS FROM
# GLIDER DATA FILE

def process_glider_file(file, save_txt=False, save_csv=False):
  '''
  process_glider_file()
    description:
      This function reads in a glider data file in csv format and
      creates a pandas dataframe from it. It throws away everything
      before the first sample that was taken more than 24 hours after
      the first sample in the file, removes every row where one of
      the checked quality control flags is greater than 0, and puts
      the input parameters and the output parameter into numpy arrays
      of strings.

    args:
      file: string that contains file name of dataset
      save_txt (default=False): not used by this function; kept so
        that existing calls keep working
      save_csv (default=False): not used by this function; kept so
        that existing calls keep working

    returns:
      inputs: 2D numpy array of strings with the columns DATE
        (rewritten as YYYYMMDD), LATITUDE, LONGITUDE, PRS, TMP, SAL
        and OXYGEN
      outputs: 1D numpy array of strings with PH_INSITU for the same
        rows
  '''
  # a row is only kept when none of these flags is greater than 0
  qf_columns = ['PRS_QF', 'TMP_QF', 'SAL_QF', 'OXYGEN_QF', 'SATOXY_QF',
                'PH_INSITU_QF']
  input_columns = ['DATE', 'LATITUDE', 'LONGITUDE', 'PRS', 'TMP', 'SAL',
                   'OXYGEN']

  # read in csv
  df = read_glider_file(file)

  # throw away first day (nothing to do for a file without usable rows)
  if len(df) > 0:
    # DATE is MM/DD/YYYY and TIME starts with HH:MM; whole timestamps are
    # compared so that a deployment running past the end of a month is
    # handled as well
    timestamps = pd.to_datetime(
        df['DATE'].str[0:10] + ' ' + df['TIME'].str[0:5],
        format='%m/%d/%Y %H:%M')
    first_day_end = timestamps.iloc[0] + pd.Timedelta(days=1)
    past_first_day = (timestamps > first_day_end).to_numpy()
    # position of the first row after the first day; when no row
    # qualifies nothing is dropped
    drop_index = int(past_first_day.argmax()) if past_first_day.any() else 0
    df = df.iloc[drop_index:].reset_index(drop=True)

  # take only data we care about
  df = df[['DATE', 'LATITUDE', 'LONGITUDE', 'PRS', 'PRS_QF', 'TMP', 'TMP_QF',
          'SAL', 'SAL_QF', 'OXYGEN', 'OXYGEN_QF', 'SATOXY', 'SATOXY_QF',
          'PH_INSITU', 'PH_INSITU_QF', 'TALK_CANYONB', 'TALK_QF', 'DIC_CANYONB',
          'DIC_QF', 'pCO2_CANYONB', 'pCO2_QF']]

  # drop bad pressure, temperature, salinity, oxygen, saturated oxygen
  # and pH in one pass: a row goes as soon as one of its flags is > 0
  flagged = (df[qf_columns] > 0).any(axis=1)
  df = df[~flagged].reset_index(drop=True)

  # take subset of only parameters for inputs and change the date
  # format from MM/DD/YYYY to YYYYMMDD
  # this array contains only "good" data points
  date = df['DATE'].str
  inputs = df[input_columns].assign(DATE=date[6:10] + date[0:2] + date[3:5])
  outputs = df['PH_INSITU']

  # convert dataframe in numpy array
  inputs = inputs.to_numpy(dtype='str')
  outputs = outputs.to_numpy(dtype='str')

  # return the arrays
  return inputs, outputs


def process_glider_dir(dir, save_txt=False, save_csv=False):
  '''
  process_glider_dir()
    description:
      This function takes a dir with glider data and processes
      each file in that dir with process_glider_file(), skipping the
      entry called 'raw'. The per-file arrays are joined into one
      input array and one output array. It also has the option to
      save those arrays to .txt and/or .csv files inside data_dir.

    args:
      dir: string that contains the path of the folder holding the
        glider data files
      save_txt (default=False): boolean of whether to save a .txt
      save_csv (default=False): boolean of whether to save a .csv

    returns:
      inputs: 2D numpy array with one row per kept data point
      outputs: 1D numpy array with the matching output values
  '''
  # list all data files in directory
  print('Parsing the following files:')

  # per-file arrays are collected here and joined once after the loop
  input_arrays = []
  output_arrays = []

  # loop through all data files of the given directory; sorted() is used
  # because os.listdir() returns the names in arbitrary order
  for file in sorted(os.listdir(dir)):
    if file == 'raw':
      continue
    path = os.path.join(dir, file)
    print(path)
    input_array, output_array = process_glider_file(path)
    input_arrays.append(input_array)
    output_arrays.append(output_array)

  if input_arrays:
    inputs = np.concatenate(input_arrays, axis=0)
    outputs = np.concatenate(output_arrays, axis=0)
  else:
    # no data files found: return empty arrays of the usual shapes
    inputs = np.empty((0,7))
    outputs = np.empty(0)

  # if desired save arrays as .txt files
  if save_txt:
    input_header = 'DATE LATITUDE LONGITUDE PRS TMP SAL OXYGEN'
    np.savetxt(data_dir + 'glider_tpso_input.txt', inputs,
               fmt='%s', header=input_header)
    output_header = 'pH'
    np.savetxt(data_dir + 'glider_ph_output.txt', outputs,
               fmt='%s', header=output_header)

  # if desired save arrays as .csv files (independent of save_txt, so
  # asking for both formats writes both)
  if save_csv:
    input_header = 'DATE, LATITUDE, LONGITUDE, PRS, TMP, SAL, OXYGEN'
    np.savetxt(data_dir + 'glider_tpso_input.csv', inputs,
               fmt='%s', delimiter=",", header=input_header)
    output_header = 'pH'
    np.savetxt(data_dir + 'glider_ph_output.csv', outputs,
               fmt='%s', delimiter=",", header=output_header)

  return inputs, outputs


In [None]:
# BUILD THE INPUT AND OUTPUT ARRAYS FROM THE GLIDER DATA AND SAVE THEM AS CSV

glider_inputs, glider_outputs = process_glider_dir(glider_dir, save_csv=True)

# each array is shown first, followed by its dimensions
summary = [
  ('Input array:', glider_inputs),
  ('Dimensions of input array:', glider_inputs.shape),
  ('Output array:', glider_outputs),
  ('Dimensions of output array:', glider_outputs.shape),
]

print()
for title, value in summary:
  print(title)
  display(value)
  print()

Parsing the following files:
/content/drive/MyDrive/Adam Gibbs/data/glider/19502902.csv
/content/drive/MyDrive/Adam Gibbs/data/glider/19702901.csv
/content/drive/MyDrive/Adam Gibbs/data/glider/19A02901.csv
/content/drive/MyDrive/Adam Gibbs/data/glider/20202901.csv
/content/drive/MyDrive/Adam Gibbs/data/glider/20A02901.csv
/content/drive/MyDrive/Adam Gibbs/data/glider/21202901.csv

Input array:


array([['20190529', '36.747', '-122.061', ..., '5.947', '34.222',
        '20.52'],
       ['20190529', '36.747', '-122.061', ..., '5.952999999999999',
        '34.218', '20.28'],
       ['20190529', '36.747', '-122.061', ..., '5.97', '34.216', '20.03'],
       ...,
       ['20210527', '36.799', '-121.855', ..., '11.569', '33.369',
        '285.22'],
       ['20210527', '36.799', '-121.855', ..., '11.684', '33.304',
        '284.18'],
       ['20210527', '36.799', '-121.855', ..., '11.491', '33.503',
        '282.94']], dtype='<U32')


Dimensions of input array:


(1003739, 7)


Output array:


array(['7.537999999999999', '7.5375', '7.5363', ..., '8.0364', '8.0318',
       '8.0358'], dtype='<U32')


Dimensions of output array:


(1003739,)




In [None]:
# number of values in glider_outputs that lie below 7.3 or above 8.5
count = sum(1 for o in glider_outputs if float(o) < 7.3 or float(o) > 8.5)
display(count)

0