<a href="https://colab.research.google.com/github/adamggibbs/marine-carbonate-system-ml-prediction/blob/master/data_preprocessing_ship.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# SET UP ENVIRONMENT 

# mount google drive for data storage and access
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

#lib install
!pip install PYCO2SYS

In [None]:
# IMPORTS

import os
import sys

import pandas as pd
import numpy as np

import PyCO2SYS as pyco2


In [None]:
# STORE DIRECTORY WHERE DATA IS STORED

# absolute path to the root data folder on the mounted drive;
# the trailing '/' is required because other paths are built from it
# by plain string concatenation
data_dir = '/content/drive/MyDrive/Adam Gibbs/data/'

# sub-folder holding the shipboard data files (also ends with '/')
ship_dir = f'{data_dir}ship/'

# Ship Data
The following cells parse the ship data files. This section is split up into the following three code cells:

1.   Display Raw Data
2.   Parsing Function
3.   Parse Data and Create Input Array

In [None]:
# READ DATA INTO A PANDAS DATAFRAME TO DISPLAY

# list the folder once (sorted so the order is the same on every run) and
# skip sub-directories and hidden files, which pd.read_csv cannot parse
ship_files = sorted(
    name for name in os.listdir(ship_dir)
    if not name.startswith('.') and os.path.isfile(os.path.join(ship_dir, name))
)

print('Files in ship data folder:')
display(ship_files)

# one dataframe per file, keeping only rows that have no missing values
dfs_ship = []
for file in ship_files:
  df = pd.read_csv(os.path.join(ship_dir, file))
  dfs_ship.append(df.dropna().reset_index(drop=True))

print()
print("There are {0} dataframes from {0} shipboard data files".format(len(dfs_ship)))
print("Adjust the index below to toggle display.")
print()

# guard the preview so an empty folder does not raise an IndexError
if dfs_ship:
  display(dfs_ship[0])
else:
  print('No shipboard data files found in ' + ship_dir)

In [None]:
# CREATE FUNCTION TO CREATE A NUMPY ARRAY OF INPUTS FROM
# SHIP DATA FILE

def process_ship_file(file):
  '''
  process_ship_file()
    description:
      Reads one shipboard data file in csv format into a pandas
      dataframe, keeps only the columns needed below, drops every row
      with a missing value, and then drops every row whose quality
      control (QC) flag is not 2 for salinity, oxygen, total carbon or
      pH. The remaining pH values are replaced by the 'pH_out' result of
      PyCO2SYS, evaluated at the CTD temperature and pressure of each
      sample. Finally the input columns (see input_cols) and the
      converted pH are returned as numpy arrays.

    args:
      file: string that contains file name (path) of dataset

    returns:
      inputs: array of shape (n_good_rows, 7) with the columns
        DATE, LATITUDE, LONGITUDE, CTDPRS, CTDTMP, CTDSAL, OXYGEN
      outputs: array of shape (n_good_rows,) with the converted pH

    raises:
      KeyError: if the file lacks one of the required columns
  '''
  # parameters that become model inputs, in output column order
  input_cols = ['DATE', 'LATITUDE', 'LONGITUDE', 'CTDPRS', 'CTDTMP',
                'CTDSAL', 'OXYGEN']
  # QC flag columns; a row is kept only if every one of them equals 2
  flag_cols = ['CTDSAL_FLAG_W', 'OXYGEN_FLAG_W', 'TCARBN_FLAG_W',
               'PH_TOT_FLAG_W']
  # every column this function reads from the file
  needed_cols = input_cols + flag_cols + ['TCARBN', 'PH_TOT']

  # read in csv and take subset of data we care about
  df = pd.read_csv(file)[needed_cols]
  # drop any row with NaN values
  df = df.dropna(axis=0, how='any').reset_index(drop=True)

  # remove all bad data in one pass
  # "bad data" has a QC flag that isn't 2 for at least one parameter
  is_good = (df[flag_cols] == 2).all(axis=1)
  df = df.loc[is_good].reset_index(drop=True)

  # Convert shipboard pH to in situ pH
  # skipped when no rows survive QC so the solver never sees
  # zero-length inputs; the empty arrays are returned as they are
  if not df.empty:
    # per the PyCO2SYS docs par1_type=3 is pH and par2_type=2 is DIC;
    # temperature=25 / pressure=0 are the input conditions, the *_out
    # arguments are the conditions the result is evaluated at
    results = pyco2.sys(par1=df['PH_TOT'], par1_type=3,
                        par2=df['TCARBN'], par2_type=2,
                        temperature=25, pressure=0,
                        temperature_out=df['CTDTMP'],
                        pressure_out=df['CTDPRS'],
                        salinity=df['CTDSAL'])
    df['PH_TOT'] = results['pH_out']

  # take subset of only parameters for inputs and convert to numpy
  # these arrays contain only "good" data points
  inputs = df[input_cols].to_numpy()
  outputs = df['PH_TOT'].to_numpy()

  return inputs, outputs


def process_ship_dir(dir, save_txt=False, save_csv=False):
  '''
  process_ship_dir()
    description:
      Processes every shipboard data file in a directory with
      process_ship_file() and stacks the results into one input array
      and one output array. Files are processed in sorted name order;
      sub-directories and hidden files (names starting with '.') are
      skipped. Optionally saves the arrays as .txt and/or .csv files.
      Saved files are written to the notebook-level data_dir, not
      to dir.

    args:
      dir: string that contains the path of the directory holding the
        data files (a trailing '/' is optional)
      save_txt (default=False): boolean of whether to save a .txt
      save_csv (default=False): boolean of whether to save a .csv
        (both may be True, in which case both formats are written)

    returns:
      inputs: array of shape (n_rows, 7)
      outputs: array of shape (n_rows,)
  '''
  # sorted so the row order is reproducible between runs; directories
  # and hidden files cannot be parsed as data files
  files = sorted(
      name for name in os.listdir(dir)
      if not name.startswith('.') and os.path.isfile(os.path.join(dir, name))
  )

  # list all data files that will be parsed
  print('Parsing the following files:')
  print(files)

  # start from empty arrays so the result has the right shape even
  # when the directory holds no data files
  input_parts = [np.empty((0,7))]
  output_parts = [np.empty(0)]

  # loop through all data files and collect their arrays
  for name in files:
    input_array, output_array = process_ship_file(os.path.join(dir, name))
    input_parts.append(input_array)
    output_parts.append(output_array)

  # concatenate once instead of re-copying the arrays on every iteration
  inputs = np.concatenate(input_parts, axis=0)
  outputs = np.concatenate(output_parts, axis=0)

  # if desired save arrays as .txt files
  if save_txt:
    input_header = 'DATE LATITUDE LONGITUDE CTDPRS CTDTMP CTDSAL OXYGEN'
    np.savetxt(data_dir + 'ship_tpso_input.txt', inputs, 
               fmt='%s', header=input_header)
    output_header = 'TOT_PH'
    np.savetxt(data_dir + 'ship_ph_output.txt', outputs, 
               fmt='%s', header=output_header)

  # if desired save arrays as .csv files (independent of save_txt)
  if save_csv:
    input_header = 'DATE, LATITUDE, LONGITUDE, CTDPRS, CTDTMP, CTDSAL, OXYGEN'
    np.savetxt(data_dir + 'ship_tpso_input.csv', inputs, 
               fmt='%s', delimiter=",", header=input_header)
    output_header = 'TOT_PH'
    np.savetxt(data_dir + 'ship_ph_output.csv', outputs, 
               fmt='%s', delimiter=",", header=output_header)

  return inputs, outputs


In [None]:
# PARSE SHIP DATA AND CREATE INPUT ARRAY

# process every file in the ship folder and also save the arrays as .csv
ship_inputs, ship_outputs = process_ship_dir(ship_dir, save_csv=True)

# for each array: its title, the array itself, and the title of its shape
ship_report = (
    ('Input array:', ship_inputs, 'Dimensions of input array:'),
    ('Output array:', ship_outputs, 'Dimensions of output array:'),
)

# show each array followed by its dimensions
for array_title, array, dims_title in ship_report:
  print(array_title)
  display(array)

  print(dims_title)
  display(array.shape)