<a href="https://colab.research.google.com/github/adamggibbs/marine-carbonate-system-ml-prediction/blob/master/data_cleaning_ship.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# SET UP ENVIRONMENT 

# mount google drive for data storage and access
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

!pip install PyCO2SYS

Mounted at /content/drive
Collecting PyCO2SYS
  Downloading PyCO2SYS-1.7.0.1-py3-none-any.whl (103 kB)
[K     |████████████████████████████████| 103 kB 5.5 MB/s 
[?25hCollecting openpyxl>=3
  Downloading openpyxl-3.0.7-py2.py3-none-any.whl (243 kB)
[K     |████████████████████████████████| 243 kB 41.0 MB/s 
Collecting autograd-latest==1.3
  Downloading autograd_latest-1.3-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 5.4 MB/s 
Installing collected packages: openpyxl, autograd-latest, PyCO2SYS
  Attempting uninstall: openpyxl
    Found existing installation: openpyxl 2.5.9
    Uninstalling openpyxl-2.5.9:
      Successfully uninstalled openpyxl-2.5.9
Successfully installed PyCO2SYS-1.7.0.1 autograd-latest-1.3 openpyxl-3.0.7


In [None]:
# IMPORTS

import os
import sys

import pandas as pd
import numpy as np

import PyCO2SYS as pyco2


In [None]:
# STORE DIRECTORY WHERE DATA IS STORED

# absolute path of the data folder; keep the trailing '/' because
# file and folder names are appended to this string directly
data_dir = '/content/drive/MyDrive/Adam Gibbs/data/'
# folder holding the shipboard csv files, also ending in '/'
ship_dir = f'{data_dir}ship/'

# Ship Data
The following cells parse the ship data files. This section is split up into the following three code cells:

1.   Display Raw Data
2.   Parsing Function
3.   Parse Data and Create Input Array

In [None]:
# READ DATA INTO A PANDAS DATAFRAME TO DISPLAY

# show what is in the ship data folder
print('Files in ship data folder:')
display(os.listdir(ship_dir))

# build one dataframe per entry in the folder, skipping the 'raw' entry;
# rows containing any NaN are removed and the index is renumbered from 0
dfs_ship = []
for file in os.listdir(ship_dir):
  if file != 'raw':
    df = pd.read_csv(ship_dir + file)
    dfs_ship.append(df.dropna().reset_index(drop=True))

# summary lines, surrounded by one blank line on each side
summary = "There are {0} dataframes from {0} shipboard data files".format(len(dfs_ship))
print('', summary, "Adjust the index below to toggle display.", '', sep='\n')

# positions in dfs_ship of the dataframes to show
for shown in (1, 2):
  display(dfs_ship[shown])

Files in ship data folder:


['raw',
 'mltrain_ship052019.csv',
 'mltrain_ship072019.csv',
 'mltrain_ship052021.csv']


There are 3 dataframes from 3 shipboard data files
Adjust the index below to toggle display.



Unnamed: 0,EXPOCODE,SECT_ID,STNNBR,CASTNO,BTLNBR,BTLNBR_FLAG_W,DATE,TIME,LATITUDE,LONGITUDE,CTDPRS,CTDTMP,CTDSAL,CTDSAL_FLAG_W,SALNTY,SALNTY_FLAG_W,CTDOXY,CTDOXY_FLAG_W,CTDPHIN_TOT,CTDPHIN_TOT_FLAG_W,OXYGEN,OXYGEN_FLAG_W,SILCAT,SILCAT_FLAG_W,NITRAT,NITRAT_FLAG_W,NITRIT,NITRIT_FLAG_W,PHSPHT,PHSPHT_FLAG_W,AMMONIA,AMMONIA_FLAG_W,TCARBN,TCARBN_FLAG_W,ALKALI,ALKALI_FLAG_W,PH_TOT,PH_TOT_FLAG_W,PHTMP,CHL_GFF,PHAEO_GFF,CHLPHAEO_FLAG_W,PH_INSITU_QF
0,C3PO,C1,1,1,1,1,20190723,2311,36.7982,-121.8462,202.3,8.5788,34.0377,2.0,34.0378,2,76.3,2,7.5775,2,101.9,4.0,43.182,2,31.881,2,0.118,2,2.660,4,0.049,2,2236.39,2,2259.50,2,7.478651,2,20,0.11,0.27,2,2.0
1,C3PO,C1,1,1,2,1,20190723,2311,36.7982,-121.8462,202.1,8.5774,34.0382,2.0,-999.0000,9,76.3,2,7.5797,2,78.4,2.0,40.007,2,29.474,2,0.145,2,2.837,4,-999.000,9,2236.65,2,-999.00,9,7.476772,2,20,0.13,0.31,2,2.0
2,C3PO,C1,1,1,3,1,20190723,2313,36.7982,-121.8462,151.2,8.8875,33.9995,2.0,-999.0000,9,84.8,2,7.5987,2,86.2,2.0,30.830,2,24.863,4,0.076,2,2.097,2,0.111,2,2226.93,2,2249.60,2,7.492670,2,20,0.11,0.33,2,2.0
3,C3PO,C1,1,1,4,1,20190723,2314,36.7982,-121.8462,101.3,9.4388,33.9235,2.0,-999.0000,9,100.2,2,7.6253,2,100.8,2.0,32.856,2,28.613,2,0.124,2,1.991,2,0.026,2,2208.39,2,-999.00,9,7.522627,2,20,0.30,0.48,2,2.0
4,C3PO,C1,1,1,5,1,20190723,2316,36.7982,-121.8462,80.6,9.6299,33.8970,2.0,33.9025,2,107.3,2,7.6357,2,107.2,2.0,30.828,2,27.744,2,0.154,2,2.036,2,0.037,2,2202.01,2,2239.36,2,7.535321,2,20,0.41,0.74,2,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389,C3PO,73_55,20,1,8,1,20190729,1122,35.4770,-121.6088,51.5,11.4272,33.5325,2.0,-999.0000,9,192.9,4,7.9007,2,112.7,2.0,11.288,2,14.023,2,0.363,2,1.384,2,-999.000,9,2111.97,2,-999.00,9,7.577144,2,20,-999.00,-999.00,9,2.0
390,C3PO,73_55,20,1,9,1,20190729,1123,35.4770,-121.6088,25.3,13.4387,33.4912,2.0,-999.0000,9,234.7,4,8.0312,2,189.7,2.0,4.048,2,4.032,2,0.286,2,0.890,2,-999.000,9,2063.08,2,2236.52,2,7.746035,2,20,-999.00,-999.00,9,2.0
391,C3PO,73_55,20,1,10,1,20190729,1124,35.4770,-121.6088,10.7,15.8185,33.4956,2.0,-999.0000,9,260.9,4,8.1212,2,237.7,2.0,1.609,2,0.725,2,0.151,2,0.585,2,-999.000,9,2020.79,2,-999.00,9,7.890863,2,20,-999.00,-999.00,9,2.0
392,C3PO,73_55,20,1,11,1,20190729,1125,35.4770,-121.6088,5.3,15.8475,33.4995,2.0,-999.0000,9,262.2,2,8.1420,2,259.6,2.0,1.259,2,0.214,2,0.196,2,0.404,2,-999.000,9,2010.13,2,2238.70,2,7.990843,2,20,-999.00,-999.00,9,2.0


Unnamed: 0,EXPOCODE,SECT_ID,STNNBR,CASTNO,BTLNBR,BTLNBR_FLAG_W,DATE,TIME,LATITUDE,LONGITUDE,DEPTH,CTDPRS,CTDTMP,CTDSAL,CTDSAL_FLAG_W,SALNTY,SALNTY_FLAG_W,CTDOXY,CTDOXY_FLAG_W,DENS,OXYGEN,OXYGEN_FLAG_W,PH_TOT,PH_TOT_FLAG_W,PH_TMP,TCARBN,TCARBN_FLAG_W,NITRIT,NITRIT_FLAG_W,TALK,TALK_FLAG_W,PH_INSITU_QF
0,C3PO21,C1,1,1,1,1,20210508,1324,36.7978,-121.8465,5000,223.1,8.3491,34.0516,2.0,-999,9,73.3,2,1.02749,-999.000000,9.0,7.475188,2,20,2241.59,2,0.47,2,2270.77,4,2.0
1,C3PO21,C1,1,1,2,1,20210508,1325,36.7978,-121.8465,5000,201.2,8.4376,34.0454,2.0,-999,9,77.0,2,1.02737,-999.000000,9.0,7.480015,2,20,2237.71,2,0.47,2,2270.36,2,2.0
2,C3PO21,C1,1,1,3,1,20210508,1327,36.7978,-121.8465,5000,151.7,8.6281,34.0243,2.0,-999,9,84.8,2,1.02710,-999.000000,9.0,7.493929,2,20,2227.76,2,0.50,2,2265.04,2,2.0
3,C3PO21,C1,1,1,4,1,20210508,1329,36.7978,-121.8465,5000,100.4,8.7826,34.0049,2.0,-999,9,91.7,2,1.02683,-999.000000,9.0,7.508994,2,20,2222.55,2,0.41,2,2261.42,2,2.0
4,C3PO21,C1,1,1,5,1,20210508,1331,36.7978,-121.8465,5000,81.1,8.8404,33.9955,2.0,-999,9,96.8,2,1.02673,-999.000000,9.0,7.519989,2,20,2216.86,2,0.35,2,2259.26,2,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
271,C3PO21,80_70,23,1,8,1,20210514,38,33.8167,-121.8430,5000,302.1,7.2120,34.0700,2.0,-999,9,69.0,2,1.02804,76.871802,2.0,7.450365,2,20,2255.56,2,-999.00,9,2283.22,2,2.0
272,C3PO21,80_70,23,1,9,1,20210514,42,33.8167,-121.8430,5000,202.6,8.6914,34.0252,2.0,-999,9,105.4,2,1.02732,116.237214,2.0,7.546563,2,20,2209.23,2,-999.00,9,2265.80,2,2.0
273,C3PO21,80_70,23,1,10,1,20210514,46,33.8167,-121.8430,5000,100.6,10.4428,33.5476,2.0,-999,9,171.0,2,1.02620,185.334094,2.0,7.680108,2,20,2135.00,2,-999.00,9,2232.72,2,2.0
274,C3PO21,80_70,23,1,11,1,20210514,50,33.8167,-121.8430,5000,9.9,14.2987,32.9877,2.0,-999,9,257.4,2,1.02461,278.939792,2.0,7.951371,2,20,2014.16,2,-999.00,9,2214.63,2,2.0


In [None]:
# CREATE FUNCTION TO CREATE A NUMPY ARRAY OF INPUTS FROM
# SHIP DATA FILE

def process_ship_file(file):
  '''
  process_ship_file()
    description:
      This function reads in a data file in csv format and creates a
      pandas dataframe from it. It keeps only the columns used below,
      drops every row that has a NaN in one of those columns, and then
      drops every row whose quality control (QC) flag for salinity,
      oxygen, total carbon or pH is not 2. The pH of the remaining rows
      is recomputed with PyCO2SYS at the CTD temperature and pressure of
      each sample, and the result is split into an input array and an
      output array.

    args:
      file: string that contains the path of the csv file to read

    returns:
      inputs: 2-D numpy array of strings, one row per retained sample,
        with the columns DATE, LATITUDE, LONGITUDE, CTDPRS, CTDTMP,
        CTDSAL, OXYGEN in that order
      outputs: 1-D numpy array of strings with the recomputed pH of each
        retained sample

    raises:
      KeyError: if one of the required columns is missing from the file
  '''

  # columns that end up in the input array, in output order
  input_columns = ['DATE', 'LATITUDE', 'LONGITUDE', 'CTDPRS', 'CTDTMP',
                   'CTDSAL', 'OXYGEN']
  # QC flag columns; a row is "good" only if every one of these equals 2
  flag_columns = ['CTDSAL_FLAG_W', 'OXYGEN_FLAG_W', 'TCARBN_FLAG_W',
                  'PH_TOT_FLAG_W']

  # read in csv and take subset of data we care about
  df = pd.read_csv(file)
  df = df[input_columns + ['TCARBN', 'PH_TOT'] + flag_columns]

  # drop any row with NaN values
  df = df.dropna(axis=0, how='any')

  # remove all bad data in one pass: keep a row only when all of its
  # QC flags are 2, then renumber the rows from 0
  good = (df[flag_columns] == 2).all(axis=1)
  df = df[good].reset_index(drop=True)

  # Convert shipboard pH to in situ pH
  # par1_type=3 and par2_type=2 are the PyCO2SYS codes for pH and
  # dissolved inorganic carbon; the "_out" arguments are the conditions
  # at which pH is recomputed.
  # NOTE(review): the input temperature is hard-coded to 25, but the
  # sample rows displayed earlier in this notebook show PHTMP / PH_TMP
  # values of 20 -- confirm the temperature the pH was measured at.
  results = pyco2.sys(par1=df['PH_TOT'], par1_type=3,
                      par2=df['TCARBN'], par2_type=2,
                      temperature=25, pressure=0,
                      temperature_out=df['CTDTMP'],
                      pressure_out=df['CTDPRS'],
                      salinity=df['CTDSAL'])

  # overwrite the shipboard pH with the recomputed value
  df['PH_TOT'] = results['pH_out']

  # take subset of only parameters for inputs
  # these arrays contain only "good" data points
  inputs = df[input_columns]
  outputs = df['PH_TOT']

  # convert to numpy arrays of strings (callers save them with fmt='%s')
  inputs = inputs.to_numpy(dtype='str')
  outputs = outputs.to_numpy(dtype='str')

  return inputs, outputs


def process_ship_dir(dir, save_txt=False, save_csv=False):
  '''
  process_ship_dir()
    description:
      This function takes a dir with shipboard data, processes each
      file in that dir with process_ship_file() and stacks the results
      into one input array and one output array. The entry named 'raw'
      is skipped. It also has the option to save those arrays as .txt
      and/or .csv files; the files are written into the folder named by
      the notebook-level variable data_dir.

    args:
      dir: string that contains the path of the directory to process;
        it must end with a trailing '/' because file names are appended
        to it directly (the name shadows the builtin dir() but is kept
        so existing callers keep working)
      save_txt (default=False): boolean of whether to save a .txt
      save_csv (default=False): boolean of whether to save a .csv

    returns:
      inputs: 2-D numpy array with 7 columns (DATE, LATITUDE, LONGITUDE,
        CTDPRS, CTDTMP, CTDSAL, OXYGEN), rows from all files stacked
      outputs: 1-D numpy array with one pH value per row of inputs
  '''
  # list all data files in directory
  print('Parsing the following files:')

  # start each list with an empty array so that a directory without
  # data files still returns arrays of shape (0, 7) and (0,)
  input_parts = [np.empty((0, 7))]
  output_parts = [np.empty(0)]

  # os.listdir() returns names in arbitrary order; sort them so the row
  # order of the result is the same on every run
  for file in sorted(os.listdir(dir)):
    if file == 'raw':
      continue
    # report the path that is actually being parsed
    print(dir + file)
    input_array, output_array = process_ship_file(dir + file)
    input_parts.append(input_array)
    output_parts.append(output_array)

  # stack everything once instead of re-copying the arrays per file
  inputs = np.concatenate(input_parts, axis=0)
  outputs = np.concatenate(output_parts, axis=0)

  # the two formats are independent: asking for both writes both
  # (np.savetxt writes the header as a comment line starting with '# ')
  if save_txt:
    input_header = 'DATE LATITUDE LONGITUDE CTDPRS CTDTMP CTDSAL OXYGEN'
    np.savetxt(data_dir + 'ship_tpso_input.txt', inputs,
               fmt='%s', header=input_header)
    output_header = 'TOT_PH'
    np.savetxt(data_dir + 'ship_ph_output.txt', outputs,
               fmt='%s', header=output_header)

  if save_csv:
    input_header = 'DATE, LATITUDE, LONGITUDE, CTDPRS, CTDTMP, CTDSAL, OXYGEN'
    np.savetxt(data_dir + 'ship_tpso_input.csv', inputs,
               fmt='%s', delimiter=",", header=input_header)
    output_header = 'TOT_PH'
    np.savetxt(data_dir + 'ship_ph_output.csv', outputs,
               fmt='%s', delimiter=",", header=output_header)

  return inputs, outputs


In [None]:
# PARSE SHIP DATA AND CREATE INPUT ARRAY

# process every file in the ship folder and save the arrays as .csv
ship_inputs, ship_outputs = process_ship_dir(ship_dir, save_csv=True)

# only the dimensions are shown; uncomment a display() line below
# to look at the contents of an array as well

# input array
# display(ship_inputs)
print('Dimensions of input array:')
display(np.shape(ship_inputs))

# output array
# display(ship_outputs)
print('Dimensions of output array:')
display(np.shape(ship_outputs))

Parsing the following files:
/content/drive/MyDrive/Adam Gibbs/data/ship/mltrain_ship052019.csv
/content/drive/MyDrive/Adam Gibbs/data/ship/mltrain_ship072019.csv
/content/drive/MyDrive/Adam Gibbs/data/ship/mltrain_ship052021.csv
Dimensions of input array:


(747, 7)

Dimensions of output array:


(747,)

In [None]:
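# Dimensions of output array:
# (748,)
# NOTE(review): the saved output of the previous cell shows (747,), not
# (748,) -- confirm which row count is expected.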