<a href="https://colab.research.google.com/github/adamggibbs/marine-carbonate-system-ml-prediction/blob/master/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; the data directories configured
# below live under /content/drive/MyDrive. force_remount=True asks Colab to
# mount again even when a mount is already present.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# IMPORTS

import os
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# data dirs
# data_dir holds the raw ship/glider/wcoa csv files that are loaded below;
# training_dir is where concatenate_prep_data writes the merged training files.
data_dir = '/content/drive/MyDrive/Adam Gibbs/data/'
training_dir=data_dir + 'training/'

# dyear: when True the date column is encoded as year + day_of_year/365;
# when False it is encoded as sin(2*pi*day_of_year/365) and saved files get
# a 'sin_' prefix.
dyear = True
# w_pres: when True the pressure column is replaced by
# pres/20000 + 1/(1 + exp(-pres/300))**3 and saved files get a 'pres_' prefix.
w_pres = True

In [None]:
# load in txt data

def load_str_csv(file_name):
  """Read a comma-delimited file from data_dir into an array of strings."""
  return np.loadtxt(data_dir+file_name, delimiter=",", dtype='str')

# ship observations
ship_input = load_str_csv('ship_tpso_input.csv')
ship_output = load_str_csv('ship_ph_output.csv')

# glider observations
glider_input = load_str_csv('glider_tpso_input.csv')
glider_output = load_str_csv('glider_ph_output.csv')

# wcoa observations
wcoa_input = load_str_csv('wcoa_tpso_input.csv')
wcoa_output = load_str_csv('wcoa_ph_output.csv')

# preview the first six rows of each array, input then output per source
for preview in (ship_input, ship_output,
                glider_input, glider_output,
                wcoa_input, wcoa_output):
  display(preview[:6])


array([['20190511', '36.798', '-121.8462', '202.6', '8.6075', '33.9971',
        '94.2'],
       ['20190511', '36.798', '-121.8462', '202.8', '8.581', '34.001',
        '93.3'],
       ['20190511', '36.798', '-121.8462', '152.1', '9.1717', '33.9202',
        '112.5'],
       ['20190511', '36.798', '-121.8462', '101.3', '9.5935', '33.8561',
        '129.6'],
       ['20190511', '36.798', '-121.8462', '61.0', '10.6014', '33.6941',
        '191.1'],
       ['20190511', '36.798', '-121.8462', '40.6', '11.0791', '33.6917',
        '196.9']], dtype='<U18')

array(['7.730970739061131', '7.727680727079802', '7.762957925077376',
       '7.796313545797834', '7.894515063962825', '7.922467194824854'],
      dtype='<U18')

array([['20190529', '36.747', '-122.061', '500.84', '5.947', '34.222',
        '20.52'],
       ['20190529', '36.747', '-122.061', '500.28', '5.952999999999999',
        '34.218', '20.28'],
       ['20190529', '36.747', '-122.061', '499.68', '5.97', '34.216',
        '20.03'],
       ['20190529', '36.747', '-122.061', '499.0', '5.971', '34.216',
        '19.92'],
       ['20190529', '36.747', '-122.061', '498.36', '5.973', '34.216',
        '19.81'],
       ['20190529', '36.747', '-122.061', '497.64', '5.975', '34.216',
        '19.7']], dtype='<U18')

array(['7.537999999999999', '7.5375', '7.5363', '7.5365', '7.5363',
       '7.5366'], dtype='<U18')

array([['20160518', '36.81', '-121.818', '15.0', '13.34', '33.143',
        '272.8'],
       ['20160518', '36.81', '-121.818', '5.1', '13.96',
        '33.138000000000005', '273.9'],
       ['20160518', '36.81', '-121.818', '3.4', '14.0', '33.139',
        '274.0'],
       ['20160518', '36.795', '-121.848', '251.2', '7.89', '34.05',
        '67.8'],
       ['20160518', '36.795', '-121.848', '151.0', '8.62', '33.954',
        '92.6'],
       ['20160518', '36.795', '-121.848', '100.5', '8.87', '33.908',
        '104.4']], dtype='<U19')

array(['8.034862400919026', '8.040221578006799', '8.042729240792925',
       '7.614367250629397', '7.649846263031173', '7.666234175745569'],
      dtype='<U18')

In [None]:
# CONCATENATE INPUTS
def concatenate_prep_data(input_arrays, output_arrays, 
                          save_csv=False, save_txt=False,
                          decimal_year=None, transform_pressure=None,
                          out_dir=None):
  """Merge per-source arrays and transform the date and pressure columns.

  Every array in input_arrays is a 2-D array of strings whose column 0 is a
  date (e.g. '20190511') and whose column 3 is a pressure. The source arrays
  are not modified; the transformations are applied to the concatenated copy.

  Column 0 becomes either year + day_of_year/365 (decimal year) or
  sin(2*pi*day_of_year/365). Column 3 optionally becomes
  pres/20000 + 1/(1 + exp(-pres/300))**3.

  Args:
    input_arrays: sequence of 2-D string arrays to stack row-wise.
    output_arrays: sequence of arrays of target values, stacked the same way.
    save_csv: write comma-delimited '.csv' training files when True.
    save_txt: write space-delimited '.txt' training files when True. Both
      flags may be set; both sets of files are then written.
    decimal_year: True for the decimal-year encoding, False for the sine
      encoding. None (default) uses the notebook-level `dyear` flag.
    transform_pressure: whether to transform column 3. None (default) uses
      the notebook-level `w_pres` flag.
    out_dir: directory for saved files. None (default) uses the
      notebook-level `training_dir`.

  Returns:
    (inputs, outputs): the concatenated arrays. `inputs` stays an array of
    strings; transformed values are stored as their full str(float) text.
  """
  # fall back to the notebook-level configuration for anything not passed in
  if decimal_year is None:
    decimal_year = dyear
  if transform_pressure is None:
    transform_pressure = w_pres

  inputs = np.concatenate(input_arrays)
  outputs = np.concatenate(output_arrays)

  # NumPy string arrays are fixed width and silently truncate longer values
  # on assignment. str(float) can be up to 24 characters long
  # (e.g. '-2.4492935982947064e-16'), so widen the array first; otherwise an
  # exponent can be cut off and the stored number is wrong.
  min_width = 24
  if inputs.dtype.kind == 'U':
    width = max(inputs.dtype.itemsize // 4, min_width)
    inputs = inputs.astype('<U%d' % width)

  # TRANSFORM DATE AND PRESSURE INPUTS

  # method to help transform date
  def date_to_nth_day(the_date):
    # 1-based day of the year (Jan 1 -> 1)
    return pd.to_datetime(the_date).dayofyear

  # Dates repeat across many rows, so each distinct date string is parsed
  # and transformed only once.
  encoded_dates = {}

  # loop through inputs and perform transformations
  for row in inputs:
    # adjust date
    raw_date = str(row[0])
    if raw_date not in encoded_dates:
      # NOTE(review): a fixed 365-day divisor means Dec 31 of a leap year
      # maps to 366/365 > 1 -- confirm whether that is intended.
      frac_year = date_to_nth_day(raw_date) / 365
      if decimal_year:
        encoded = int(raw_date[0:4]) + frac_year
      else:
        encoded = math.sin(2*math.pi*frac_year)
      encoded_dates[raw_date] = str(encoded)
    row[0] = encoded_dates[raw_date]
    # adjust pressure
    if transform_pressure:
      pres = float(row[3])
      row[3] = str(( pres/20000 ) + ( 1 / ( 1 + math.exp(-1*pres / 300) )**3 ))

  # if desired save arrays as .txt and/or .csv files
  if save_txt or save_csv: 

    if out_dir is None:
      out_dir = training_dir

    # file name prefix records which transformations were applied
    file_modifier = ''
    if transform_pressure:
      file_modifier = 'pres_'
    if not decimal_year:
      file_modifier += 'sin_'

    if save_txt:
      input_header = 'DATE LATITUDE LONGITUDE CTDPRS CTDTMP CTDSAL OXYGEN'
      np.savetxt(os.path.join(out_dir, file_modifier + 'training_tpso_input.txt'), inputs, 
                 fmt='%s', header=input_header)
      output_header = 'TOT_PH'
      np.savetxt(os.path.join(out_dir, file_modifier + 'training_ph_output.txt'), outputs, 
                 fmt='%s', header=output_header)
    # independent of save_txt so that requesting both formats writes both
    if save_csv:
      input_header = 'DATE, LATITUDE, LONGITUDE, CTDPRS, CTDTMP, CTDSAL, OXYGEN'
      np.savetxt(os.path.join(out_dir, file_modifier + 'training_tpso_input.csv'), inputs, 
                 fmt='%s', delimiter=",", header=input_header)
      output_header = 'TOT_PH'
      np.savetxt(os.path.join(out_dir, file_modifier + 'training_ph_output.csv'), outputs, 
                 fmt='%s', delimiter=",", header=output_header)
      
  return inputs, outputs

# BUILD THE MERGED TRAINING ARRAYS

# stack ship, glider and wcoa data (in that order), apply the configured
# transformations and write the csv training files
inputs, outputs = concatenate_prep_data(
    input_arrays=(ship_input, glider_input, wcoa_input),
    output_arrays=(ship_output, glider_output, wcoa_output),
    save_csv=True,
)

# show the first transformed input row and its matching target value
display(inputs[0], outputs[0])

array(['0.7748840413670407', '36.798', '-121.8462', '0.30116400653352504',
       '8.6075', '33.9971', '94.2'], dtype='<U19')

'7.730970739061131'

In [None]:
# sanity check: dimensions of the concatenated input array
display(inputs.shape)

(1004534, 7)