<a href="https://colab.research.google.com/github/adamggibbs/marine-carbonate-system-ml-prediction/blob/master/Model_Estimations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model Estimation

This notebook uses a saved TensorFlow model to estimate a specified output variable from specified input variables. It reads an arbitrary number of data files, makes estimations for each one, adds the estimations to the data as a new column, and saves the result as a new data file.

**Directions:**

1. Place data files in the `to_estimate` directory.
2. Define the necessary user-defined variables, including the model name, model file name, input and output variables, type of input file, and desired type(s) of output file.
3. Run the notebook by going to `Runtime -> Run all` or using `ctrl+F9`. All output files will be saved in the `estimations` directory.





In [25]:
#@title # Set up environment.
# SET UP ENVIRONMENT 

# mount google drive for data storage and access
# (every data, model and output path used by this notebook is built under
# '/content/drive/MyDrive/', so the mount has to succeed before anything else runs)
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# IMPORTS

import os

import pandas as pd
import numpy as np

%tensorflow_version 2.x
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import metrics
from tensorflow.keras.layers.experimental import preprocessing

Mounted at /content/drive


## Necessary User Defined Variables

In [26]:
# name of the project folder in Google Drive
# should be a relative path from 'My Drive' and end with a trailing '/'
# exclude the beginning '/'
root_dir_name = 'Example/'

# model_name: label used as the prefix of the estimation column that is
#   added to every output file (the column is named model_name + '_' + output_var)
# model_file_name: name of the saved model inside the project's 'models/' folder
model_name = 'model_name'
model_file_name = 'model_name_Layers(48, 24)'

# columns fed to the model (in this order) and the variable being estimated
# NOTE: the preprocessing code treats the FIRST input column as the date and
# slices it as a zero-padded month/day/year string, so keep 'DATE' first
input_vars = ['DATE', 'LATITUDE', 'LONGITUDE', 'PRS', 'TMP', 'SAL', 'OXYGEN']
output_var = 'PH_INSITU'

# what file type you're using,
# 'csv' for any comma separated value, 'txt' for any tab separated value
input_file_type = 'csv'

# output formats for the files that contain the estimations
# can save both or either .txt and .csv files
save_txt = True
save_csv = True


In [27]:
#@title ### Initialize directory variables.
# project layout on the mounted drive; every directory string ends with '/'
root_dir = f'/content/drive/MyDrive/{root_dir_name}'

# data folders: files waiting for estimations, and where results are written
data_dir = f'{root_dir}data/'
est_dir = f'{data_dir}to_estimate/'
est_results_dir = f'{data_dir}estimations/'

# saved models and figures
model_dir = f'{root_dir}models/'
fig_dir = f'{root_dir}figs/'

# full path of the saved model that will be loaded
model_path = f'{model_dir}{model_file_name}'

In [28]:
#@title ### Initialize data cleaning preprocessing functions.
# FUNCTION TO READ GLIDER FILE IN A PANDAS DATAFRAME
def read_data_file(file, file_type=None):
  '''
  read_data_file()
    description:
      Reads one data file into a pandas dataframe and removes every row
      that has a missing value in any column. The row labels of the
      remaining rows are left untouched (the index is not reset).

    args:
      file: string that contains the path of the data file
      file_type: 'csv' reads a comma separated file whose first line is the
        header; any other value reads a tab separated file, skipping its
        first 7 lines and applying the fixed column names listed below.
        When omitted, the notebook level `input_file_type` setting is used.

    returns:
      pandas dataframe containing only the complete rows of the file
  '''
  # fall back to the notebook configuration only when the caller did not
  # choose a format explicitly
  if file_type is None:
    file_type = input_file_type

  if file_type == 'csv':
    df = pd.read_csv(file, header=0, sep=',')
  else:
    # tab separated files carry no usable header line, so the column names
    # are supplied here
    names = [ 'Cruise', 'Station', 'Type', 'DATE', 'TIME', 'LONGITUDE',
              'LATITUDE', 'QF', 'PRS', 'PRS_QF', 'TMP', 'TMP_QF',
              'SAL', 'SAL_QF', 'Sigma_theta', 'ST_QF', 'DEPTH', 'DEPTH_QF',
              'OXYGEN', 'OXYGEN_QF', 'SATOXY', 'SATOXY_QF', 'NITRATE',
              'NITRATE_QF', 'CHL_A', 'CHL_A_QF', 'BBP700', 'BBP700_QF', 'PH_INSITU',
              'PH_INSITU_QF', 'BBP532', 'BBP_532_QF', 'CDOM', 'CDOM_QF', 'TALK_CANYONB',
              'TALK_QF', 'DIC_CANYONB', 'DIC_QF', 'pCO2_CANYONB', 'pCO2_QF',
              'SAT_AR_CANYONB', 'SAT_AR_QF', 'pH25C_1atm', 'pH25C_1atm_QF' ]
    df = pd.read_csv(file, skiprows=7, header=None, sep='\t', names=names)

  # keep complete rows only
  return df.dropna(axis=0, how='any')

################################################################################

# CREATE FUNCTION TO CREATE A NUMPY ARRAY OF INPUTS FROM
# GLIDER DATA FILE

def process_data_file(file, save_txt=False, save_csv=False):
  '''
  process_data_file()
    description:
      Reads a data file with read_data_file(), keeps only the columns named
      in `input_vars` together with their quality flag ('<var>_QF') columns,
      removes every row in which one of those flags is greater than 0, and
      returns the remaining inputs as a numpy array of strings.

      The first input column is rewritten from a month/day/year string
      ('MM/DD/YYYY' style) to 'YYYYMMDD' by fixed position slicing, so the
      date has to be the first entry of `input_vars` and has to be zero
      padded.

    args:
      file: string that contains file name of dataset
      save_txt: not used by this function; kept so existing calls still work
      save_csv: not used by this function; kept so existing calls still work

    returns:
      inputs: 2-D numpy array (dtype str) with one row per good data point
        and one column per entry of `input_vars`, in that order
      indices: numpy array holding the dataframe row label of every row in
        `inputs`, used later to put estimations back on the right rows
  '''
  # these inputs have no quality flag column
  unflagged_vars = ['DATE', 'LATITUDE', 'LONGITUDE']

  df = read_data_file(file)

  # take only data we care about: each input followed by its QF column
  relevant_vars = []
  qf_columns = []
  for var in input_vars:
    relevant_vars.append(var)
    if var not in unflagged_vars:
      relevant_vars.append(var + '_QF')
      qf_columns.append(var + '_QF')
  df = df[relevant_vars]

  # drop bad inputs: a row is kept only while every flag is 0 or lower.
  # Filtering one flag column at a time means later columns are only
  # converted for rows that are still in play.
  for qf_column in qf_columns:
    df = df[df[qf_column].astype(int) <= 0]

  # take subset of only parameters for inputs
  # this array contains only "good" data points
  indices = df.index.to_numpy()
  inputs = df[input_vars].to_numpy(dtype='str')

  # change date format: MM/DD/YYYY -> YYYYMMDD
  for row in inputs:
    date = row[0]
    row[0] = date[6:10] + date[0:2] + date[3:5]

  # return the array
  return inputs, indices

################################################################################

def prep_data(inputs):
  '''
  prep_data()
    description:
      Replaces the date at the start of every row with a fractional year,
      computed as year + (day of the year) / 365.0. Rows are changed in
      place.

      NOTE(review): the divisor is 365.0 for leap years as well; presumably
      this matches the transform the model was trained with -- confirm
      before changing it.

    args:
      inputs: rows whose first entry is a date string that starts with the
        four digit year (process_data_file() produces 'YYYYMMDD')

    returns:
      the same `inputs` object that was passed in
  '''
  for row in inputs:
    raw_date = row[0]
    stamp = pd.to_datetime(raw_date)

    # 1-based position of the day within its own year (Jan 1st -> 1)
    first_of_year = pd.Timestamp(year=stamp.year, month=1, day=1)
    nth_day = (stamp - first_of_year).days + 1

    # year taken from the string itself, plus the elapsed fraction
    row[0] = int(raw_date[0:4]) + nth_day / 365.0

  return inputs

In [29]:
#@title ### Get input data.
# one entry per file of `est_dir`, in os.listdir() order:
#   input_arrays[i]   -> prepared model inputs of the i-th file
#   indices_arrays[i] -> dataframe row labels those inputs were taken from
input_arrays, indices_arrays = [], []

for file in os.listdir(est_dir):
  file_path = est_dir + file
  print("Processing " + file_path + " ... ", end="")

  # clean the file first, then transform the date column for the model
  curr_inputs, indices = process_data_file(file_path)
  indices_arrays.append(indices)
  input_arrays.append(prep_data(curr_inputs))

  print("Complete.")


Processing /content/drive/MyDrive/Example/data/to_estimate/19A02901.csv ... Complete.
Processing /content/drive/MyDrive/Example/data/to_estimate/19502902.csv ... Complete.


In [30]:
#@title ### Load in model.
# LOAD IN MODEL

# the model is read from `model_path` (model folder + model file name);
# compile=True is passed so Keras compiles the model after loading it
print("Loading in model... ", end="")
model = tf.keras.models.load_model(model_path, compile=True)
print("Complete.\nModel Summary:\n")
# print the layer summary so the user can check that the expected model
# (and the expected number of inputs) was loaded
model.summary()


Loading in model... Complete.
Model Summary:

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 normalization_2 (Normalizat  (None, 7)                15        
 ion)                                                            
                                                                 
 dense_6 (Dense)             (None, 48)                384       
                                                                 
 dense_7 (Dense)             (None, 24)                1176      
                                                                 
 dense_8 (Dense)             (None, 1)                 25        
                                                                 
Total params: 1,600
Trainable params: 1,585
Non-trainable params: 15
_________________________________________________________________


In [31]:
#@title ### Make estimations and save to data files.

# The prepared inputs were built by walking os.listdir(est_dir) in the
# "Get input data" cell, so entry i of input_arrays / indices_arrays belongs
# to the i-th file of that listing. Fail early with a clear message if the
# folder no longer matches what was prepared.
est_files = os.listdir(est_dir)
if len(est_files) != len(input_arrays) or len(est_files) != len(indices_arrays):
  raise RuntimeError(
      "Found " + str(len(est_files)) + " file(s) in " + est_dir + " but " +
      str(len(input_arrays)) + " prepared input array(s). " +
      "Re-run the 'Get input data' cell before making estimations.")

# make sure the output folder exists before anything is written to it
os.makedirs(est_results_dir, exist_ok=True)

# name of the column that receives the estimations
est_column = model_name + '_' + output_var

for file, file_inputs, file_indices in zip(est_files, input_arrays, indices_arrays):
  print("Loading " + est_dir + file + " ... ", end="")
  df = read_data_file(est_dir+file)
  print("Complete.")

  curr_inputs = file_inputs.astype('float')

  print("Making estimations for " + output_var + " ... ")
  if len(curr_inputs) > 0:
    predictions = model.predict(curr_inputs, verbose=1).flatten()
  else:
    # every row of this file was rejected by the quality flags; there is
    # nothing to hand to the model
    predictions = np.array([], dtype='float')
  print("Complete.")

  print("Adding estimations to data file... ",end="")
  # Each prediction belongs to the row label stored in file_indices. Rows
  # that were rejected during preprocessing have no prediction and get NaN.
  # (The previous row-by-row merge raised an IndexError as soon as rows
  # without a prediction followed the last predicted row.)
  preds = pd.Series(predictions, index=file_indices).reindex(df.index).to_numpy()
  df[est_column] = preds
  print("Complete.")

  print("Saving file... ", end="")
  # strip the extension whatever its length
  base_name = os.path.splitext(file)[0]
  if save_csv:
    df.to_csv(est_results_dir + 'Estimated ' + base_name + '.csv', index=False)
  if save_txt:
    df.to_csv(est_results_dir + 'Estimated ' + base_name + '.txt', index=False, sep='\t')
  print("Complete.\n")

print("\nAll files processed.")

Loading /content/drive/MyDrive/Example/data/to_estimate/19A02901.csv ... Complete.
Making estimations for PH_INSITU ... 
Complete.
Adding estimations to data file... Complete.
Saving file... Complete.

Loading /content/drive/MyDrive/Example/data/to_estimate/19502902.csv ... Complete.
Making estimations for PH_INSITU ... 
Complete.
Adding estimations to data file... Complete.
Saving file... Complete.


All files processed.
