<a href="https://colab.research.google.com/github/adamggibbs/marine-carbonate-system-ml-prediction/blob/master/data_quality_control.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Quality Check Notebook

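This notebook performs quality checks on the pH, pressure, salinity, oxygen, and temperature values in each data file. Values that fail a check (out-of-range or spiking pH, out-of-range salinity, or a `-999` fill value in any input) have their quality flag (QF) set to 8, and the flagged table is saved as a CSV next to the original file.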

# User Defined Variables

In [None]:
#@title # Set up environment. 

# IMPORTS 

# mount google drive for data storage and access
# NOTE: google.colab is only importable inside a Colab runtime.
from google.colab import drive
# Every path used later in this notebook is built under this mount point.
# NOTE(review): force_remount=True presumably re-mounts even when the drive is
# already mounted, so re-running this cell is safe — confirm against Colab docs.
drive.mount('/content/drive', force_remount=True)

import os

import pandas as pd
import numpy as np


In [None]:
# STORE DIRECTORY WHERE DATA IS STORED

# Name of the project folder in Google Drive, written as a path relative to
# 'My Drive'. The directory variables defined later are built from it by plain
# string concatenation, so it:
#   - must end with a trailing '/'
#   - must NOT start with a leading '/'
root_dir_name = 'Example/'

## Store directory variables.

In [None]:
# Directory layout (every path keeps its trailing '/'):
#   root_dir -> project folder inside the mounted Google Drive
#   data_dir -> <root_dir>data/
#   qc_dir   -> <data_dir>quality_check/  (files to be quality checked)
root_dir = f'/content/drive/MyDrive/{root_dir_name}'
data_dir = f'{root_dir}data/'
qc_dir = f'{data_dir}quality_check/'

# Quality Checking Code

In [None]:
#@title # Define quality checking functions.
# FUNCTION TO READ GLIDER FILE IN A PANDAS DATAFRAME
def read_file(file):
  """Load a tab-separated data file into a pandas DataFrame.

  The first 7 lines of the file are skipped and the remaining lines are read
  without a header row; the 44 column names below are assigned instead.
  Rows containing any missing value are dropped and the index is renumbered
  from 0.

  Args:
    file: path (or file-like object) accepted by pandas.read_csv.

  Returns:
    DataFrame holding only the complete rows, with a fresh 0..n-1 index.
  """
  column_names = [
      'Cruise', 'Station', 'Type', 'DATE', 'TIME', 'LONGITUDE', 'LATITUDE',
      'QF',
      'PRS', 'PRS_QF',
      'TMP', 'TMP_QF',
      'SAL', 'SAL_QF',
      'Sigma_theta', 'ST_QF',
      'DEPTH', 'DEPTH_QF',
      'OXYGEN', 'OXYGEN_QF',
      'SATOXY', 'SATOXY_QF',
      'NITRATE', 'NITRATE_QF',
      'CHL_A', 'CHL_A_QF',
      'BBP700', 'BBP700_QF',
      'PH_INSITU', 'PH_INSITU_QF',
      'BBP532', 'BBP_532_QF',
      'CDOM', 'CDOM_QF',
      'TALK_CANYONB', 'TALK_QF',
      'DIC_CANYONB', 'DIC_QF',
      'pCO2_CANYONB', 'pCO2_QF',
      'SAT_AR_CANYONB', 'SAT_AR_QF',
      'pH25C_1atm', 'pH25C_1atm_QF',
  ]

  # Skip the 7 leading lines and label the columns ourselves.
  raw_table = pd.read_csv(file, sep='\t', header=None, names=column_names,
                          skiprows=7)

  # Keep only rows in which every column has a value.
  complete_rows = raw_table.dropna(axis=0, how='any')
  return complete_rows.reset_index(drop=True)

def check_qfs(file, save_csv=True):
  """Flag suspect pH and input values in a data file and optionally save it.

  The file is loaded with read_file(). A quality flag (QF) of 8 is assigned
  when any of the following holds; all other flags are left as read:
    * pH is below 7.3 or above 8.5;
    * pH differs by more than 0.04 from the median of the 5-point window
      centred on it (the first two and last two rows have no full window and
      are not spike-tested);
    * salinity is below 32.5 or above 34.5;
    * pressure, temperature, salinity or oxygen equals -999.

  Args:
    file: path of the raw data file to check.
    save_csv: when True (default), write the flagged table next to the input
      file, with the file extension replaced by '.csv'.

  Returns:
    The pandas DataFrame with the updated PH_INSITU_QF, PRS_QF, TMP_QF,
    SAL_QF and OXYGEN_QF columns.
  """
  dataframe = read_file(file)

  columns = ['PH_INSITU', 'PH_INSITU_QF', 'PRS', 'PRS_QF', 'TMP', 'TMP_QF',
             'SAL', 'SAL_QF', 'OXYGEN', 'OXYGEN_QF']
  col = {name: position for position, name in enumerate(columns)}

  # Work on one combined array and write the flag columns back from it, so
  # the flags keep the dtype of that array in the saved output.
  values = dataframe[columns].to_numpy()

  # --- pH: range check plus 5-point median spike check ---
  ph = pd.Series(values[:, col['PH_INSITU']].astype(float))
  ph_out_of_range = ((ph < 7.3) | (ph > 8.5)).to_numpy()
  # rolling(..., center=True) yields NaN where the window is incomplete;
  # NaN > 0.04 is False, so edge rows are never flagged by the spike test.
  deviation = (ph - ph.rolling(5, center=True).median()).abs()
  ph_spike = (deviation > 0.04).to_numpy()
  values[ph_out_of_range | ph_spike, col['PH_INSITU_QF']] = 8

  # --- salinity: range check ---
  sal = values[:, col['SAL']].astype(float)
  values[(sal < 32.5) | (sal > 34.5), col['SAL_QF']] = 8

  # --- inputs: -999 marks a missing measurement ---
  for value_name, flag_name in (('PRS', 'PRS_QF'), ('TMP', 'TMP_QF'),
                                ('SAL', 'SAL_QF'), ('OXYGEN', 'OXYGEN_QF')):
    is_fill = values[:, col[value_name]].astype(float) == -999
    values[is_fill, col[flag_name]] = 8

  # copy the updated flags back into the full table
  for flag_name in ('PH_INSITU_QF', 'PRS_QF', 'TMP_QF', 'SAL_QF', 'OXYGEN_QF'):
    dataframe[flag_name] = values[:, col[flag_name]]

  if save_csv:
    # splitext handles extensions of any length (and files without one),
    # unlike slicing off the last three characters.
    csv_path = os.path.splitext(file)[0] + '.csv'
    dataframe.to_csv(csv_path, header=True, index=False)

  return dataframe


In [None]:
  #@title # Run quality checks. 
  # Run the quality checks on every raw data file in qc_dir.
  # check_qfs() writes its '.csv' output into this same directory, so '.csv'
  # files are skipped: re-running the cell would otherwise parse the previous
  # run's output as a raw tab-separated file and overwrite it.
  # NOTE(review): this assumes the raw input files are not themselves '.csv'.
  print('Parsing the following files:\n')
  for file in sorted(os.listdir(qc_dir)):
    # ignore sub-directories and previously generated outputs
    if not os.path.isfile(qc_dir + file) or file.lower().endswith('.csv'):
      continue
    print(qc_dir + file)
    check_qfs(qc_dir + file)
  print('\nComplete.')
