Implement a multiple regression of Total Alkalinity based on Salinity, Temperature, and Latitude per Rheuban 2021.
Use both shell day and MWRA datasets

In [93]:
import pandas as pd
from sklearn import linear_model
import requests

In [103]:
mwra = pd.read_csv('../../data/concat/MWRA_seagrant_concat_batch.csv', parse_dates = ['PROF_DATE_TIME_LOCAL'])

max_lat = max(mwra['LATITUDE'])
min_lat = min(mwra['LATITUDE'])

## Shell Day dataset

In [104]:
shell_day = pd.read_csv('../../data/Shell_Day_2019/Shell_Day_2019_Data.csv')

In [105]:
# Remove Bad Temp data
shell_day = shell_day[shell_day['Water Temp (degC)'] >= 0]

# Remove Bad Salinity data
shell_day = shell_day[shell_day['Field Salinity (PSU)'] >= 0]

# Remove bad TA data
shell_day  = shell_day[shell_day['TA (umol/kg)'] >= 0]

# Remove bad depth data
# If the value of Sample Depth (m) is 'Surface' replace with 0, convert values to float
#shell_day['Sample Depth (m)'] = shell_day['Sample Depth (m)'].replace('Surface', 0).astype(float)
#shell_day = shell_day[shell_day['Sample Depth (m)'] >= 0]
#shell_day = shell_day[shell_day['Water Depth (m)'] >= 0]

# Isolate Gulf of Maine Data
shell_day = shell_day[shell_day['Latitude - N'] >= min_lat] # southern bound
shell_day  = shell_day[shell_day['Station Name'] != 'Taylor Point'] # buzzards bay
shell_day = shell_day[shell_day['Station Name'] != 'WR2X'] # buzzards bay

# Compute depth coeff
#shell_day['Depth_coeff'] = shell_day['Sample Depth (m)'] / shell_day['Water Depth (m)']
print(len(shell_day))

15


In [106]:
# Define independent variables
X = shell_day[['Latitude - N', 'Longitude - W', 'Water Temp (degC)', 'Field Salinity (PSU)']]

# Define dependent variable
y = shell_day['TA (umol/kg)']

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(X, y)

# The coefficients
print('Coefficients: ', regr.coef_) # [-114.53771149  -26.97219455  -43.94021411   37.85773728]
print('Intercept: ', regr.intercept_) # 4873.465086242284
print('R^2: ', regr.score(X, y)) # 0.7536012435717213

Coefficients:  [624.88727009 559.98089887 -41.87287876  26.87794813]
Intercept:  15406.452816656023
R^2:  0.8829880213803615


## MWRA dataset

In [115]:
mwra = pd.read_csv('../../data/concat/MWRA_seagrant_concat_batch.csv', parse_dates = ['PROF_DATE_TIME_LOCAL'])

In [116]:
# Clean the data
mwra = mwra[
  (mwra['TEMP (C)'].notnull()) &
   (mwra['SAL (PSU)'].notnull()) &
    (mwra['TA in (mmol/kgSW)'].notnull()) &
      (mwra['DEPTH (m)'].notnull()) &
        (mwra['LATITUDE'].notnull()) &
          (mwra['LONGITUDE'].notnull()) &
          (mwra['PROF_DATE_TIME_LOCAL'].notnull())
    ]
# millisecond column
mwra['MS'] = [dt.timestamp() - 1451653261 for dt in mwra['PROF_DATE_TIME_LOCAL']]

In [117]:
# Define independent variables
X = mwra[['LATITUDE', 'LONGITUDE', 'TEMP (C)', 'SAL (PSU)', 'MS']]
# Define dependent variable
y = mwra['TA in (mmol/kgSW)']

# Create the linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(X, y)

# The coefficients
print('Coefficients: ', regr.coef_) # [-6.77573277  0.66802884 -0.43215554 45.99332592]
print('Intercept: ', regr.intercept_) # 1035.8773690253665
print('R^2: ', regr.score(X, y)) # 0.8573989356359999

Coefficients:  [ 2.69942588e+00 -2.34630781e+00 -3.84368045e-01  4.63884504e+01
  2.31985394e-07]
Intercept:  389.90009468982885
R^2:  0.8628700907439656


### Incorporating tide data from Boston NOAA station (8443970)

In [101]:
# Get unique days in the mwra data
#days = mwra['PROF_DATE_TIME_LOCAL']
days = list(set([dt.strftime("%Y%m%d") for dt in mwra['PROF_DATE_TIME_LOCAL']]))

In [None]:
requests.get()

In [102]:
days

['20190821',
 '20170906',
 '20180515',
 '20170613',
 '20190516',
 '20190606',
 '20191030',
 '20191010',
 '20190320',
 '20180718',
 '20171101',
 '20180724',
 '20190702',
 '20190717',
 '20170726',
 '20180622',
 '20180719',
 '20190515',
 '20180206',
 '20181024',
 '20190815',
 '20180410',
 '20180809',
 '20180927',
 '20170516',
 '20171020',
 '20181130',
 '20171023',
 '20191111',
 '20170424',
 '20190205',
 '20171011',
 '20180905',
 '20170823']