In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from joblib import Parallel, delayed
from tqdm import tqdm
from itertools import product
from itertools import permutations
from itertools import combinations
from pyEDM import *
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.base import BaseEstimator, TransformerMixin
import time
import os
from copy import deepcopy
import math
import random
from sklearn.metrics import root_mean_squared_error
from scipy.stats import ttest_ind
import pickle
import ast
import json



from IPython.display import display, HTML
display(HTML('<style>.container { width:90% !important; }</style>'))

import warnings
warnings.filterwarnings("ignore", 
    message="A worker stopped while some jobs were given to the executor.",
    module="joblib.externals.loky.process_executor")

In [42]:
def get_block(data, num_lags=1, tau=1):
    ''' Get a dataframe with all the possible valid lags of the variables. '''
    
    block = pd.concat([data[var].shift(lag*tau).rename(f'{var}(t-{lag*tau})') for lag in range(num_lags+1) for var in data.columns], axis=1)

    return block
def get_xmap_results_smap(block, target, embeddings, Tp, theta, lib, pred):
    '''Function to do exhaustive search of embeddings.'''
    
    def compute_rho(block, target, embedding, Tp, theta, lib, pred):
        xmap = SMap(dataFrame=block, target=target, columns=embedding, Tp=Tp, theta=theta, embedded=True, lib=lib, pred=pred, noTime=True)
        rho = xmap['predictions'][['Observations', 'Predictions']].corr().iloc[0,1]
        return embedding, xmap['predictions'], rho

    xmap_results = pd.DataFrame(columns=['embedding', 'rho'])
    xmap_results = Parallel(n_jobs=-1)(delayed(compute_rho)(block, target, embedding, Tp, theta, lib, pred) for embedding in embeddings)
    xmap_results = pd.DataFrame(xmap_results, columns=['embedding', 'result', 'rho'])
    xmap_results = xmap_results.sort_values(by='rho', ascending=False).reset_index(drop=True)
    
    return xmap_results

def get_valid_lags_tau(block, target, tau, num_lags, system_variables):
    
    # Get lags of system variables
    system_variable_lags = []
    for var in system_variables:
        var_lags = [f'{var}(t{i})' if i < 0 else f'{var}(t-{i})' for i in range(num_lags * tau, 0)]
        var_lags = var_lags[::tau][:num_lags]
        system_variable_lags = system_variable_lags + var_lags
    
    # Remove (t-0) lag of target variable from valid_lags
    valid_lags = [x for x in system_variable_lags if x[-4:-1] != f'{target}(t-1)']

    print(f'Valid lags are {valid_lags}')        
    return valid_lags


def create_single_model(E,theta,target,i_cols,lib, pred,HAB_embed,showPlot=False):
    driver = f'{target}(t-0)'
    #print(i_cols)
    cols = i_cols + [f'{target}(t-1)']
    #print(cols)
    result = SMap(
        dataFrame = HAB_embed, 
        columns = cols,
        target = driver,
        lib = lib,  # Library from rows 0 to 700
        pred = pred,
        E = E+1,
        theta=theta,
        noTime=True,
        showPlot = showPlot,
        embedded=True,
        ignoreNan = True
    )
    return result

def thresh_bloom_binary_prediction(obs,pred,threshold=8.03199999999999):
    #obs_bloom_95 = np.percentile(obs, 95) #incorrect
    #pred_bloom_95 = np.percentile(pred, 95) #incorrect
    obs_blooms = obs > threshold
    pred_blooms = pred > threshold
    Accuracy = 1 - (obs_blooms ^ pred_blooms).mean()
    True_pos = (obs_blooms & pred_blooms).sum() / obs_blooms.sum()
    False_pos = ((~obs_blooms) & pred_blooms).sum() / (~obs_blooms).sum()
    True_neg = ((~obs_blooms) & (~pred_blooms)).sum() / (~obs_blooms).sum()
    False_neg = (obs_blooms & (~pred_blooms)).sum() / obs_blooms.sum()
    
    return [Accuracy, True_pos, False_pos, True_neg, False_neg]


def create_model(data,system_variables,target,Tp,num_lags,tau,theta_list,lib,pred):
    HAB_embed_block = get_block(data,50)
    valid_lags = get_valid_lags_tau(HAB_embed_block, target, tau, num_lags, system_variables)
    HAB_embeddings = {}
    for E in range(3,8):
        # Get random embeddings using valid lags
        embeddings = set()
        sample = 800
        max_trials = 800
        trials = 0
        while len(embeddings) < sample and trials < max_trials:
            embedding = tuple(random.sample(valid_lags, E))
            sorted_embedding = tuple(sorted(embedding))
            if sorted_embedding not in embeddings:
                embeddings.add(sorted_embedding)
            trials += 1
        embeddings = [list(embedding) for embedding in embeddings]
        HAB_embeddings['{0}'.format((target, E))] = embeddings
        E_list = range(3,8)


        
    total_iterations = len(E_list) * len(theta_list)

    parameters = pd.DataFrame(columns=['target', 'columns', 'E', 'theta', 'rho', 'rmse', 'pred'])

    with tqdm(total=total_iterations) as pbar:
        for E, theta in product(E_list, theta_list):

            key = [key for key in HAB_embeddings.keys() if eval(key)[0] == target and eval(key)[1] == E] #HAB_embeddings["('Avg_Chloro', 4, 0, 6)"][0]
            embeddings = HAB_embeddings[key[0]]
            for embedding in embeddings:
                smap_model = create_single_model(E,theta,target,embedding,lib, pred,HAB_embed_block,showPlot=False)
                df = smap_model['predictions']
                rho = df[['Observations', 'Predictions']].corr().iloc[0,1]
                rmse = root_mean_squared_error(df['Observations'].iloc[1:-1], df['Predictions'].iloc[1:-1])
                #bbp = thresh_bloom_binary_prediction(df['Observations'].iloc[1:-1],df['Predictions'].iloc[1:-1])

                new_row = {'target': target, 'columns': embedding, 'E': E,'theta':theta, 'rho':rho, 'rmse':rmse, 'pred':df['Predictions']}
                parameters.loc[len(parameters)] = new_row

            

            pbar.update(1)
    parameters.sort_values(by='rho',ascending=False)
    return parameters

In [43]:
def ensemble_binary_bloom(parameters_df,n=300,p=0.05,samp=1,bloom_thresh=8.013):
    parameters_df = parameters_df.iloc[:n*samp].sample(n)
    sum = np.zeros(np.array(parameters_df['pred'].iloc[0][1:]).size)
    for i in range(n):
        curr = np.array(parameters_df['pred'].iloc[i][1:]) > bloom_thresh#np.percentile(parameters_df['pred'].iloc[i].iloc[1:],95)#
        sum = sum + curr
    return sum > (n*p)

'''
@parameters
data - dataframe of data containing column for target and desired system variables'
system variables - variables contained in system to be used for prediction
target - variable that will be forecasted (also used in prediction)
Tp - IDK
E - number of lags we go back
tau - step size that we go back at
theta_list - list of theta values to be used in ensemble model
lib - library used for prediction
pred - prediction length

@return
returns forecast for next time step given the dataframe 
'''

def next_forecast(data,system_variables,target,Tp,E,tau,theta_list,lib,pred,p=0.05):
    parameters = create_model(data,system_variables,target,Tp,E,tau,theta_list,lib,pred)
    return parameters
    


# Testings

In [44]:

paper_data = pd.read_csv('Data/carter_data_w_gaps.csv') #DESIRED DATAPATH

paper_data = paper_data.set_index('time')
paper_data['Time'] = paper_data.index.astype(int)
paper_data['Avg_Chloro'] #= paper_data['Avg_Chloro'].apply(np.log1p) #LOG AMPUTATION
#IMPUTE HAB DATA
#Build basic linear regression model as sanity check
# Custom impute missing values with the average of the value in front and behind of it 
class ForwardBackwardImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_filled_forward = X.fillna(method='ffill').fillna(method='bfill')
        X_filled_backward = X.fillna(method='bfill').fillna(method='ffill')

        return (X_filled_forward + X_filled_backward) / 2


Imputer = ForwardBackwardImputer()
paper_data = paper_data.apply(pd.to_numeric, errors='coerce')
Imputer.fit(paper_data)
paper_data = Imputer.transform(paper_data)#COMMENT OUT IF DONT WANT MEAN MPUTE
paper_data
HAB_embed = get_block(paper_data,50)
HAB_embed

  X_filled_forward = X.fillna(method='ffill').fillna(method='bfill')
  X_filled_backward = X.fillna(method='bfill').fillna(method='ffill')


Unnamed: 0_level_0,Unnamed: 0(t-0),Akashiwo_sanguinea(t-0),CellCountDetection_Limit(t-0),Lingulodinium_polyedra(t-0),Total_Diatoms(t-0),Total_Dinoflagellates(t-0),Total_Phytoplankton(t-0),Ammonium(t-0),Avg_Chloro(t-0),Avg_Phaeo(t-0),...,Total_Prorocentrum_spp(t-50),Total_Tripos(t-50),WSPD(t-50),index(t-50),SURF_DENS_kgm3(t-50),BOT_DENS_kgm3(t-50),AVG_TEMP_C(t-50),AVG_SAL_PSU(t-50),AVG_DENS_kgm3(t-50),Time(t-50)
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,89.0,89.0,69952.0,15684.0,85636.0,0.530,1.82,0.60,...,,,,,,,,,,
1,1.0,44.5,89.0,89.0,118740.5,10693.5,129434.5,0.795,2.08,0.53,...,,,,,,,,,,
2,2.0,89.0,89.0,89.0,167529.0,5703.0,173233.0,1.060,2.71,1.18,...,,,,,,,,,,
3,3.0,89.0,89.0,178.0,117315.0,8109.0,125424.5,0.705,1.32,0.52,...,,,,,,,,,,
4,4.0,89.0,89.0,267.0,67101.0,10515.0,77616.0,0.350,0.99,0.55,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1597,1597.0,0.0,626.0,0.0,95122.0,63206.0,158328.0,0.410,3.38,1.00,...,64458.0,0.0,1.965278,1547.0,1023.943315,1024.037281,17.75,33.370,1023.990298,1547.0
1598,1598.0,0.0,626.0,5006.5,191182.5,114209.0,305391.5,0.410,6.75,0.88,...,28787.0,1252.0,5.076389,1548.0,1024.146195,1024.152950,17.40,33.470,1024.149572,1548.0
1599,1599.0,0.0,626.0,10013.0,287243.0,165212.0,452455.0,0.410,9.71,1.49,...,26909.5,626.0,3.731250,1549.0,1023.909736,1023.973271,18.40,33.515,1023.941504,1549.0
1600,1600.0,0.0,626.0,7197.0,193372.5,159266.5,352639.0,0.410,19.95,3.10,...,25032.0,0.0,2.737500,1550.0,1024.026335,1024.040616,17.90,33.475,1024.033476,1550.0


In [45]:
data = paper_data

system_variables = ['AVG_DENS_kgm3',
 'AVG_SAL_PSU',
 'Ammonium',
 'Avg_Chloro',
 'Avg_Phaeo',
 'BOT_SAL_PSU',
 'Phaeo1',
 'Phaeo2',
 'SURF_DENS_kgm3',
 'SURF_SAL_PSU',
 'SURF_TEMP_C',
 'Total_Dinoflagellates',
 'Total_Phytoplankton',
 'Total_Prorocentrum_spp'] #SYSTEM VARIABLES



target = 'Avg_Chloro' #TARGET
system_variables = system_variables
Tp = 1
E = 6 # up to -5 
tau = -1
theta_list = [1,5,9,15,25,35,45]
lib = '1 1012'
pred = '1013 1602'


preds = next_forecast(data,system_variables,target,Tp,E,tau,theta_list,lib,pred)
preds

Valid lags are ['AVG_DENS_kgm3(t-1)', 'AVG_DENS_kgm3(t-2)', 'AVG_DENS_kgm3(t-3)', 'AVG_DENS_kgm3(t-4)', 'AVG_DENS_kgm3(t-5)', 'AVG_DENS_kgm3(t-6)', 'AVG_SAL_PSU(t-1)', 'AVG_SAL_PSU(t-2)', 'AVG_SAL_PSU(t-3)', 'AVG_SAL_PSU(t-4)', 'AVG_SAL_PSU(t-5)', 'AVG_SAL_PSU(t-6)', 'Ammonium(t-1)', 'Ammonium(t-2)', 'Ammonium(t-3)', 'Ammonium(t-4)', 'Ammonium(t-5)', 'Ammonium(t-6)', 'Avg_Chloro(t-1)', 'Avg_Chloro(t-2)', 'Avg_Chloro(t-3)', 'Avg_Chloro(t-4)', 'Avg_Chloro(t-5)', 'Avg_Chloro(t-6)', 'Avg_Phaeo(t-1)', 'Avg_Phaeo(t-2)', 'Avg_Phaeo(t-3)', 'Avg_Phaeo(t-4)', 'Avg_Phaeo(t-5)', 'Avg_Phaeo(t-6)', 'BOT_SAL_PSU(t-1)', 'BOT_SAL_PSU(t-2)', 'BOT_SAL_PSU(t-3)', 'BOT_SAL_PSU(t-4)', 'BOT_SAL_PSU(t-5)', 'BOT_SAL_PSU(t-6)', 'Phaeo1(t-1)', 'Phaeo1(t-2)', 'Phaeo1(t-3)', 'Phaeo1(t-4)', 'Phaeo1(t-5)', 'Phaeo1(t-6)', 'Phaeo2(t-1)', 'Phaeo2(t-2)', 'Phaeo2(t-3)', 'Phaeo2(t-4)', 'Phaeo2(t-5)', 'Phaeo2(t-6)', 'SURF_DENS_kgm3(t-1)', 'SURF_DENS_kgm3(t-2)', 'SURF_DENS_kgm3(t-3)', 'SURF_DENS_kgm3(t-4)', 'SURF_DENS_kgm3(

100%|██████████| 35/35 [17:09:51<00:00, 1765.48s/it]  


Unnamed: 0,target,columns,E,theta,rho,rmse,pred
0,Avg_Chloro,"[Avg_Chloro(t-5), Avg_Phaeo(t-6), BOT_SAL_PSU(...",3,1,0.327749,59.733716,0 NaN 1 1.007286 2 1.11...
1,Avg_Chloro,"[AVG_DENS_kgm3(t-6), Avg_Chloro(t-5), Avg_Phae...",3,1,0.330430,59.634413,0 NaN 1 1.317484 2 1.49...
2,Avg_Chloro,"[AVG_SAL_PSU(t-6), Phaeo2(t-5), Total_Prorocen...",3,1,0.468604,55.082628,0 NaN 1 1.310831 2 1.32...
3,Avg_Chloro,"[Ammonium(t-6), BOT_SAL_PSU(t-1), SURF_SAL_PSU...",3,1,0.333619,59.671739,0 NaN 1 1.292633 2 1.31...
4,Avg_Chloro,"[AVG_DENS_kgm3(t-3), AVG_SAL_PSU(t-6), Phaeo1(...",3,1,0.099991,64.079201,0 NaN 1 1.541803 2 1.62...
...,...,...,...,...,...,...,...
27960,Avg_Chloro,"[Avg_Chloro(t-1), Avg_Phaeo(t-6), Phaeo1(t-2),...",7,45,0.025443,332.623002,0 NaN 1 1.722643 2 1.49732...
27961,Avg_Chloro,"[AVG_DENS_kgm3(t-5), Avg_Phaeo(t-3), BOT_SAL_P...",7,45,0.164570,98.833638,0 NaN 1 1.352004 2 1.26...
27962,Avg_Chloro,"[AVG_DENS_kgm3(t-6), AVG_SAL_PSU(t-1), AVG_SAL...",7,45,0.475450,54.932361,0 NaN 1 1.425519 2 0.92...
27963,Avg_Chloro,"[AVG_SAL_PSU(t-1), Ammonium(t-6), Phaeo1(t-4),...",7,45,0.114288,642.772786,0 NaN 1 1.996991 2 1.12...


In [46]:
len(preds['pred'].iloc[0])

591

In [47]:
preds['pred'] = preds['pred'].apply(list)
preds.to_csv('800_sample_no_test_carter_models.csv') #TO SAVE (BE CAREFUL ABOUT OVERWRITING)