In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from joblib import Parallel, delayed
from tqdm import tqdm
from itertools import product
from itertools import permutations
from itertools import combinations
from pyEDM import *
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.base import BaseEstimator, TransformerMixin
import time
import os
from copy import deepcopy
import math
import random
from sklearn.metrics import root_mean_squared_error
from scipy.stats import ttest_ind
import pickle
import ast
import json



from IPython.display import display, HTML
display(HTML('<style>.container { width:90% !important; }</style>'))

import warnings
warnings.filterwarnings("ignore", 
    message="A worker stopped while some jobs were given to the executor.",
    module="joblib.externals.loky.process_executor")

In [130]:
def get_block(data, num_lags=1, tau=1):
    ''' Get a dataframe with all the possible valid lags of the variables. '''
    
    block = pd.concat([data[var].shift(lag*tau).rename(f'{var}(t-{lag*tau})') for lag in range(num_lags+1) for var in data.columns], axis=1)

    return block
def get_xmap_results_smap(block, target, embeddings, Tp, theta, lib, pred):
    '''Function to do exhaustive search of embeddings.'''
    
    def compute_rho(block, target, embedding, Tp, theta, lib, pred):
        xmap = SMap(dataFrame=block, target=target, columns=embedding, Tp=Tp, theta=theta, embedded=True, lib=lib, pred=pred, noTime=True)
        rho = xmap['predictions'][['Observations', 'Predictions']].corr().iloc[0,1]
        return embedding, xmap['predictions'], rho

    xmap_results = pd.DataFrame(columns=['embedding', 'rho'])
    xmap_results = Parallel(n_jobs=-1)(delayed(compute_rho)(block, target, embedding, Tp, theta, lib, pred) for embedding in embeddings)
    xmap_results = pd.DataFrame(xmap_results, columns=['embedding', 'result', 'rho'])
    xmap_results = xmap_results.sort_values(by='rho', ascending=False).reset_index(drop=True)
    
    return xmap_results

def get_valid_lags_tau(block, target, tau, num_lags, system_variables):
    
    # Get lags of system variables
    system_variable_lags = []
    for var in system_variables:
        var_lags = [f'{var}(t{i})' if i < 0 else f'{var}(t-{i})' for i in range(num_lags * tau, 1)]
        var_lags = var_lags[::tau][:num_lags+1]
        system_variable_lags = system_variable_lags + var_lags
    
    # Remove (t-0) lag of target variable from valid_lags
    valid_lags = [x for x in system_variable_lags if x[-4:-1] != 't-0']

                    
    return valid_lags


def create_single_model(E,theta,target,i_cols,lib, pred,HAB_embed,showPlot=False):
    driver = f'{target}(t-0)'
    cols = i_cols
    result = SMap(
        dataFrame = HAB_embed, 
        columns = cols,
        target = driver,
        lib = lib,  # Library from rows 0 to 700
        pred = pred,
        E = E+1,
        theta=theta,
        noTime=True,
        showPlot = showPlot,
        embedded=True,
        ignoreNan = True
    )
    return result

def thresh_bloom_binary_prediction(obs,pred,threshold=8.03199999999999):
    #obs_bloom_95 = np.percentile(obs, 95) #incorrect
    #pred_bloom_95 = np.percentile(pred, 95) #incorrect
    obs_blooms = obs > threshold
    pred_blooms = pred > threshold
    Accuracy = 1 - (obs_blooms ^ pred_blooms).mean()
    True_pos = (obs_blooms & pred_blooms).sum() / obs_blooms.sum()
    False_pos = ((~obs_blooms) & pred_blooms).sum() / (~obs_blooms).sum()
    True_neg = ((~obs_blooms) & (~pred_blooms)).sum() / (~obs_blooms).sum()
    False_neg = (obs_blooms & (~pred_blooms)).sum() / obs_blooms.sum()
    
    return [Accuracy, True_pos, False_pos, True_neg, False_neg]


def create_model(data,params,system_variables,target,Tp,num_lags,tau,theta_list,lib,pred,ensemble_sz=300):
    HAB_embed_block = get_block(data,50)
    parameters = pd.DataFrame(columns=['target', 'columns', 'E', 'theta', 'pred'])
    for i in range(ensemble_sz):
        E = params['E'].iloc[i]
        theta = params['theta'].iloc[i]
        embedding = params['columns'].iloc[i]
        smap_model = create_single_model(E,theta,target,embedding,lib, pred,HAB_embed_block,showPlot=False)
        df = smap_model['predictions']
        #bbp = thresh_bloom_binary_prediction(df['Observations'].iloc[1:-1],df['Predictions'].iloc[1:-1])

        new_row = {'target': target, 'columns': embedding + [f'{target} (t-0)'], 'E': E,'theta':theta, 'pred':df['Predictions']}
        parameters.loc[len(parameters)] = new_row

    return parameters

In [131]:
def ensemble_binary_bloom(parameters_df,n=300,p=0.05,samp=1,bloom_thresh=8.013):
    parameters_df = parameters_df.iloc[:n*samp].sample(n)
    sum = np.zeros(np.array(parameters_df['pred'].iloc[0][1:]).size)
    for i in range(n):
        curr = np.array(parameters_df['pred'].iloc[i][1:]) > bloom_thresh#np.percentile(parameters_df['pred'].iloc[i].iloc[1:],95)#
        sum = sum + curr
    return sum > (n*p)

'''
@parameters
data (dataframe) - data containing column for target and desired system variables'
params (dataframe) - data containing info for Smap models
system variables (list of strings) - variables contained in system to be used for prediction
target (string) - variable that will be forecasted (also used in prediction)
Tp - IDK
E - number of lags we go back
tau (int) - < 0 step size
theta_list (list of ints) - list of theta values to be used in ensemble model
lib - library used for prediction
pred - prediction length

@return
returns forecast for next time step given the dataframe 
'''

def next_forecast(data,params,system_variables,target,Tp,E,tau,theta_list,p=0.05):
    lib = '1 ' + str(data.shape[0] - 32) 
    pred = '' + str(data.shape[0] - 31) + ' ' + str(data.shape[0])
    parameters = create_model(data,params,system_variables,target,Tp,E,tau,theta_list,lib,pred)
    preds = ensemble_binary_bloom(parameters,n=300,p=0.05,samp=1,bloom_thresh=8.013)
    return preds
    


# Testings

In [132]:

paper_data = pd.read_csv('Data/data_w_gaps_and_wind.csv')
paper_data = paper_data.set_index('time')
paper_data['Time'] = paper_data.index.astype(int)
paper_data['Avg_Chloro'] #= paper_data['Avg_Chloro'].apply(np.log1p) #LOG AMPUTATION
#IMPUTE HAB DATA
#Build basic linear regression model as sanity check
# Custom impute missing values with the average of the value in front and behind of it 
class ForwardBackwardImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_filled_forward = X.fillna(method='ffill').fillna(method='bfill')
        X_filled_backward = X.fillna(method='bfill').fillna(method='ffill')

        return (X_filled_forward + X_filled_backward) / 2


Imputer = ForwardBackwardImputer()
paper_data = paper_data.apply(pd.to_numeric, errors='coerce')
Imputer.fit(paper_data)
paper_data = Imputer.transform(paper_data)#COMMENT OUT IF DONT WANT MEAN MPUTE
paper_data
HAB_embed = get_block(paper_data,50)
paper_data

  X_filled_forward = X.fillna(method='ffill').fillna(method='bfill')
  X_filled_backward = X.fillna(method='bfill').fillna(method='ffill')


Unnamed: 0_level_0,Avg_Chloro,Nitrate,Phosphate,Silicate,Nitrite,SURF_TEMP_C,BOT_TEMP_C,SURF_SAL_PSU,BOT_SAL_PSU,SURF_DENS_kgm3,BOT_DENS_kgm3,AVG_TEMP_C,AVG_SAL_PSU,AVG_DENS_kgm3,WSPD,Time
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,1.82,0.43,0.24,5.6,0.03,20.3,15.9,33.77,33.62,1023.638094,1024.620655,18.10,33.695,1024.129374,4.691667,0.0
7,2.71,0.83,0.37,5.1,0.04,17.8,16.9,33.66,33.62,1024.186379,1024.391719,17.35,33.640,1024.289049,2.787500,7.0
13,0.99,0.45,0.24,3.5,0.00,22.2,18.3,33.74,33.58,1023.100557,1024.026040,20.25,33.660,1023.563298,3.412500,13.0
20,1.23,0.34,0.17,4.2,0.00,19.8,19.4,33.68,33.62,1023.701088,1023.780535,9.90,33.650,1023.740812,3.495833,20.0
28,2.41,0.29,0.12,3.8,0.00,22.5,22.5,33.73,33.69,1023.009067,1023.000725,11.25,33.710,1023.004896,2.604167,28.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5789,3.22,0.85,0.30,6.2,0.00,17.4,17.4,33.48,33.46,1024.146195,1024.152950,17.40,33.470,1024.149572,5.076389,5789.0
5796,3.39,0.45,0.25,4.9,0.00,17.9,17.9,33.48,33.47,1024.026335,1024.040616,17.90,33.475,1024.033476,2.737500,5796.0
5803,7.82,0.00,0.17,5.2,0.00,18.3,16.4,33.54,33.55,1023.974093,1024.454252,17.35,33.545,1024.214172,3.306250,5803.0
5811,17.30,0.00,0.20,9.4,0.00,18.0,17.4,33.64,33.61,1024.122823,1024.266323,17.70,33.625,1024.194573,2.418056,5811.0


In [133]:
#Load model with 5000 samples
def str_to_list(s):
    s = s.replace('nan', 'null')  # Replace 'nan' with 'null' for JSON compatibility
    lst = json.loads(s)  # Convert string to list
    lst = [np.nan if x is None else x for x in lst]  # Replace None with np.nan
    return lst
parameters = pd.read_csv('Results/100_sample_baseline_models.csv') 
parameters['pred'] = parameters['pred'].apply(str_to_list)
parameters['columns'] = parameters['columns'].apply(ast.literal_eval)
parameters

Unnamed: 0.1,Unnamed: 0,target,columns,E,theta,rho,rmse,pred
0,0,Avg_Chloro,"[Avg_Chloro(t-3), BOT_DENS_kgm3(t-2), Nitrate(...",4,1,0.351217,8.740442,"[nan, 1.5397622978020922, 1.6435978792458217, ..."
1,1,Avg_Chloro,"[AVG_SAL_PSU(t-3), Nitrate(t-3), SURF_TEMP_C(t...",4,1,0.350209,8.742336,"[nan, 1.4650608284094104, 1.6796698529465162, ..."
2,2,Avg_Chloro,"[Avg_Chloro(t-2), Avg_Chloro(t-3), BOT_DENS_kg...",4,1,0.343179,8.904246,"[nan, 1.8407562301399036, 3.0543028297170265, ..."
3,3,Avg_Chloro,"[AVG_SAL_PSU(t-1), AVG_SAL_PSU(t-3), Nitrate(t...",4,1,0.365132,8.657377,"[nan, 1.5272993525312646, 1.617552121566141, 1..."
4,4,Avg_Chloro,"[Avg_Chloro(t-2), Avg_Chloro(t-3), BOT_DENS_kg...",4,1,0.343361,8.916747,"[nan, 1.952739458919968, 3.72602817897625, 1.6..."
...,...,...,...,...,...,...,...,...
6708,6708,Avg_Chloro,"[AVG_SAL_PSU(t-1), AVG_SAL_PSU(t-2), AVG_SAL_P...",15,9,0.362485,9.813514,"[nan, 1.4082415513465387, 2.340893156340498, 3..."
6709,6709,Avg_Chloro,"[AVG_SAL_PSU(t-1), AVG_SAL_PSU(t-2), AVG_SAL_P...",15,15,0.351395,10.057025,"[nan, 1.8970143945082638, 2.3193335139797866, ..."
6710,6710,Avg_Chloro,"[AVG_SAL_PSU(t-1), AVG_SAL_PSU(t-2), AVG_SAL_P...",15,25,0.300559,10.824238,"[nan, 3.0499775108242053, 2.1569862968918967, ..."
6711,6711,Avg_Chloro,"[AVG_SAL_PSU(t-1), AVG_SAL_PSU(t-2), AVG_SAL_P...",15,35,0.203449,11.906491,"[nan, 4.2076304346526525, 2.034049968469571, 4..."


In [134]:
data = paper_data
system_variables = ['AVG_SAL_PSU', 'Avg_Chloro', 'Nitrate', 'SURF_TEMP_C','BOT_DENS_kgm3']
target = 'Avg_Chloro'
system_variables = system_variables
Tp = 1
E = 3 # up to -2 
tau = -1
theta_list = [1,5,9,15,25,35,45]
lib = '1 800'  # Library from rows 0 to 700
pred = '801 832'


preds = next_forecast(data,parameters,system_variables,target,Tp,E,tau,theta_list)
preds

300it [00:03, 78.87it/s]                       


array([False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True, False,
       False, False, False,  True,  True])

In [135]:
#Eval
correct = 0
for i in range(700,832):
    preds = next_forecast(data.iloc[:i],parameters,system_variables,target,Tp,E,tau,theta_list)
    label = data['Avg_Chloro'].iloc[i] > 8.013
    if label == preds[-1]:
        correct += 1
        print('correct')
    else:
        print('incorrect')
acc = correct / len(range(700,833))
acc

300it [00:03, 91.51it/s]                       


correct


300it [00:03, 86.66it/s]                       


incorrect


300it [00:03, 79.00it/s]                       


incorrect


300it [00:03, 96.13it/s]                       


correct


300it [00:03, 77.57it/s]                       


correct


 89%|████████▉ | 75/84 [00:00<00:00, 78.10it/s]


KeyboardInterrupt: 