In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from joblib import Parallel, delayed
from tqdm import tqdm
from itertools import product
from itertools import permutations
from itertools import combinations
from pyEDM import *
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import HistGradientBoostingRegressor
import time
import os
import math
import random
from scipy.stats import ttest_ind
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from IPython.display import display, HTML
display(HTML('<style>.container { width:90% !important; }</style>'))

import warnings
warnings.filterwarnings("ignore", 
    message="A worker stopped while some jobs were given to the executor.",
    module="joblib.externals.loky.process_executor")

In [2]:
def get_block(data, num_lags=1, tau=1):
    ''' Get a dataframe with all the possible valid lags of the variables. '''
    
    block = pd.concat([data[var].shift(lag*tau).rename(f'{var}(t-{lag*tau})') for lag in range(num_lags+1) for var in data.columns], axis=1)

    return block

In [3]:
def ccm(interaction, block, E_list, tau_list, theta_list, Tp, sample=50, sig=0.05):
    #solver = HistGradientBoostingRegressor() #TRYING DIFFERNT SOLVER TO ENSURE CONVERGENCE



    print(interaction)
    lib = f'1 {len(block)}'
    
    # Get dataframe with two species of interest
    A = interaction[0]; B = interaction[1]
    df = block[[f'{A}(t-0)', f'{B}(t-0)']]
    
    driver = f'{A}(t-0)'    
    default_output = {
        'target (driver)': A,
        'lib (driven)': B,
        'E': None,
        'tau': None,
        'theta': None,
        'E_tau_theta_results': None,
        'ccm_value': None,
        'convergence_p_value': None,
        'correlation': None
    }
    
    E_tau_theta_results = pd.DataFrame(columns = ['E', 'tau', 'theta', 'rho'])
    for E, tau, theta in list(product(E_list, tau_list, theta_list)):
        driven_embedded = [f'{B}(t{i})' if i < 0 else f'{B}(t-{i})' for i in range(E * tau, 1)]
        driven_embedded = driven_embedded[::tau][:E]
        try:
            c = SMap(dataFrame=block, target=driver, columns=driven_embedded, embedded=True, Tp=Tp, theta=theta, lib=lib, pred=lib, noTime=True)
        except:
            print(f"{interaction} did not converge")
            return default_output
        c = c['predictions'][['Observations', 'Predictions']]
        rho = c.corr().iloc[0,1]
        E_tau_theta_results.loc[len(E_tau_theta_results)] = [E, tau, theta, rho]
    E_tau_theta_results = E_tau_theta_results.apply(pd.to_numeric, errors='coerce')

    # Assign E, tau, and theta to be the optimal E, tau, and theta
    ccm_value = E_tau_theta_results['rho'].max()
    E = int(E_tau_theta_results.loc[np.where(E_tau_theta_results.rho==ccm_value),'E'].item())
    tau = int(E_tau_theta_results.loc[np.where(E_tau_theta_results.rho==ccm_value),'tau'].item())
    theta = int(E_tau_theta_results.loc[np.where(E_tau_theta_results.rho==ccm_value),'theta'].item())
        
    # Get convergence p-value
    try:
        convergence_p_value = get_convergence_p_value(block, sample, A, B, E, Tp, tau, theta)
    except:
        print(f"{interaction} did not converge")
        return default_output

    # Preparing Output
    output = {
        'target (driver)': A,
        'lib (driven)': B,
        'E': E,
        'tau': tau,
        'theta': theta,
        'E_tau_theta_results': E_tau_theta_results,
        'ccm_value': ccm_value,
        'convergence_p_value': convergence_p_value,
        'correlation': df.corr().iloc[0,1]
    }

    return output

def get_convergence_p_value(df, sample, A, B, E, Tp, tau, theta):
    # Get convergence p-value for CCM (one-tailed t-test on cross-map values using 20% and 50% library sizes)
    # H0: μ_20% ≥ μ_50%
    # HA: μ_20% < μ_50%
    # If p < 0.05, the 20% library size trials have a rho that is significantly smaller than the 50% library trials  
    
    libsize1 = int(np.ceil(df.shape[0]/5))   # 20% of the full library size
    libsize2 = int(np.ceil(df.shape[0]/2))   # 50% of the full library size
    
    max_iterations = 10 * sample
    
    # Get list of rhos for libsize1
    rhos1 = []; iteration_count = 0
    while len(rhos1) < sample and iteration_count < max_iterations:
        start = np.random.randint(libsize1, len(df))
        library = [start - libsize1, start]
        data_subset = df.iloc[library[0]:library[1]]
        lib = f'{library[0]+1} {library[1]+1}'
        driver = f'{A}(t-0)'
        driven_embedded = [f'{B}(t{i})' if i < 0 else f'{B}(t-{i})' for i in range(E * tau, 1)]
        driven_embedded = driven_embedded[::tau][:E]
        c = SMap(dataFrame=block, target=driver, columns=driven_embedded, embedded=True, Tp=Tp, theta=theta, lib=lib, pred=lib, noTime=True)
        c = c['predictions'][['Observations', 'Predictions']]
        rho1 = c.corr().iloc[0,1]
        if not np.isnan(rho1):
            rhos1.append(rho1)
        iteration_count += 1
        
    # Get list of rhos for libsize2
    rhos2 = []; iteration_count = 0
    while len(rhos2) < sample and iteration_count < max_iterations:
        start = np.random.randint(libsize2, len(df))
        library = [start - libsize2, start]
        data_subset = df.iloc[library[0]:library[1]]
        lib = f'{library[0]+1} {library[1]+1}'
        driver = f'{A}(t-0)'
        driven_embedded = [f'{B}(t{i})' if i < 0 else f'{B}(t-{i})' for i in range(E * tau, 1)]
        driven_embedded = driven_embedded[::tau][:E]
        c = SMap(dataFrame=block, target=driver, columns=driven_embedded, embedded=True, Tp=Tp, theta=theta, lib=lib, pred=lib, noTime=True)
        c = c['predictions'][['Observations', 'Predictions']]
        rho2 = c.corr().iloc[0,1]
        if not np.isnan(rho2):
            rhos2.append(rho2)
        iteration_count += 1
    
    convergence_t_stat, convergence_p_value = ttest_ind(rhos1, rhos2, alternative='less')
    
    return convergence_p_value


In [4]:
HAB_data = pd.read_csv('Data/d1carter_data_w_gaps.csv', index_col=0) #LOAD DATA
HAB_data.columns

Index(['Temp', 'Chl1', 'Chl2', 'Avg_Chloro', 'Phaeo1', 'Phaeo2', 'Avg_Phaeo',
       'Nitrate', 'Phosphate', 'Silicate', 'Nitrite', 'Ammonium',
       'CellCountDetection_Limit', 'Total_Diatoms', 'Akashiwo_sanguinea',
       'Total_Tripos', 'Total_Cochlodinium_spp', 'Lingulodinium_polyedra',
       'Total_Prorocentrum_spp', 'Total_Dinoflagellates',
       'Total_Phytoplankton', 'DATE', 'SURF_SAL_PSU', 'BOT_SAL_PSU',
       'SURF_TEMP_C', 'BOT_TEMP_C', 'WSPD', 'time', 'SURF_DENS_kgm3',
       'BOT_DENS_kgm3', 'AVG_TEMP_C', 'AVG_SAL_PSU', 'AVG_DENS_kgm3'],
      dtype='object')

In [5]:
#HAB_data = pd.read_csv('Data/data_w_gaps_and_wind.csv', index_col=0)#.iloc[304:612] RANGE w/o missing values
HAB_data = HAB_data.reset_index()
HAB_data.columns = HAB_data.columns.str.replace(' ', '_')

# Put columns in alphabetical order
sorted_columns = sorted(HAB_data.columns)
HAB_data = HAB_data[sorted_columns]
HAB_data = HAB_data.set_index('time')
target = 'Lingulodinium_polyedra' #change target if needed

# Make indices integers and save mapping to dates
#date_to_int_map = {i: date for i, date in enumerate(HAB_data.index)}
#HAB_data.index = range(len(HAB_data))

HAB_data

Unnamed: 0_level_0,AVG_DENS_kgm3,AVG_SAL_PSU,AVG_TEMP_C,Akashiwo_sanguinea,Ammonium,Avg_Chloro,Avg_Phaeo,BOT_DENS_kgm3,BOT_SAL_PSU,BOT_TEMP_C,...,Silicate,Temp,Total_Cochlodinium_spp,Total_Diatoms,Total_Dinoflagellates,Total_Phytoplankton,Total_Prorocentrum_spp,Total_Tripos,WSPD,index
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1024.129374,33.695,18.10,0.0,0.53,1.82,0.60,1024.620655,33.62,15.9,...,5.6,19.8,0.0,69952.0,15684.0,85636.0,5168.0,,4.691667,0
1,1024.289049,33.640,17.35,89.0,1.06,2.71,1.18,1024.391719,33.62,16.9,...,5.1,17.3,0.0,167529.0,5703.0,173233.0,1158.0,,2.787500,1
2,1023.563298,33.660,20.25,89.0,0.35,0.99,0.55,1024.026040,33.58,18.3,...,3.5,21.7,267.0,67101.0,10515.0,77616.0,1069.0,,3.412500,2
3,1023.740812,33.650,9.90,0.0,0.55,1.23,0.48,1023.780535,33.62,19.4,...,4.2,20.2,0.0,24149.0,4456.0,28605.0,623.0,,3.495833,3
4,1023.004896,33.710,11.25,0.0,0.62,2.41,0.39,1023.000725,33.69,22.5,...,3.8,22.4,0.0,27357.0,4990.0,32347.0,1069.0,,2.604167,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
819,1024.585481,33.385,15.20,0.0,0.41,2.86,0.46,1024.636187,33.38,15.0,...,4.1,15.6,626.0,43180.0,92619.0,135799.0,2503.0,30664.0,2.525694,819
820,1024.698295,33.350,14.55,0.0,,1.50,0.60,1024.712299,33.34,14.5,...,,14.4,0.0,53819.0,36297.0,90116.0,626.0,1877.0,1.893750,820
821,1024.940230,33.370,13.45,0.0,,3.38,1.00,1024.953741,33.36,13.4,...,,13.8,0.0,95122.0,63206.0,158328.0,1252.0,26910.0,5.088889,821
822,,,,0.0,,9.71,1.49,,,,...,,13.2,0.0,287243.0,165212.0,452455.0,22529.0,75096.0,2.813194,822


In [6]:
HAB_data.columns

Index(['AVG_DENS_kgm3', 'AVG_SAL_PSU', 'AVG_TEMP_C', 'Akashiwo_sanguinea',
       'Ammonium', 'Avg_Chloro', 'Avg_Phaeo', 'BOT_DENS_kgm3', 'BOT_SAL_PSU',
       'BOT_TEMP_C', 'CellCountDetection_Limit', 'Chl1', 'Chl2', 'DATE',
       'Lingulodinium_polyedra', 'Nitrate', 'Nitrite', 'Phaeo1', 'Phaeo2',
       'Phosphate', 'SURF_DENS_kgm3', 'SURF_SAL_PSU', 'SURF_TEMP_C',
       'Silicate', 'Temp', 'Total_Cochlodinium_spp', 'Total_Diatoms',
       'Total_Dinoflagellates', 'Total_Phytoplankton',
       'Total_Prorocentrum_spp', 'Total_Tripos', 'WSPD', 'index'],
      dtype='object')

In [7]:
print(HAB_data.isna().sum())

AVG_DENS_kgm3                97
AVG_SAL_PSU                  94
AVG_TEMP_C                   95
Akashiwo_sanguinea           22
Ammonium                      5
Avg_Chloro                    1
Avg_Phaeo                     1
BOT_DENS_kgm3                95
BOT_SAL_PSU                  92
BOT_TEMP_C                   94
CellCountDetection_Limit     22
Chl1                          1
Chl2                          2
DATE                          0
Lingulodinium_polyedra       22
Nitrate                       4
Nitrite                       4
Phaeo1                        1
Phaeo2                        2
Phosphate                     4
SURF_DENS_kgm3               52
SURF_SAL_PSU                 50
SURF_TEMP_C                  51
Silicate                      4
Temp                          0
Total_Cochlodinium_spp       22
Total_Diatoms                22
Total_Dinoflagellates        22
Total_Phytoplankton          22
Total_Prorocentrum_spp       22
Total_Tripos                153
WSPD    

In [8]:
#IMPUTE HAB DATA
#Build basic linear regression model as sanity check
# Custom impute missing values with the average of the value in front and behind of it 
class ForwardBackwardImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_filled_forward = X.fillna(method='ffill').fillna(method='bfill')
        X_filled_backward = X.fillna(method='bfill').fillna(method='ffill')

        return (X_filled_forward + X_filled_backward) / 2


Imputer = ForwardBackwardImputer()
HAB_data = HAB_data.apply(pd.to_numeric, errors='coerce')
Imputer.fit(HAB_data)
HAB_data = Imputer.transform(HAB_data)#COMMENT OUT IF DONT WANT MEAN MPUTE
HAB_data

  X_filled_forward = X.fillna(method='ffill').fillna(method='bfill')
  X_filled_backward = X.fillna(method='bfill').fillna(method='ffill')


Unnamed: 0_level_0,AVG_DENS_kgm3,AVG_SAL_PSU,AVG_TEMP_C,Akashiwo_sanguinea,Ammonium,Avg_Chloro,Avg_Phaeo,BOT_DENS_kgm3,BOT_SAL_PSU,BOT_TEMP_C,...,Silicate,Temp,Total_Cochlodinium_spp,Total_Diatoms,Total_Dinoflagellates,Total_Phytoplankton,Total_Prorocentrum_spp,Total_Tripos,WSPD,index
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1024.129374,33.6950,18.100,0.0,0.53,1.82,0.60,1024.620655,33.62,15.90,...,5.6,19.8,0.0,69952.0,15684.0,85636.0,5168.0,0.0,4.691667,0.0
1,1024.289049,33.6400,17.350,89.0,1.06,2.71,1.18,1024.391719,33.62,16.90,...,5.1,17.3,0.0,167529.0,5703.0,173233.0,1158.0,0.0,2.787500,1.0
2,1023.563298,33.6600,20.250,89.0,0.35,0.99,0.55,1024.026040,33.58,18.30,...,3.5,21.7,267.0,67101.0,10515.0,77616.0,1069.0,0.0,3.412500,2.0
3,1023.740812,33.6500,9.900,0.0,0.55,1.23,0.48,1023.780535,33.62,19.40,...,4.2,20.2,0.0,24149.0,4456.0,28605.0,623.0,0.0,3.495833,3.0
4,1023.004896,33.7100,11.250,0.0,0.62,2.41,0.39,1023.000725,33.69,22.50,...,3.8,22.4,0.0,27357.0,4990.0,32347.0,1069.0,0.0,2.604167,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
819,1024.585481,33.3850,15.200,0.0,0.41,2.86,0.46,1024.636187,33.38,15.00,...,4.1,15.6,626.0,43180.0,92619.0,135799.0,2503.0,30664.0,2.525694,819.0
820,1024.698295,33.3500,14.550,0.0,0.41,1.50,0.60,1024.712299,33.34,14.50,...,4.1,14.4,0.0,53819.0,36297.0,90116.0,626.0,1877.0,1.893750,820.0
821,1024.940230,33.3700,13.450,0.0,0.41,3.38,1.00,1024.953741,33.36,13.40,...,4.1,13.8,0.0,95122.0,63206.0,158328.0,1252.0,26910.0,5.088889,821.0
822,1024.930831,33.3775,13.525,0.0,0.41,9.71,1.49,1024.951346,33.37,13.45,...,4.1,13.2,0.0,287243.0,165212.0,452455.0,22529.0,75096.0,2.813194,822.0


In [9]:
print(HAB_data.isna().sum())

AVG_DENS_kgm3                 0
AVG_SAL_PSU                   0
AVG_TEMP_C                    0
Akashiwo_sanguinea            0
Ammonium                      0
Avg_Chloro                    0
Avg_Phaeo                     0
BOT_DENS_kgm3                 0
BOT_SAL_PSU                   0
BOT_TEMP_C                    0
CellCountDetection_Limit      0
Chl1                          0
Chl2                          0
DATE                        824
Lingulodinium_polyedra        0
Nitrate                       0
Nitrite                       0
Phaeo1                        0
Phaeo2                        0
Phosphate                     0
SURF_DENS_kgm3                0
SURF_SAL_PSU                  0
SURF_TEMP_C                   0
Silicate                      0
Temp                          0
Total_Cochlodinium_spp        0
Total_Diatoms                 0
Total_Dinoflagellates         0
Total_Phytoplankton           0
Total_Prorocentrum_spp        0
Total_Tripos                  0
WSPD    

In [10]:
#STANDARDIZE
#LEFT OFF HERE THERE IS AN ERROR
'''
scaler = MinMaxScaler()
columns = ['AVG_DENS_kgm3', 'AVG_SAL_PSU', 'AVG_TEMP_C', 'Avg_Chloro', 
           'BOT_DENS_kgm3', 'BOT_SAL_PSU', 'BOT_TEMP_C', 'Nitrate', 'Nitrite', 
           'Phosphate', 'SURF_DENS_kgm3', 'SURF_SAL_PSU', 'SURF_TEMP_C', 
           'Silicate', 'WSPD']
HAB_data[columns] = scaler.fit_transform(HAB_data[columns])
HAB_data
'''

"\nscaler = MinMaxScaler()\ncolumns = ['AVG_DENS_kgm3', 'AVG_SAL_PSU', 'AVG_TEMP_C', 'Avg_Chloro', \n           'BOT_DENS_kgm3', 'BOT_SAL_PSU', 'BOT_TEMP_C', 'Nitrate', 'Nitrite', \n           'Phosphate', 'SURF_DENS_kgm3', 'SURF_SAL_PSU', 'SURF_TEMP_C', \n           'Silicate', 'WSPD']\nHAB_data[columns] = scaler.fit_transform(HAB_data[columns])\nHAB_data\n"

In [11]:
#HAB_data.drop(columns=[]) # FOR MELISSA CARTER DATA
block = get_block(HAB_data, num_lags=50, tau=1)
block

Unnamed: 0_level_0,AVG_DENS_kgm3(t-0),AVG_SAL_PSU(t-0),AVG_TEMP_C(t-0),Akashiwo_sanguinea(t-0),Ammonium(t-0),Avg_Chloro(t-0),Avg_Phaeo(t-0),BOT_DENS_kgm3(t-0),BOT_SAL_PSU(t-0),BOT_TEMP_C(t-0),...,Silicate(t-50),Temp(t-50),Total_Cochlodinium_spp(t-50),Total_Diatoms(t-50),Total_Dinoflagellates(t-50),Total_Phytoplankton(t-50),Total_Prorocentrum_spp(t-50),Total_Tripos(t-50),WSPD(t-50),index(t-50)
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1024.129374,33.6950,18.100,0.0,0.53,1.82,0.60,1024.620655,33.62,15.90,...,,,,,,,,,,
1,1024.289049,33.6400,17.350,89.0,1.06,2.71,1.18,1024.391719,33.62,16.90,...,,,,,,,,,,
2,1023.563298,33.6600,20.250,89.0,0.35,0.99,0.55,1024.026040,33.58,18.30,...,,,,,,,,,,
3,1023.740812,33.6500,9.900,0.0,0.55,1.23,0.48,1023.780535,33.62,19.40,...,,,,,,,,,,
4,1023.004896,33.7100,11.250,0.0,0.62,2.41,0.39,1023.000725,33.69,22.50,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
819,1024.585481,33.3850,15.200,0.0,0.41,2.86,0.46,1024.636187,33.38,15.00,...,3.0,16.2,0.0,21903.0,23780.0,45683.0,1252.0,0.0,2.476389,769.0
820,1024.698295,33.3500,14.550,0.0,0.41,1.50,0.60,1024.712299,33.34,14.50,...,3.3,16.5,0.0,11264.0,60077.0,71341.0,3755.0,6884.0,2.687500,770.0
821,1024.940230,33.3700,13.450,0.0,0.41,3.38,1.00,1024.953741,33.36,13.40,...,2.9,17.4,0.0,26909.0,38800.0,65709.0,5632.0,2503.0,3.760417,771.0
822,1024.930831,33.3775,13.525,0.0,0.41,9.71,1.49,1024.951346,33.37,13.45,...,2.4,16.9,0.0,26909.0,65083.0,91992.0,6258.0,1878.0,3.914583,772.0


In [12]:
E_list = range(2,13)
tau_list = [-1,-2,-3] #can try more taus
theta_list = [0,0.1,0.5,1,2,3,4,5,6,7,8,9]
Tp = 0
exclusion_radius = 0

all_ccm_results = pd.DataFrame()
interactions = list(permutations(HAB_data.columns.tolist(),2))
print(interactions)
target_interactions = [pair for pair in interactions if target in pair]

interaction = target_interactions[0]
print(f'There are {len(target_interactions)} interactions')

results = Parallel(n_jobs=8)(
    delayed(ccm)(interaction, block, E_list, tau_list, theta_list, Tp) for interaction in target_interactions)
results_df = pd.DataFrame(results)

[('AVG_DENS_kgm3', 'AVG_SAL_PSU'), ('AVG_DENS_kgm3', 'AVG_TEMP_C'), ('AVG_DENS_kgm3', 'Akashiwo_sanguinea'), ('AVG_DENS_kgm3', 'Ammonium'), ('AVG_DENS_kgm3', 'Avg_Chloro'), ('AVG_DENS_kgm3', 'Avg_Phaeo'), ('AVG_DENS_kgm3', 'BOT_DENS_kgm3'), ('AVG_DENS_kgm3', 'BOT_SAL_PSU'), ('AVG_DENS_kgm3', 'BOT_TEMP_C'), ('AVG_DENS_kgm3', 'CellCountDetection_Limit'), ('AVG_DENS_kgm3', 'Chl1'), ('AVG_DENS_kgm3', 'Chl2'), ('AVG_DENS_kgm3', 'DATE'), ('AVG_DENS_kgm3', 'Lingulodinium_polyedra'), ('AVG_DENS_kgm3', 'Nitrate'), ('AVG_DENS_kgm3', 'Nitrite'), ('AVG_DENS_kgm3', 'Phaeo1'), ('AVG_DENS_kgm3', 'Phaeo2'), ('AVG_DENS_kgm3', 'Phosphate'), ('AVG_DENS_kgm3', 'SURF_DENS_kgm3'), ('AVG_DENS_kgm3', 'SURF_SAL_PSU'), ('AVG_DENS_kgm3', 'SURF_TEMP_C'), ('AVG_DENS_kgm3', 'Silicate'), ('AVG_DENS_kgm3', 'Temp'), ('AVG_DENS_kgm3', 'Total_Cochlodinium_spp'), ('AVG_DENS_kgm3', 'Total_Diatoms'), ('AVG_DENS_kgm3', 'Total_Dinoflagellates'), ('AVG_DENS_kgm3', 'Total_Phytoplankton'), ('AVG_DENS_kgm3', 'Total_Prorocentrum_

('Akashiwo_sanguinea', 'Lingulodinium_polyedra')
('AVG_TEMP_C', 'Lingulodinium_polyedra')
('AVG_DENS_kgm3', 'Lingulodinium_polyedra')
('AVG_SAL_PSU', 'Lingulodinium_polyedra')
('Avg_Chloro', 'Lingulodinium_polyedra')
('Avg_Phaeo', 'Lingulodinium_polyedra')
('BOT_DENS_kgm3', 'Lingulodinium_polyedra')
('Ammonium', 'Lingulodinium_polyedra')
('BOT_SAL_PSU', 'Lingulodinium_polyedra')
('BOT_TEMP_C', 'Lingulodinium_polyedra')
('CellCountDetection_Limit', 'Lingulodinium_polyedra')
('Chl1', 'Lingulodinium_polyedra')
('Chl2', 'Lingulodinium_polyedra')
('DATE', 'Lingulodinium_polyedra')
('DATE', 'Lingulodinium_polyedra') did not converge
('Lingulodinium_polyedra', 'AVG_DENS_kgm3')
('Lingulodinium_polyedra', 'AVG_SAL_PSU')
('Lingulodinium_polyedra', 'AVG_TEMP_C')
('Lingulodinium_polyedra', 'Akashiwo_sanguinea')
('Lingulodinium_polyedra', 'Ammonium')
('Lingulodinium_polyedra', 'Avg_Chloro')
('Lingulodinium_polyedra', 'Avg_Phaeo')
('Lingulodinium_polyedra', 'BOT_DENS_kgm3')
('Lingulodinium_polyedra'

  warn( msg )


('Lingulodinium_polyedra', 'Nitrite')
('Lingulodinium_polyedra', 'Phaeo1')
('Lingulodinium_polyedra', 'Phaeo2')
('Lingulodinium_polyedra', 'Phosphate')
('Lingulodinium_polyedra', 'SURF_DENS_kgm3')


  distRowScale = self.theta / distRowMean
  W = exp( -distRowScale[:,None] * self.knn_distances )


 ** On entry to DLASCL parameter number  4 had an illegal value
 ** On entry to DLASCL parameter number  4 had an illegal value
('Lingulodinium_polyedra', 'Nitrite') did not converge
('Lingulodinium_polyedra', 'SURF_SAL_PSU')
('Lingulodinium_polyedra', 'SURF_TEMP_C')
('Lingulodinium_polyedra', 'Silicate')
('Lingulodinium_polyedra', 'Temp')
('Lingulodinium_polyedra', 'Total_Cochlodinium_spp')
('Lingulodinium_polyedra', 'Total_Diatoms')
('Lingulodinium_polyedra', 'Total_Dinoflagellates')
('Lingulodinium_polyedra', 'Total_Phytoplankton')
('Lingulodinium_polyedra', 'Total_Prorocentrum_spp')
('Lingulodinium_polyedra', 'Total_Tripos')
('Lingulodinium_polyedra', 'WSPD')
('Lingulodinium_polyedra', 'index')
('Nitrate', 'Lingulodinium_polyedra')
('Nitrite', 'Lingulodinium_polyedra')
('Phaeo1', 'Lingulodinium_polyedra')
('Phaeo2', 'Lingulodinium_polyedra')
('Phosphate', 'Lingulodinium_polyedra')
('SURF_DENS_kgm3', 'Lingulodinium_polyedra')
('SURF_SAL_PSU', 'Lingulodinium_polyedra')
('SURF_TEMP_C'

In [13]:
# Get CCM results that show convergence (convergence p-value < 0.05)
results_df = pd.DataFrame(results)
ccm_cutoff = -1

significant_results = results_df[results_df.convergence_p_value<0.20]
significant_results = significant_results.sort_values(by='ccm_value', ascending=False)
significant_results = significant_results[['target (driver)', 'lib (driven)', 'E', 'tau', 'theta', 'ccm_value']].reset_index(drop=True)

display(significant_results[significant_results.ccm_value>ccm_cutoff])

# Choose system variables where the CCM value to or from the target is > ccm_cutoff
system_variables = significant_results[significant_results.ccm_value > ccm_cutoff]
system_variables = system_variables[['target (driver)', 'lib (driven)']].values.flatten().tolist()
system_variables = list(set(system_variables))
print('system variables: ')
display(sorted(system_variables))

Unnamed: 0,target (driver),lib (driven),E,tau,theta,ccm_value
0,Chl1,Lingulodinium_polyedra,2.0,-1.0,3.0,0.969719
1,Avg_Chloro,Lingulodinium_polyedra,2.0,-1.0,4.0,0.9652
2,Chl2,Lingulodinium_polyedra,2.0,-1.0,6.0,0.947083
3,Phaeo1,Lingulodinium_polyedra,2.0,-3.0,0.0,0.91856
4,Lingulodinium_polyedra,Chl1,12.0,-3.0,9.0,0.911672
5,Phaeo2,Lingulodinium_polyedra,2.0,-1.0,3.0,0.904435
6,Avg_Phaeo,Lingulodinium_polyedra,2.0,-1.0,1.0,0.879394
7,Lingulodinium_polyedra,Phaeo1,2.0,-2.0,7.0,0.876502
8,Lingulodinium_polyedra,Avg_Chloro,12.0,-3.0,9.0,0.849122
9,Lingulodinium_polyedra,Avg_Phaeo,2.0,-3.0,0.0,0.78369


system variables: 


['AVG_DENS_kgm3',
 'AVG_SAL_PSU',
 'Akashiwo_sanguinea',
 'Ammonium',
 'Avg_Chloro',
 'Avg_Phaeo',
 'BOT_SAL_PSU',
 'CellCountDetection_Limit',
 'Chl1',
 'Chl2',
 'Lingulodinium_polyedra',
 'Phaeo1',
 'Phaeo2',
 'Phosphate',
 'SURF_DENS_kgm3',
 'SURF_SAL_PSU',
 'SURF_TEMP_C',
 'Silicate',
 'Temp',
 'Total_Cochlodinium_spp',
 'Total_Diatoms',
 'Total_Dinoflagellates',
 'Total_Phytoplankton',
 'Total_Tripos',
 'WSPD']

In [14]:
results_df[results_df.convergence_p_value<0.9].sort_values(by='convergence_p_value', ascending=False)

Unnamed: 0,target (driver),lib (driven),E,tau,theta,E_tau_theta_results,ccm_value,convergence_p_value,correlation
38,Lingulodinium_polyedra,Total_Cochlodinium_spp,12.0,-2.0,9.0,E tau theta rho 0 2.0 -1.0...,0.227013,0.8909779,-0.001502
60,Total_Prorocentrum_spp,Lingulodinium_polyedra,12.0,-3.0,9.0,E tau theta rho 0 2.0 -1.0...,0.166313,0.8589938,0.008152
8,BOT_SAL_PSU,Lingulodinium_polyedra,12.0,-3.0,1.0,E tau theta rho 0 2.0 -1.0...,0.114813,0.8566853,0.015365
50,Phosphate,Lingulodinium_polyedra,12.0,-3.0,9.0,E tau theta rho 0 2.0 -1.0...,0.16193,0.8456245,-0.026014
43,Lingulodinium_polyedra,Total_Tripos,12.0,-3.0,9.0,E tau theta rho 0 2.0 -1.0...,0.12684,0.8423971,0.001131
9,BOT_TEMP_C,Lingulodinium_polyedra,12.0,-3.0,2.0,E tau theta rho 0 2.0 -1.0...,0.218602,0.831032,7.3e-05
39,Lingulodinium_polyedra,Total_Diatoms,11.0,-3.0,5.0,E tau theta rho 0 2.0 -1.0...,0.095217,0.7799015,-0.009189
63,index,Lingulodinium_polyedra,11.0,-2.0,2.0,E tau theta rho 0 2.0 -1.0...,0.52723,0.6982002,0.03254
7,BOT_DENS_kgm3,Lingulodinium_polyedra,12.0,-3.0,3.0,E tau theta rho 0 2.0 -1.0...,0.218769,0.6469625,0.005638
47,Nitrite,Lingulodinium_polyedra,5.0,-3.0,7.0,E tau theta rho 0 2.0 -1.0...,0.166731,0.6186216,-0.007606
