# SST gradient

This notebook produces the SST gradient that will be compared against the d15N gradient in a separate notebook. The steps are as follows:

- Define options
- Read in the source data Excel file
- Apply filters based on various indices
- Reject by site (if any specified)
- Separate data into low & mid latitude vs. high latitude
- Run Lo(w)ess to calculate data for regularly sampled ages
- Calculate gradients between low & mid latitudes and high latitudes

Import required packages and define options, including

- exclusion thresholds
- smoothing
- ages to drop
- columns to process
- plotting ranges
- clipping

In [1]:
from loguru import logger
from pathlib import Path
from typing import List, Tuple, Optional
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import paleos.interpolate as pin

from utils import get_SST_source_data, fill_nans, apply_final_filtering

# SITES TO EXCLUDE
# Values are matched against the "Site" column; an empty list removes nothing.
SITES_TO_EXCLUDE = []
# SMOOTHING
# Either "LOWESS" or "LOESS" (compared case-insensitively when smoothing is run).
# The value is also embedded in output column names and CSV file names.
SMOOTHING_METHOD = "LOESS"
# Passed straight through to the paleos interpolation routine.
# NOTE(review): presumably the fraction of points used per local fit - confirm in paleos.
SMOOTHING_FACTOR = 0.02
# Spacing of the regular output age grid (ages are labelled "Age (Ma)" in the plots).
TIME_STEP = 0.2
# Regular age grid the smoothed series are evaluated on; the stop value 57.5 is exclusive.
OUTPUT_AGES = np.arange(0, 57.5, TIME_STEP)
# AGES TO DROP FROM FINAL RESULT
# When True, the gradient columns are set to NaN inside each AGES_TO_NAN interval.
NAN_AGES = True
# [start, end] age pairs; they are applied as label slices on the gradient index.
AGES_TO_NAN = [[0.2, 4.4], [17.6, 22.5], [30.9, 31.7]]
# COLUMNS TO PROCESS & PLOTTING RANGES
# Source columns that are smoothed and differenced; each yields a "<column> Gradient" column.
COLUMNS = ["SST Bayspar (°C)"]
# y-axis range used for the gradient plots.
PLOT_RANGE = [-5, 30]
# CLIP GRADIENTS
# When True, gradient values are clipped to [MIN_GRADIENT, MAX_GRADIENT].
CLIP_GRADIENTS = False
MIN_GRADIENT = -5
MAX_GRADIENT = 25


Define output paths and read the SST data source file. Set the age as the index after converting it to a float (decimals).

In [2]:
# Directory that receives every CSV written by this notebook.
output_path = Path("data", "gradients")
# parents=True also creates the intermediate "data" directory on a fresh checkout,
# and exist_ok=True makes re-running the cell a no-op instead of an error.
output_path.mkdir(parents=True, exist_ok=True)

# Load the SST source table and index it by age so that later steps can
# select, sort and resample on "Age (Ma)".
df = get_SST_source_data()
# Ages must be floats before they become the index.
df["Age (Ma)"] = df["Age (Ma)"].astype(float)
df = df.set_index("Age (Ma)")
df = df.sort_index()
df.index.name = "Age (Ma)"


Do a quality control plot of the data

In [3]:
# Quality-control scatter of the Bayspar SST values, coloured by latitude
# category. Rows without a latitude category are left out of the plot.
fig = (
    px.scatter(
        df.dropna(subset=["Latitude category"]),
        y="SST Bayspar (°C)",
        color="Latitude category",
        hover_data=[
            "Site",
            "TEX",
            "SSTH (°C)",
            "SST Bayspar (°C)",
            "MI",
            "%GDGTrs",
            "Cren'",
            "Reference",
            "Age-adjusted paleolatitude",
            "Latitude category",
        ],
    )
    # Each update_* call returns the figure itself, so the calls can be chained.
    .update_layout(
        title_text="SST points",
        height=500,
        margin=go.layout.Margin(l=60, r=80, b=60, t=80),
        xaxis_title="Age (Ma)",
        yaxis_title="Temperature (C)",
    )
    .update_xaxes(range=[0, 60])
    .update_yaxes(range=[-5, 50])
)
fig.show()


Fill some NaN values with default values to help select data points later. Then apply the final filtering (acceptance and rejection of samples) based on set thresholds (see utils.py for more details).

In [4]:
# Fill selected NaN values with defaults so that later selections behave
# (the defaults themselves are defined in utils.fill_nans).
df = fill_nans(df)
# Returns the filtered samples together with a string ("postpend") that is
# embedded in the names of the output files written for the filtered data.
filtered, postpend = apply_final_filtering(df)
# Keep a copy of the filtered samples on disk next to the gradient outputs.
filtered.to_csv(output_path / f"SST_filtered{postpend}.csv")


2022-07-15 17:40:42.399 | INFO     | utils:filter_mi:80 - Filtering on methane index < 0.4
2022-07-15 17:40:42.401 | INFO     | utils:filter_mi:81 - number of rows pre filter = 5170
2022-07-15 17:40:42.415 | INFO     | utils:filter_mi:83 - number of rows post filter = 5006
2022-07-15 17:40:42.417 | INFO     | utils:filter_gdgtrs:89 - Filtering on GDGTRS < 30
2022-07-15 17:40:42.419 | INFO     | utils:filter_gdgtrs:90 - number of rows pre filter = 5006
2022-07-15 17:40:42.430 | INFO     | utils:filter_gdgtrs:92 - number of rows post filter = 4800
2022-07-15 17:40:42.431 | INFO     | utils:filter_bit_ringstetra:107 - Filtering on BIT > 0.4 and RINGSTETRA < 0.7
2022-07-15 17:40:42.434 | INFO     | utils:filter_bit_ringstetra:110 - number of rows pre filter = 4800
2022-07-15 17:40:42.445 | INFO     | utils:filter_bit_ringstetra:114 - number of rows post filter = 4518


Rejection of any specified sites

In [5]:
def remove_sites(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of ``df`` whose "Site" is not in ``SITES_TO_EXCLUDE``.

    The list of excluded sites is read from the module-level option and is
    logged before the selection is made.
    """
    logger.info(f"Removing sites {SITES_TO_EXCLUDE}")
    is_excluded = df["Site"].isin(SITES_TO_EXCLUDE)
    return df[~is_excluded]


# The unfiltered data set starts from a copy of the source frame; the filtered
# data set is the output of the index-based filtering step.
original = remove_sites(df.copy())
filtered = remove_sites(filtered)


2022-07-15 17:40:44.160 | INFO     | __main__:remove_sites:3 - Removing sites []
2022-07-15 17:40:44.170 | INFO     | __main__:remove_sites:3 - Removing sites []


Separate out latitude categories

- Low and mid latitude are one set
- High latitude is the other set

In [6]:
def separate_latitudes(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split ``df`` by its "Latitude category" column.

    Returns a ``(low_and_mid, high)`` tuple: rows labelled "low latitude" or
    "mid latitude" go to the first frame, rows labelled "high latitude" to the
    second. Rows with any other (or missing) category appear in neither.
    """
    category = df["Latitude category"]
    in_low_or_mid = category.isin(["low latitude", "mid latitude"])
    in_high = category == "high latitude"
    return df[in_low_or_mid], df[in_high]


# Split both the unfiltered and the filtered data into the two latitude groups
# whose difference forms the gradient.
original_low_and_mid, original_high = separate_latitudes(original)
filtered_low_and_mid, filtered_high = separate_latitudes(filtered)


Plots of high latitude data points vs mid and low latitude data points for:

- Data without any index-based filtering
- Data filtered on various indices (e.g. methane index, GDGTRS etc.)

In [7]:
def plot_lats(
    fig,
    low_and_mid: pd.DataFrame,
    high: pd.DataFrame,
    columns: List[str],
    mode: str = "lines+markers",
):
    """Add low & mid latitude and high latitude traces to a subplot figure.

    For every entry of ``columns`` two scatter traces are added to the
    matching subplot row (first column -> row 1, and so on), always in
    column 1: the low & mid latitude values in orange, then the high latitude
    values in blue. The frame index is used for the x values.

    Returns the same ``fig`` object that was passed in.
    """
    # (legend label, data, line colour) in the order the traces are added
    groups = (
        ("Low and mid", low_and_mid, "orange"),
        ("High", high, "blue"),
    )
    for row_number, column in enumerate(columns, start=1):
        for label, frame, colour in groups:
            trace = go.Scatter(
                x=frame.index,
                y=frame[column],
                mode=mode,
                legendgroup=label,
                name=label,
                line=dict(color=colour),
            )
            fig.add_trace(trace, row=row_number, col=1)
    return fig


# High vs low and mid latitudes, once for the original data and once for the
# filtered data. Both figures share the same layout; only titles and inputs differ.
for panel_titles, low_mid_points, high_points in (
    ([f"No filtering {x}" for x in COLUMNS], original_low_and_mid, original_high),
    ([f"Filtered {x}" for x in COLUMNS], filtered_low_and_mid, filtered_high),
):
    # one subplot row per processed column
    fig = make_subplots(
        rows=len(COLUMNS),
        cols=1,
        shared_xaxes=True,
        subplot_titles=panel_titles,
        vertical_spacing=0.05,
    )
    fig = plot_lats(fig, low_mid_points, high_points, COLUMNS)
    fig.update_layout(
        height=400,
        margin=go.layout.Margin(l=70, r=40, b=60, t=40),
        xaxis_title="Age (Ma)",
        yaxis_title="Temperature (C)",
    )
    fig.show()


Now calculate the gradient for:

- the non-filtered data
- the filtered data

Calculating the gradient involves:

- Using Lo(w)ess to resample the SST data for high latitudes to regular intervals
- Using Lo(w)ess to resample the SST data for mid and low latitudes to regular intervals
- Calculating the gradient as the difference of the two (low & mid latitudes minus high latitudes)

In [8]:
def lowess_run(
    series: pd.Series,
    smoothing_factor: float,
    output_ages: np.ndarray,
    method: str = "LOWESS",
) -> pd.Series:
    """Smooth/resample one series onto ``output_ages`` with Lo(w)ess.

    NaN values are dropped from ``series`` first. ``method`` is compared
    case-insensitively: "lowess" uses ``pin.lowess_sm_interpolate`` and
    "loess" uses ``pin.loess_ext_interpolate``; the result of that call is
    returned unchanged.

    Raises:
        ValueError: if ``method`` is neither "lowess" nor "loess".
    """
    cleaned = series.dropna()
    method_key = method.lower()
    if method_key == "lowess":
        logger.info("Using LOWESS with statsmodels")
        return pin.lowess_sm_interpolate(cleaned, output_ages, smoothing_factor)
    if method_key == "loess":
        logger.info("Using LOESS with external function")
        return pin.loess_ext_interpolate(cleaned, output_ages, smoothing_factor)
    raise ValueError(f"Unknown method {method}")


def lowess_gradient(
    low_and_mid: pd.DataFrame,
    high: pd.DataFrame,
    smoothing_factor: float,
    output_ages: np.ndarray,
    columns: List[str],
    method: str = "LOWESS",
) -> pd.DataFrame:
    """Smooth both latitude groups and take their difference.

    For each entry of ``columns`` the low & mid latitude series and the high
    latitude series are resampled onto ``output_ages`` with ``lowess_run``,
    and the gradient is computed as (low & mid) minus (high).

    Args:
        low_and_mid: Low and mid latitude samples.
        high: High latitude samples.
        smoothing_factor: Forwarded unchanged to ``lowess_run``.
        output_ages: Ages at which the smoothed series are evaluated.
        columns: Names of the columns to process; each must exist in both frames.
        method: Smoothing method forwarded to ``lowess_run``. It is also used,
            exactly as given, to label the smoothed output columns.

    Returns:
        A frame with three columns per processed column:
        "<col> Low Mid <method>", "<col> High <method>" and "<col> Gradient".

    Raises:
        ValueError: from ``lowess_run`` for an unknown ``method``, and from
            ``pd.concat`` when ``columns`` is empty.
    """
    per_column = []
    for col in columns:
        # get lowess for low and mid latitudes
        low_and_mid_lowess = lowess_run(
            low_and_mid[col], smoothing_factor, output_ages, method=method
        )
        # get lowess for high latitudes
        high_lowess = lowess_run(
            high[col], smoothing_factor, output_ages, method=method
        )
        # calculate gradients
        gradient = low_and_mid_lowess - high_lowess
        # Label with the method that was actually applied rather than the
        # module-level option, so the labels stay correct when they differ.
        col_df = pd.DataFrame(
            data={
                f"{col} Low Mid {method}": low_and_mid_lowess,
                f"{col} High {method}": high_lowess,
                f"{col} Gradient": gradient,
            }
        )
        per_column.append(col_df)
    return pd.concat(per_column, axis=1)


# Gradients for the non filtered data first, then for the filtered data; both
# runs use the same smoothing options.
original_gradients, filtered_gradients = [
    lowess_gradient(
        low_mid_points,
        high_points,
        SMOOTHING_FACTOR,
        OUTPUT_AGES,
        COLUMNS,
        method=SMOOTHING_METHOD,
    )
    for low_mid_points, high_points in (
        (original_low_and_mid, original_high),
        (filtered_low_and_mid, filtered_high),
    )
]


2022-07-15 17:40:44.772 | INFO     | __main__:lowess_run:13 - Using LOESS with external function
2022-07-15 17:40:44.775 | INFO     | paleos.interpolate:loess_ext_interpolate:312 - Series size 2733, smoothing factor 0.02, points = 54
2022-07-15 17:40:44.889 | INFO     | __main__:lowess_run:13 - Using LOESS with external function
2022-07-15 17:40:44.890 | INFO     | paleos.interpolate:loess_ext_interpolate:312 - Series size 1013, smoothing factor 0.02, points = 20
2022-07-15 17:40:44.991 | INFO     | __main__:lowess_run:13 - Using LOESS with external function
2022-07-15 17:40:44.993 | INFO     | paleos.interpolate:loess_ext_interpolate:312 - Series size 2366, smoothing factor 0.02, points = 47
2022-07-15 17:40:45.117 | INFO     | __main__:lowess_run:13 - Using LOESS with external function
2022-07-15 17:40:45.118 | INFO     | paleos.interpolate:loess_ext_interpolate:312 - Series size 780, smoothing factor 0.02, points = 15


Depending on the options selected, perform some post processing. This might involve:

- Setting some ages to NaN. This is useful in places where there are very sparse or no data points
- Clipping gradients to a minimum and maximum value

In [9]:
def set_ages_to_nan(
    gradients: pd.DataFrame,
    ages_to_nan: List[List[float]],
    columns: Optional[List[str]] = None,
) -> pd.DataFrame:
    """Blank out the gradient values inside the given age intervals.

    Args:
        gradients: Frame indexed by age that holds the "<col> Gradient"
            columns. It is modified in place.
        ages_to_nan: ``[start, end]`` pairs. Each pair is applied as a label
            slice on the index, so rows from ``start`` through ``end`` are set
            to NaN.
        columns: Source column names whose "<col> Gradient" column should be
            blanked. Defaults to the module-level ``COLUMNS`` option.

    Returns:
        The same ``gradients`` object that was passed in. Only the gradient
        columns are touched; the smoothed input columns keep their values.
    """
    if columns is None:
        columns = COLUMNS
    logger.info(f"Setting following ages to NaN {ages_to_nan}")
    for col in columns:
        for nan_section in ages_to_nan:
            gradients.loc[nan_section[0] : nan_section[1], f"{col} Gradient"] = np.nan
    return gradients


# Optional post processing, controlled by the options at the top of the notebook.
if NAN_AGES:
    logger.info("Setting some ages to NaN")
    original_gradients, filtered_gradients = [
        set_ages_to_nan(frame, AGES_TO_NAN)
        for frame in (original_gradients, filtered_gradients)
    ]

if CLIP_GRADIENTS:
    logger.info(f"Clipping gradients between {MIN_GRADIENT} and {MAX_GRADIENT}")
    for col in COLUMNS:
        gradient_col = f"{col} Gradient"
        # same limits for the unfiltered and the filtered gradients
        for frame in (original_gradients, filtered_gradients):
            frame[gradient_col] = frame[gradient_col].clip(
                lower=MIN_GRADIENT, upper=MAX_GRADIENT
            )


2022-07-15 17:40:45.295 | INFO     | __main__:<cell line: 10>:11 - Setting some ages to NaN
2022-07-15 17:40:45.297 | INFO     | __main__:set_ages_to_nan:3 - Setting following ages to NaN [[0.2, 4.4], [17.6, 22.5], [30.9, 31.7]]
2022-07-15 17:40:45.301 | INFO     | __main__:set_ages_to_nan:3 - Setting following ages to NaN [[0.2, 4.4], [17.6, 22.5], [30.9, 31.7]]


Plot the final calculated gradients

In [10]:
def plot_gradients(
    fig,
    df: pd.DataFrame,
    COLUMNS: List[str],
    mode: str = "lines+markers",
):
    """Add one scatter trace per "<column> Gradient" column of ``df``.

    Each trace uses the frame index for x, the gradient values for y, and is
    named after the source column. Returns the ``fig`` that was passed in.
    """
    for source_col in COLUMNS:
        gradient_series = df[f"{source_col} Gradient"]
        trace = go.Scatter(
            x=gradient_series.index,
            y=gradient_series,
            mode=mode,
            name=source_col,
        )
        fig.add_trace(trace)
    return fig


# One gradient figure for the data with no filtering, then one for the
# filtered data; the layout is identical apart from the title.
for figure_title, gradient_frame in (
    ("Data with no index based filtering", original_gradients),
    ("Data filtered on various indices", filtered_gradients),
):
    fig = plot_gradients(go.Figure(), gradient_frame, COLUMNS)
    fig.update_layout(
        title_text=figure_title,
        height=400,
        margin=go.layout.Margin(l=80, r=60, b=60, t=60),
        xaxis_title="Age (Ma)",
        yaxis=dict(title=f"SST gradient (C) {SMOOTHING_METHOD}", range=PLOT_RANGE),
    )
    fig.show()


Save the gradient data, together with the low & mid latitude and high latitude data points it was calculated from

In [11]:
# Unfiltered data: the low & mid latitude points, the high latitude points
# (processed columns only) and the resulting gradients.
original_low_and_mid[COLUMNS].to_csv(
    output_path / f"SST_{SMOOTHING_METHOD}_all_mid_and_low.csv"
)
original_high[COLUMNS].to_csv(output_path / f"SST_{SMOOTHING_METHOD}_all_high.csv")
original_gradients.to_csv(output_path / f"SST_{SMOOTHING_METHOD}_all_gradients.csv")

# Filtered data: same three files, with the string returned by the filtering
# step ("postpend") in the file name instead of "_all".
filtered_low_and_mid[COLUMNS].to_csv(
    output_path / f"SST_{SMOOTHING_METHOD}{postpend}_mid_and_low.csv"
)
filtered_high[COLUMNS].to_csv(output_path / f"SST_{SMOOTHING_METHOD}{postpend}_high.csv")
filtered_gradients.to_csv(output_path / f"SST_{SMOOTHING_METHOD}{postpend}_gradients.csv")
