## Finding the mixed layer depth with Numba


In [13]:
from pathlib import Path

import pandas as pd
import numpy as np
import numba

import altair as alt

from IPython.display import Markdown, display
%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


Create a function for markdown-style print formatting

In [14]:
def printmd(string):
    display(Markdown(string))

## Load the ARGO profile data

In [15]:
dataPath = Path("../data/raw/")
interimDataPath = Path("../data/interim/")

def loadData(duplicateN:int=1):
    df = pd.read_csv(dataPath / "atlanticInterpolated.csv")
    z = df.z.values
    dropColumns = []
    for col in df.columns:
        if df.loc[30, col] > df.loc[0, col] - 0.1:
            dropColumns.append(col)
    df = df.drop(columns=dropColumns)
    df = pd.concat([df for _ in range(duplicateN)], axis=1)
    temps = df.iloc[:, 1:].values
    temps = np.ascontiguousarray(temps)
    surfaceTemps = temps[:2, :].mean(axis=0)
    return df, temps, z, surfaceTemps

In [16]:
print("Load data")
df, temps, z, surfaceTemps = loadData(duplicateN=300)
print(f"Data loaded with shape: {df.shape}")

Load data
Data loaded with shape: (199, 744900)


In [17]:
df.head()

Unnamed: 0,z,0:June:2020,1:June:2020,2:July:2020,3:June:2021,4:June:2021,5:June:2021,8:May:2005,9:August:2016,10:June:2002,...,3915:August:2020,3916:August:2020,3917:September:2020,3918:September:2020,3919:September:2020,3920:October:2020,3921:October:2020,3922:October:2020,3923:November:2020,3924:November:2020
0,-3.0,15.655,14.549,15.049,12.237,14.069,14.178,13.578,14.729,14.81,...,18.712999,17.341,17.504,18.089001,16.77,15.748,15.24,14.937,14.277,14.013
1,-6.0,15.655,14.549,15.049,12.237,14.069,14.178,13.578,14.729,14.81,...,18.712999,17.341,17.504,18.089126,16.77,15.747917,15.24,14.937,14.277,14.013
2,-9.0,15.655,14.548529,15.049387,12.237,14.069,14.178,13.578,14.729,14.81,...,18.637727,17.3406,17.507999,17.936714,16.763,15.748,15.240636,14.936455,14.2748,14.0149
3,-12.0,15.655,14.545,15.053,12.237,14.068902,14.178,13.578,14.729,14.801539,...,18.509321,17.339786,17.4924,17.435621,16.769793,15.748645,15.24356,14.935667,14.278457,14.015333
4,-15.0,15.526556,14.53949,15.053,12.22939,14.065961,14.166511,13.578,14.729,14.788846,...,18.273146,17.340461,17.431673,17.029636,16.772415,15.749808,15.246787,14.936775,14.28,14.015583


## Define the Mixed Layer Depth temperature criterion

In [18]:
thresholdTemperatureDifference = 0.1

### Create a `numpy` function

In [19]:
def numpyNonZeroColumnWise(
    temps: np.ndarray,
    surfaceTemps: np.ndarray,
    thresholdTemperatureDifference: float,
    z: np.ndarray,
):  
    # Check if temperature satisifies MLD condition
    condition = temps < (surfaceTemps - thresholdTemperatureDifference)
    # Loop through profiles to find index of first depth to meet the MLD criterion
    mldIndex = np.array(
        [condition[:, colIdx].nonzero()[0][0] for colIdx in range(temps.shape[1])]
    )
    # Get the ML Depth
    mldDepth = np.array([z[idx] for idx in mldIndex])[:, np.newaxis]
    # Add an axis onto the ML Depth
    mldDepth = z[mldIndex][:, np.newaxis]
    # Get the MLD Temperature
    mldTemp = np.array([temps[idx, colIdx] for colIdx, idx in enumerate(mldIndex)])[
        :, np.newaxis
    ]
    return mldDepth, mldTemp

mldnumpyNonZeroColumnWise, numpymldTempNonZeroColumnWise = numpyNonZeroColumnWise(
    temps=temps, surfaceTemps=surfaceTemps, thresholdTemperatureDifference=0.1, z=z
)

## Lets make a visualisation to test if the calculation is working

In [20]:
def createVisualisationDataframes(df:pd.DataFrame,mlDepth:np.ndarray,mldTemp:np.ndarray):
    meltDf = pd.melt(
        pd.concat([
            df.iloc[:,[0]],
            df.iloc[:,1:].sample(10,axis=1).drop(columns=['z'],errors='ignore')],axis=1),id_vars=['z'])
    mldDf = pd.DataFrame(np.hstack([mlDepth,mldTemp]),index=df.columns[1:],columns=["mld","mldTemp"]).reset_index()
    return meltDf,mldDf
meltDf,mldDf = createVisualisationDataframes(df=df,mlDepth=mldnumpyNonZeroColumnWise,mldTemp=numpymldTempNonZeroColumnWise)

In [21]:
sampleQueries = meltDf.variable.sample(5)
tempLines = alt.Chart(meltDf.loc[meltDf.variable.isin(sampleQueries)],
                      title="Sample temperature profiles (lines) with predicted mixed layer depth (circles)",
                     height=400).mark_line().encode(
    x=alt.X('value:Q',title="Temperature"),
    y=alt.Y('z:Q',title="Depth (m)"),
    color=alt.Color('variable:N',title="Profile number and date"))
mldSpots = alt.Chart(mldDf.loc[mldDf.loc[:,'index'].isin(sampleQueries)]).mark_circle(size=100).encode(
    x="mldTemp:Q",
    y="mld:Q",
    color="index:N")
alt.layer(*(tempLines,mldSpots))

# Define the Numba functions

### Decorate the function
We tell `numba` which function to compile by wrapping it with the `@numba.njit()` decorator.

In this case there was no performance speedup by simply wrapping the `numpy` function above. Instead we had to re-write the function as an explicit loop

In [22]:
@numba.njit()
def numbaConditionLoop(
    temps: np.ndarray,
    surfaceTemps: np.ndarray,
    thresholdTemperatureDifference: float,
    z: np.ndarray,
): 
    # Pre-allocate the 1D output arrays
    mlDepth = np.empty_like(surfaceTemps)
    mldTemp = np.empty_like(surfaceTemps)
    # Loop through the column profiles
    for col in range(temps.shape[1]):
        row = 0
        temperature = temps[row, col]
        surfaceTemp = surfaceTemps[col]
        threshold = surfaceTemp - thresholdTemperatureDifference
        # Go down through the profile until the first depth that meets
        # the MLD criterion is found
        while (temperature > threshold) and row < temps.shape[0]:
            row += 1
            temperature = temps[row, col]
        mlDepth[col] = z[int(row)]
        mldTemp[col] = temps[int(row), col]
    return mlDepth, mldTemp
mldnumbaConditionLoop, mldTempnumbaConditionLoop = numbaConditionLoop(
    temps=temps, surfaceTemps=surfaceTemps, thresholdTemperatureDifference=0.1, z=z
)

### Numba-parallel function
We also generate a parallel `numba` function. In this case we specify `parallel=True` in the decorator and we use
`numba.prange` in the loop operation.

In [23]:
@numba.njit(parallel=True)
def numbaConditionLoopParallel(
    temps: np.ndarray,
    surfaceTemps: np.ndarray,
    thresholdTemperatureDifference: float,
    z: np.ndarray,
):
    mlDepth = np.empty((surfaceTemps.shape[0],1))
    mldTemp = np.empty((surfaceTemps.shape[0],1))
    for col in numba.prange(temps.shape[1]):
        row = 0
        temperature = temps[row, col]
        surfaceTemp = surfaceTemps[col]
        threshold = surfaceTemp - thresholdTemperatureDifference
        while (temperature > threshold) and row < temps.shape[0]:
            row += 1
            temperature = temps[row, col]
        mlDepth[col] = z[int(row)]
        mldTemp[col] = temps[int(row), col]
    return mlDepth, mldTemp
# Run the function to make sure it works
(
    mldnumbaConditionLoopParallel,
    mldTempnumbaConditionLoopParallel,
) = numbaConditionLoopParallel(
    temps=temps, surfaceTemps=surfaceTemps, thresholdTemperatureDifference=0.1, z=z
)
# Test that the outputs are the same as Numpy
np.testing.assert_array_almost_equal(
    mldnumpyNonZeroColumnWise, mldnumbaConditionLoopParallel
)


In [25]:
printmd("**Numpy**")
%timeit -n 1 -r 3 numpyNonZeroColumnWise(temps=temps,surfaceTemps=surfaceTemps,thresholdTemperatureDifference=0.1,z=z)
printmd("**Numba loop (serial)**")
%timeit -n 1 -r 3 numbaConditionLoop(temps=temps,surfaceTemps=surfaceTemps,thresholdTemperatureDifference=0.1,z=z)
printmd("**Numba loop (parallel)**")
%timeit -n 1 -r 3 numbaConditionLoopParallel(temps=temps,surfaceTemps=surfaceTemps,thresholdTemperatureDifference=0.1,z=z)

**Numpy**

1.77 s ± 146 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


**Numba loop (serial)**

28 ms ± 917 µs per loop (mean ± std. dev. of 3 runs, 1 loop each)


**Numba loop (parallel)**

17.9 ms ± 3.27 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
