Merge pull request #85 from abstractqqq/convolution

abstractqqq · Feb 26, 2024 · 33c03f6 · 33c03f6
2 parents 48c59bb + 8852548
commit 33c03f6
Show file tree

Hide file tree

Showing 12 changed files with 339 additions and 41 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "polars_ds"
-version = "0.3.2"
+version = "0.3.3"
 edition = "2021"
 
 [lib]

diff --git a/pyproject.toml b/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "maturin"
 [project]
 name = "polars_ds"
 requires-python = ">=3.8"
-version = "0.3.2"
+version = "0.3.3"
 
 license = {file = "LICENSE.txt"}
 classifiers = [

diff --git a/python/polars_ds/__init__.py b/python/polars_ds/__init__.py
@@ -9,7 +9,7 @@
 from polars_ds.metrics import MetricExt  # noqa: E402
 from polars_ds.graph import GraphExt  # noqa: E402
 
-__version__ = "0.3.2"
+__version__ = "0.3.3"
 __all__ = ["NumExt", "StrExt", "StatsExt", "ComplexExt", "MetricExt", "GraphExt"]
 
 

diff --git a/python/polars_ds/num.py b/python/polars_ds/num.py
@@ -2,7 +2,7 @@
 import math
 import polars as pl
 from typing import Union, Optional, List
-from .type_alias import DetrendMethod, Distance
+from .type_alias import DetrendMethod, Distance, ConvMode
 from polars.utils.udfs import _get_shared_lib_location
 
 _lib = _get_shared_lib_location(__file__)
@@ -1075,6 +1075,36 @@ def psi_discrete(
             returns_scalar=True,
         )
 
+    def convolve(
+        self,
+        other: Union[List[float], "np.ndarray", pl.Series],  # noqa: F821
+        mode: ConvMode = "full",
+    ) -> pl.Expr:
+        """
+        Performs a convolution with the filter via FFT. The current implementation's performance is worse
+        than SciPy but offers parallelization within Polars Context.
+
+        parameters
+        ----------
+        other
+            The filter for the convolution. Anything that can be turned into a Polars Series will work.
+        mode
+            Please check the reference. One of `same`, `left` (left-aligned same), `right` (right-aligned same),
+            `valid` or `full`.
+
+        Reference
+        ---------
+        https://brianmcfee.net/dstbook-site/content/ch03-convolution/Modes.html
+        """
+
+        filter_ = pl.Series(values=other, dtype=pl.Float64)
+        return self._expr.cast(pl.Float64).register_plugin(
+            lib=_lib,
+            symbol="pl_fft_convolve",
+            args=[filter_, pl.lit(mode, dtype=pl.String)],
+            changes_length=True,
+        )
+
     def _haversine(
         self,
         x_long: pl.Expr,

diff --git a/python/polars_ds/type_alias.py b/python/polars_ds/type_alias.py
@@ -12,3 +12,4 @@
 Alternative: TypeAlias = Literal["two-sided", "less", "greater"]
 ROCAUCStrategy: TypeAlias = Literal["macro", "weighted"]
 Distance: TypeAlias = Literal["l1", "l2", "inf", "h", "cosine", "haversine"]
+ConvMode: TypeAlias = Literal["same", "left", "right", "full", "valid"]
diff --git a/src/num/convolve.rs b/src/num/convolve.rs
@@ -0,0 +1,130 @@
+use itertools::Itertools;
+use polars::prelude::*;
+use pyo3_polars::derive::polars_expr;
+use realfft::RealFftPlanner;
+
+enum ConvMode {
+    FULL,
+    SAME,
+    LEFT,
+    RIGHT,
+    VALID,
+}
+
+impl From<&str> for ConvMode {
+    fn from(value: &str) -> Self {
+        match value.to_lowercase().as_ref() {
+            "full" => Self::FULL,
+            "same" => Self::SAME,
+            "left" => Self::LEFT,
+            "right" => Self::RIGHT,
+            "valid" => Self::VALID,
+            _ => Self::FULL,
+        }
+    }
+}
+
+// fn next_pow_2(n:usize) -> usize {
+//     let mut m:usize = 2;
+//     while m < n {
+//         m <<= 1;
+//     }
+//     m
+// }
+
+// Pad to 2^n size and make this faster?
+fn valid_fft_convolve(input: &[f64], filter: &[f64]) -> PolarsResult<Vec<f64>> {
+    let in_shape = input.len();
+    // let good_size = next_pow_2(in_shape);
+    // Prepare
+    let mut output_vec = vec![0.; in_shape];
+    output_vec[..in_shape].copy_from_slice(input);
+
+    let mut oth = vec![0.; in_shape];
+    oth[..filter.len()].copy_from_slice(filter);
+
+    // let n = output_vec.len() as f64;
+    let mut planner: RealFftPlanner<f64> = RealFftPlanner::new();
+    let r2c = planner.plan_fft_forward(in_shape);
+    let c2r = planner.plan_fft_inverse(in_shape);
+    let mut spec_p = r2c.make_output_vec();
+    let mut spec_q = r2c.make_output_vec();
+    // Forward FFT on the inputs
+    let _ = r2c.process(&mut output_vec, &mut spec_p);
+    // .map_err(|e| PolarsError::ComputeError(e.to_string().into()))?;
+    let _ = r2c.process(&mut oth, &mut spec_q);
+    // .map_err(|e| PolarsError::ComputeError(e.to_string().into()))?;
+
+    // After forward FFT, multiply in place in spec_p.
+    for (z1, z2) in spec_p.iter_mut().zip(spec_q.into_iter()) {
+        *z1 = *z1 * z2;
+    }
+    // Inverse FFT
+    let _ = c2r.process(&mut spec_p, &mut output_vec);
+    // .map_err(|e| PolarsError::ComputeError(e.to_string().into()))?;
+
+    // output_vec.truncate(in_shape);
+    Ok(output_vec)
+}
+
+fn fft_convolve(input: &[f64], filter: &[f64], mode: ConvMode) -> PolarsResult<Vec<f64>> {
+    match mode {
+        ConvMode::FULL => {
+            let t = filter.len() - 1;
+            let mut padded_input = vec![0.; input.len() + 2 * t];
+            let from_to = t..(t + input.len());
+            padded_input[from_to].copy_from_slice(input);
+            fft_convolve(&padded_input, filter, ConvMode::VALID)
+        }
+        ConvMode::SAME => {
+            let skip = (filter.len() - 1) / 2;
+            let out = fft_convolve(input, filter, ConvMode::FULL)?;
+            Ok(out.into_iter().skip(skip).take(input.len()).collect())
+        }
+        ConvMode::LEFT => {
+            let n = input.len();
+            let mut out = fft_convolve(input, filter, ConvMode::FULL)?;
+            out.truncate(n);
+            Ok(out)
+        }
+        ConvMode::RIGHT => {
+            let out = fft_convolve(input, filter, ConvMode::FULL)?;
+            Ok(out.into_iter().skip(filter.len() - 1).collect())
+        }
+        ConvMode::VALID => {
+            let out = valid_fft_convolve(input, filter)?;
+            let n = out.len() as f64;
+            Ok(out
+                .into_iter()
+                .skip(filter.len() - 1)
+                .map(|x| x / n)
+                .collect())
+        }
+    }
+}
+
+#[polars_expr(output_type=Float64)]
+fn pl_fft_convolve(inputs: &[Series]) -> PolarsResult<Series> {
+    let s1 = inputs[0].f64()?;
+    let s2 = inputs[1].f64()?;
+    let mode = inputs[2].str()?;
+    let mode = mode.get(0).unwrap_or("full");
+    let mode: ConvMode = mode.into();
+
+    if s1.len() < s2.len() || s2.len() < 2 {
+        return Err(PolarsError::ComputeError(
+            "Convolution: The filter should have smaller length than the input column, and filter should have length >= 2.".into(),
+        ));
+    }
+
+    let input = s1.rechunk();
+    let input = input.cont_slice().unwrap();
+
+    let other = s2.rechunk();
+    let other = other.cont_slice().unwrap();
+
+    let out = fft_convolve(input, other, mode)?;
+
+    let ca = Float64Chunked::from_slice(s1.name(), &out);
+    Ok(ca.into_series())
+}
diff --git a/src/num/mod.rs b/src/num/mod.rs
@@ -3,7 +3,7 @@ use num::Float;
 use polars::error::{PolarsError, PolarsResult};
 
 mod cond_entropy;
-mod target_encode;
+mod convolve;
 mod entrophies;
 mod fft;
 mod float_extras;
@@ -14,6 +14,7 @@ mod knn;
 mod lempel_ziv;
 mod ols;
 mod psi;
+mod target_encode;
 mod tp_fp;
 mod trapz;
 mod woe_iv;

diff --git a/src/num/target_encode.rs b/src/num/target_encode.rs
@@ -17,36 +17,38 @@ pub(crate) struct TargetEncodeKwargs {
 
 #[inline(always)]
 fn get_target_encode_frame(
-    discrete_col:&Series, 
-    target:&Series,
-    target_mean:f64,
-    min_samples_leaf:f64, 
-    smoothing:f64
+    discrete_col: &Series,
+    target: &Series,
+    target_mean: f64,
+    min_samples_leaf: f64,
+    smoothing: f64,
 ) -> PolarsResult<LazyFrame> {
-
     let df = df!(
         "values" => discrete_col.cast(&DataType::String)?,
         "target" => target
     )?;
 
-    Ok(
-        df.lazy().group_by([col("values")]).agg([
-            len().alias("cnt"),
-            col("target").mean().alias("cond_p")
-        ]).with_column(
-            (lit(1f64) 
-            / (lit(1f64) + ((-(col("cnt").cast(DataType::Float64) - lit(min_samples_leaf))/lit(smoothing)).exp()))).alias("alpha")
-        ).select([
+    Ok(df
+        .lazy()
+        .group_by([col("values")])
+        .agg([len().alias("cnt"), col("target").mean().alias("cond_p")])
+        .with_column(
+            (lit(1f64)
+                / (lit(1f64)
+                    + ((-(col("cnt").cast(DataType::Float64) - lit(min_samples_leaf))
+                        / lit(smoothing))
+                    .exp())))
+            .alias("alpha"),
+        )
+        .select([
             col("values"),
-            (col("alpha") * col("cond_p") + (lit(1f64) - col("alpha")) * lit(target_mean)).alias("to")
-        ])
-    )
-
+            (col("alpha") * col("cond_p") + (lit(1f64) - col("alpha")) * lit(target_mean))
+                .alias("to"),
+        ]))
 }
 
 #[polars_expr(output_type_func=target_encode_output)]
 fn pl_target_encode(inputs: &[Series], kwargs: TargetEncodeKwargs) -> PolarsResult<Series> {
-
     // Inputs[0] and inputs[1] are the string column and the target respectively
 
     let target_mean = inputs[2].f64()?;
@@ -60,9 +62,9 @@ fn pl_target_encode(inputs: &[Series], kwargs: TargetEncodeKwargs) -> PolarsResu
         &inputs[1],
         target_mean,
         min_samples_leaf,
-        smoothing
-    )?.collect()?;
+        smoothing,
+    )?
+    .collect()?;
 
     Ok(encoding_frame.into_struct("target_encoded").into_series())
-
-}
+}
diff --git a/src/num/woe_iv.rs b/src/num/woe_iv.rs
@@ -18,10 +18,7 @@ fn iv_output(_: &[Field]) -> PolarsResult<Field> {
 /// Get a lazyframe needed to compute WOE.
 /// Inputs[0] by default is the discrete bins / categories
 /// Inputs[1] by default is the target (0s and 1s)
-fn get_woe_frame(
-    discrete_col:&Series,
-    target:&Series
-) -> PolarsResult<LazyFrame> {
+fn get_woe_frame(discrete_col: &Series, target: &Series) -> PolarsResult<LazyFrame> {
     // let categories = &inputs[1].cast(&DataType::String)?;
     let df = df!(
         "values" => discrete_col.cast(&DataType::String)?,