Merge pull request #182 from abstractqqq/time_series_features
abstractqqq authored Jun 13, 2024
2 parents aaeb89e + 252046d commit 5245d18
Showing 5 changed files with 189 additions and 206 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -19,7 +19,7 @@ authors = [
{name = "Tianren Qin", email = "tq9695@gmail.com"},
]
dependencies = [
"polars >= 0.20.6, !=0.20.12",
"polars >= 0.20.6, !=0.20.12, <1.0",
'typing-extensions; python_version <= "3.11"',
]

193 changes: 0 additions & 193 deletions python/polars_ds/num.py
@@ -56,199 +56,6 @@
]


# @pl.api.register_expr_namespace("num")
# class NumExt:
# """
# This class contains tools for dealing with well-known numerical operations and other metrics inside a Polars DataFrame.
# All the metrics/losses provided here are meant for use in cases like evaluating models outside training,
# not for actual use in ML models.

# Polars Namespace: num

# Example: pl.col("a").num.range_over_mean()
# """

# def __init__(self, expr: pl.Expr):
# self._expr: pl.Expr = expr

# def std_err(self, ddof: int = 1) -> pl.Expr:
# """
# Estimates the standard error for the mean of the expression.
# """
# return self._expr.std(ddof=ddof) / self._expr.count().sqrt()

# def std_over_range(self, ddof: int = 1) -> pl.Expr:
# """
# Computes the standard deviation over the range.
# """
# return self._expr.std(ddof=ddof) / (self._expr.max() - self._expr.min())

# def rms(self) -> pl.Expr:
# """
# Returns the root mean square of the expression.
# """
# return (self._expr.dot(self._expr) / self._expr.count()).sqrt()

# def cv(self, ddof: int = 1) -> pl.Expr:
# """
# Returns the coefficient of variation of the expression
# """
# return self._expr.std(ddof=ddof) / self._expr.mean()

# def yeo_johnson(self, lam: float) -> pl.Expr:
# """
# Performs the Yeo Johnson transform with parameter lambda.

# Unfortunately, the package does not provide an estimate for lambda as of now.

# Parameters
# ----------
# lam
# The lambda in Yeo Johnson transform

# Reference
# ---------
# https://en.wikipedia.org/wiki/Power_transform
# """
# x = self._expr

# if lam == 0: # log(x + 1)
# x_ge = x.log1p()
# else: # ((x + 1)**lmbda - 1) / lmbda
# x_ge = ((1 + x).pow(lam) - 1) / lam

# if lam == 2: # -log(-x + 1)
# x_lt = pl.lit(-1) * (-x).log1p()
# else: # -((-x + 1)**(2 - lmbda) - 1) / (2 - lmbda)
# t = 2 - lam
# x_lt = -((1 - x).pow(t) - 1) / t

# return pl.when(x >= 0.0).then(x_ge).otherwise(x_lt)

# def box_cox(self, lam: float, lam2: float = 0.0) -> pl.Expr:
# """
# Performs the two-parameter Box Cox transform with parameters lam and lam2. This
# transform is only valid for values >= -lam2. Every other value will be mapped to None.

# Unfortunately, the package does not provide an estimate for lambda as of now.

# Parameters
# ----------
# lam
# The first lambda in Box Cox transform
# lam2
# The second lambda in Box Cox transform

# Reference
# ---------
# https://en.wikipedia.org/wiki/Power_transform
# """
# if lam2 == 0.0:
# x = self._expr
# cond = self._expr > 0
# else:
# x = self._expr + lam2
# cond = self._expr > -lam2

# if lam == 0.0:
# return pl.when(cond).then(x.log()).otherwise(None)
# else:
# return pl.when(cond).then((x.pow(lam) - 1) / lam).otherwise(None)

# def max_abs(self) -> pl.Expr:
# """
# Returns the maximum of absolute values of self.
# """
# return pl.max_horizontal(self._expr.max().abs(), self._expr.min().abs())

# def n_bins(self, n: int) -> pl.Expr:
# """
# Maps values in this series into n bins, with each bin having equal size. This ensures that
# the bins' ranges are the same, unlike quantiles. This may have tiny numerical errors but
# should be tolerable.

# Parameters
# ----------
# n
# Any positive integer
# """
# if n <= 0:
# raise ValueError("Input `n` must be positive.")

# x = self._expr
# return (
# (x - x.min()).floordiv(pl.lit(1e-12) + (x.max() - x.min()) / pl.lit(n)).cast(pl.UInt32)
# )

# def count_max(self) -> pl.Expr:
# """
# Count the number of occurrences of max.
# """
# return (self._expr == self._expr.max()).sum()

# def count_min(self) -> pl.Expr:
# """
# Count the number of occurrences of min.
# """
# return (self._expr == self._expr.min()).sum()

# def is_equidistant(self, tol: float = 1e-6) -> pl.Expr:
# """
# Checks if a column has equal distance between consecutive values.

# Parameters
# ----------
# tol
# Tolerance. If all consecutive differences are smaller than or equal to (<=) this, then true.
# """
# return (self._expr.diff(null_behavior="drop").abs() <= tol).all()

# def rel_entropy(self, other: pl.Expr) -> pl.Expr:
# """
# Computes relative entropy between self and other. (self = x, other = y).

# Parameters
# ----------
# other
# A Polars expression

# Reference
# ---------
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.rel_entr.html
# """
# return (
# pl.when((self._expr > 0) & (other > 0))
# .then(self._expr * (self._expr / other).log())
# .when((self._expr == 0) & (other >= 0))
# .then(pl.lit(0.0, dtype=pl.Float64))
# .otherwise(pl.lit(float("inf"), dtype=pl.Float64))
# )

# def kl_div(self, other: pl.Expr) -> pl.Expr:
# """
# Computes Kullback-Leibler divergence between self and other. (self = x, other = y).

# Parameters
# ----------
# other
# A Polars expression

# Reference
# ---------
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.kl_div.html
# """
# return (
# pl.when((self._expr > 0) & (other > 0))
# .then(self._expr * (self._expr / other).log() - self._expr + other)
# .when((self._expr == 0) & (other >= 0))
# .then(other)
# .otherwise(pl.lit(float("inf"), dtype=pl.Float64))
# )


# ----------------------------------------------------------------------------------


def softmax(x: StrOrExpr) -> pl.Expr:
"""
Applies the softmax function to the column, which turns any real valued column into valid probability
121 changes: 121 additions & 0 deletions python/polars_ds/stats.py
@@ -16,6 +16,12 @@
_lib = _get_shared_lib_location(__file__)

__all__ = [
"query_cv",
"query_std_over_median",
"query_std_over_range",
"query_std_over_quantiles",
"query_longest_streak_above",
"query_longest_streak_below",
"query_ttest_ind",
"query_ttest_1samp",
"query_ttest_ind_from_stats",
@@ -319,6 +325,121 @@ def rand_str(
# -------------------------------------------------------------------------------------------------------


def query_cv(x: StrOrExpr, ddof: int = 1) -> pl.Expr:
    """
    Returns the coefficient of variation for the variable. This is a shorthand for std / mean.

    Parameters
    ----------
    x
        The variable
    ddof
        The delta degrees of freedom used in the std computation
    """
    xx = str_to_expr(x)
    return xx.std(ddof=ddof) / xx.mean()

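A minimal usage sketch (illustrative only, not part of the committed file; it assumes query_cv is importable from polars_ds.stats as laid out in this diff):

import polars as pl
from polars_ds.stats import query_cv

df = pl.DataFrame({"price": [10.0, 12.0, 11.5, 9.8, 10.7]})
df.select(query_cv("price"))  # std / mean, a unit-free measure of spread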

def query_std_over_median(x: StrOrExpr, ddof: int = 1) -> pl.Expr:
    """
    Standard deviation over the median of the variable. This is a shorthand for std / median.

    Parameters
    ----------
    x
        The variable
    ddof
        The delta degrees of freedom used in the std computation
    """
    xx = str_to_expr(x)
    return xx.std(ddof=ddof) / xx.median()


def query_std_over_range(x: StrOrExpr, ddof: int = 1) -> pl.Expr:
    """
    Standard deviation over the range of the variable. This is a shorthand for std / (max - min).

    Parameters
    ----------
    x
        The variable
    ddof
        The delta degrees of freedom used in the std computation
    """
    xx = str_to_expr(x)
    return xx.std(ddof=ddof) / (xx.max() - xx.min())

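A hedged sketch comparing the two dispersion shorthands above (illustrative only; the aliases are arbitrary and both functions are assumed importable from polars_ds.stats):

import polars as pl
from polars_ds.stats import query_std_over_median, query_std_over_range

df = pl.DataFrame({"price": [10.0, 12.0, 11.5, 9.8, 10.7]})
df.select(
    query_std_over_median("price").alias("std_over_median"),
    query_std_over_range("price").alias("std_over_range"),  # max - min makes this sensitive to single outliers
)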

def query_std_over_quantiles(
    x: StrOrExpr, ddof: int = 1, q1: float = 0.25, q2: float = 0.75
) -> pl.Expr:
    """
    A more robust version of std over range, where the range is replaced by the distance
    between quantiles q1 and q2.

    Parameters
    ----------
    x
        The variable
    ddof
        The delta degrees of freedom used in the std computation
    q1
        The lower quantile
    q2
        The higher quantile
    """
    if q1 >= 1.0 or q1 <= 0.0 or q2 >= 1.0 or q2 <= 0.0 or q1 >= q2:
        raise ValueError("The quantiles q1, q2 must be within (0, 1) and q2 must be > q1.")

    xx = str_to_expr(x)
    return xx.std(ddof=ddof) / (xx.quantile(q2) - xx.quantile(q1))

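A small sketch of the quantile-based variant (illustrative only; the toy data and aliases are assumptions, not from this diff):

import polars as pl
from polars_ds.stats import query_std_over_quantiles

df = pl.DataFrame({"x": [1.0, 2.0, 2.5, 3.0, 100.0]})  # one large outlier
df.select(
    query_std_over_quantiles("x").alias("std_over_iqr"),  # default q1=0.25, q2=0.75
    query_std_over_quantiles("x", q1=0.1, q2=0.9).alias("std_over_q10_q90"),
)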

def query_longest_streak_above(x: StrOrExpr, value: Union[float, pl.Expr]) -> pl.Expr:
    """
    Finds the longest streak of values above (>=) the given value.

    Parameters
    ----------
    x
        The variable
    value
        Either a float or a scalar Polars expression. The output will be wrong if value is not a scalar.
    """
    s: pl.Expr = str_to_expr(x)
    v: pl.Expr = pl.lit(value) if isinstance(value, float) else value
    y = (s >= v).rle()
    return (
        y.filter(y.struct.field("values"))
        .struct.field("lengths")
        .max()
        .fill_null(0)
        .alias("longest_streak_above")
    )

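A rough usage sketch with a hand-checkable toy series (illustrative only; assumes the import path polars_ds.stats):

import polars as pl
from polars_ds.stats import query_longest_streak_above

df = pl.DataFrame({"temp": [1.0, 3.0, 3.5, 2.0, 4.0, 4.1, 4.2, 0.5]})
# Values >= 3.0 form runs of length 2 (3.0, 3.5) and 3 (4.0, 4.1, 4.2),
# so the longest streak above 3.0 is 3.
df.select(query_longest_streak_above("temp", 3.0))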

def query_longest_streak_below(x: StrOrExpr, value: Union[float, pl.Expr]) -> pl.Expr:
    """
    Finds the longest streak of values below (<=) the given value.

    Parameters
    ----------
    x
        The variable
    value
        Either a float or a scalar Polars expression. The output will be wrong if value is not a scalar.
    """
    s: pl.Expr = str_to_expr(x)
    v: pl.Expr = pl.lit(value) if isinstance(value, float) else value
    y = (s <= v).rle()
    return (
        y.filter(y.struct.field("values"))
        .struct.field("lengths")
        .max()
        .fill_null(0)
        .alias("longest_streak_below")
    )

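And the mirror image for streaks below a threshold; the combined select and the column aliases below are only a sketch:

import polars as pl
from polars_ds.stats import query_longest_streak_above, query_longest_streak_below

df = pl.DataFrame({"temp": [1.0, 3.0, 3.5, 2.0, 4.0, 4.1, 4.2, 0.5]})
df.select(
    query_longest_streak_above("temp", 3.0).alias("hot_streak"),   # 3, as above
    query_longest_streak_below("temp", 2.0).alias("cold_streak"),  # 1: no two consecutive values are <= 2.0
)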

def query_first_digit_cnt(var: StrOrExpr) -> pl.Expr:
"""
Finds the first digit count in the data. This is closely related to Benford's law,