Merge pull request #182 from abstractqqq/time_series_features
abstractqqq authored Jun 13, 2024
2 parents aaeb89e + 252046d commit 5245d18
Showing 5 changed files with 189 additions and 206 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -19,7 +19,7 @@ authors = [
{name = "Tianren Qin", email = "tq9695@gmail.com"},
]
dependencies = [
"polars >= 0.20.6, !=0.20.12",
"polars >= 0.20.6, !=0.20.12, <1.0",
'typing-extensions; python_version <= "3.11"',
]

193 changes: 0 additions & 193 deletions python/polars_ds/num.py
@@ -56,199 +56,6 @@
]


# @pl.api.register_expr_namespace("num")
# class NumExt:
# """
# This class contains tools for dealing with well-known numerical operations and other metrics inside a Polars DataFrame.
# All the metrics/losses provided here are meant for use in cases like evaluating models outside training,
# not for actual use in ML models.

# Polars Namespace: num

# Example: pl.col("a").num.range_over_mean()
# """

# def __init__(self, expr: pl.Expr):
# self._expr: pl.Expr = expr

# def std_err(self, ddof: int = 1) -> pl.Expr:
# """
# Estimates the standard error for the mean of the expression.
# """
# return self._expr.std(ddof=ddof) / self._expr.count().sqrt()

# def std_over_range(self, ddof: int = 1) -> pl.Expr:
# """
# Computes the standard deviation over the range.
# """
# return self._expr.std(ddof=ddof) / (self._expr.max() - self._expr.min())

# def rms(self) -> pl.Expr:
# """
# Returns the root mean square of the expression.
# """
# return (self._expr.dot(self._expr) / self._expr.count()).sqrt()

# def cv(self, ddof: int = 1) -> pl.Expr:
# """
# Returns the coefficient of variation of the expression
# """
# return self._expr.std(ddof=ddof) / self._expr.mean()

# def yeo_johnson(self, lam: float) -> pl.Expr:
# """
# Performs the Yeo Johnson transform with parameter lambda.

# Unfortunately, the package does not provide an estimate for lambda as of now.

# Parameters
# ----------
# lam
# The lambda in Yeo Johnson transform

# Reference
# ---------
# https://en.wikipedia.org/wiki/Power_transform
# """
# x = self._expr

# if lam == 0: # log(x + 1)
# x_ge = x.log1p()
# else: # ((x + 1)**lmbda - 1) / lmbda
# x_ge = ((1 + x).pow(lam) - 1) / lam

# if lam == 2: # -log(-x + 1)
# x_lt = pl.lit(-1) * (-x).log1p()
# else: # -((-x + 1)**(2 - lmbda) - 1) / (2 - lmbda)
# t = 2 - lam
# x_lt = -((1 - x).pow(t) - 1) / t

# return pl.when(x >= 0.0).then(x_ge).otherwise(x_lt)

# def box_cox(self, lam: float, lam2: float = 0.0) -> pl.Expr:
# """
# Performs the two-parameter Box Cox transform with parameters lam and lam2. This
# transform is only valid for values >= -lam2. Every other value will be mapped to None.

# Unfortunately, the package does not provide an estimate for lambda as of now.

# Parameters
# ----------
# lam
# The first lambda in Box Cox transform
# lam2
# The second lambda in Box Cox transform

# Reference
# ---------
# https://en.wikipedia.org/wiki/Power_transform
# """
# if lam2 == 0.0:
# x = self._expr
# cond = self._expr > 0
# else:
# x = self._expr + lam2
# cond = self._expr > -lam2

# if lam == 0.0:
# return pl.when(cond).then(x.log()).otherwise(None)
# else:
# return pl.when(cond).then((x.pow(lam) - 1) / lam).otherwise(None)

# def max_abs(self) -> pl.Expr:
# """
# Returns the maximum of absolute values of self.
# """
# return pl.max_horizontal(self._expr.max().abs(), self._expr.min().abs())

# def n_bins(self, n: int) -> pl.Expr:
# """
# Maps values in this series into n bins, with each bin having equal size. This ensures that
# the bins' ranges are the same, unlike quantiles. This may have tiny numerical errors but
# should be tolerable.

# Parameters
# ----------
# n
# Any positive integer
# """
# if n <= 0:
# raise ValueError("Input `n` must be positive.")

# x = self._expr
# return (
# (x - x.min()).floordiv(pl.lit(1e-12) + (x.max() - x.min()) / pl.lit(n)).cast(pl.UInt32)
# )

# def count_max(self) -> pl.Expr:
# """
# Count the number of occurrences of max.
# """
# return (self._expr == self._expr.max()).sum()

# def count_min(self) -> pl.Expr:
# """
# Count the number of occurrences of min.
# """
# return (self._expr == self._expr.min()).sum()

# def is_equidistant(self, tol: float = 1e-6) -> pl.Expr:
# """
# Checks if a column has equal distance between consecutive values.

# Parameters
# ----------
# tol
# Tolerance. If all consecutive differences are smaller than or equal to (<=) this, then true.
# """
# return (self._expr.diff(null_behavior="drop").abs() <= tol).all()

# def rel_entropy(self, other: pl.Expr) -> pl.Expr:
# """
# Computes relative entropy between self and other. (self = x, other = y).

# Parameters
# ----------
# other
# A Polars expression

# Reference
# ---------
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.rel_entr.html
# """
# return (
# pl.when((self._expr > 0) & (other > 0))
# .then(self._expr * (self._expr / other).log())
# .when((self._expr == 0) & (other >= 0))
# .then(pl.lit(0.0, dtype=pl.Float64))
# .otherwise(pl.lit(float("inf"), dtype=pl.Float64))
# )

# def kl_div(self, other: pl.Expr) -> pl.Expr:
# """
# Computes Kullback-Leibler divergence between self and other. (self = x, other = y).

# Parameters
# ----------
# other
# A Polars expression

# Reference
# ---------
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.kl_div.html
# """
# return (
# pl.when((self._expr > 0) & (other > 0))
# .then(self._expr * (self._expr / other).log() - self._expr + other)
# .when((self._expr == 0) & (other >= 0))
# .then(other)
# .otherwise(pl.lit(float("inf"), dtype=pl.Float64))
# )


# ----------------------------------------------------------------------------------


def softmax(x: StrOrExpr) -> pl.Expr:
"""
Applies the softmax function to the column, which turns any real valued column into valid probability
121 changes: 121 additions & 0 deletions python/polars_ds/stats.py
@@ -16,6 +16,12 @@
_lib = _get_shared_lib_location(__file__)

__all__ = [
"query_cv",
"query_std_over_median",
"query_std_over_range",
"query_std_over_quantiles",
"query_longest_streak_above",
"query_longest_streak_below",
"query_ttest_ind",
"query_ttest_1samp",
"query_ttest_ind_from_stats",
@@ -319,6 +325,121 @@ def rand_str(
# -------------------------------------------------------------------------------------------------------


def query_cv(x: StrOrExpr, ddof: int = 1) -> pl.Expr:
    """
    Returns the coefficient of variation for the variable. This is a shorthand for std / mean.

    Parameters
    ----------
    x
        The variable
    ddof
        The delta degrees of freedom used in the std computation
    """
    xx = str_to_expr(x)
    return xx.std(ddof=ddof) / xx.mean()

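A minimal usage sketch (illustrative only, not part of the committed file; it assumes query_cv is importable from polars_ds.stats as laid out in this diff):

import polars as pl
from polars_ds.stats import query_cv

df = pl.DataFrame({"price": [10.0, 12.0, 11.5, 9.8, 10.7]})
df.select(query_cv("price"))  # std / mean, a unit-free measure of spread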

def query_std_over_median(x: StrOrExpr, ddof: int = 1) -> pl.Expr:
    """
    Standard deviation over the median of the variable. This is a shorthand for std / median.

    Parameters
    ----------
    x
        The variable
    ddof
        The delta degrees of freedom used in the std computation
    """
    xx = str_to_expr(x)
    return xx.std(ddof=ddof) / xx.median()


def query_std_over_range(x: StrOrExpr, ddof: int = 1) -> pl.Expr:
    """
    Standard deviation over the range of the variable. This is a shorthand for std / (max - min).

    Parameters
    ----------
    x
        The variable
    ddof
        The delta degrees of freedom used in the std computation
    """
    xx = str_to_expr(x)
    return xx.std(ddof=ddof) / (xx.max() - xx.min())

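A hedged sketch comparing the two dispersion shorthands above (illustrative only; the aliases are arbitrary and both functions are assumed importable from polars_ds.stats):

import polars as pl
from polars_ds.stats import query_std_over_median, query_std_over_range

df = pl.DataFrame({"price": [10.0, 12.0, 11.5, 9.8, 10.7]})
df.select(
    query_std_over_median("price").alias("std_over_median"),
    query_std_over_range("price").alias("std_over_range"),  # max - min makes this sensitive to single outliers
)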

def query_std_over_quantiles(
    x: StrOrExpr, ddof: int = 1, q1: float = 0.25, q2: float = 0.75
) -> pl.Expr:
    """
    A more robust version of std over range, where the range is replaced by the distance
    between quantiles q1 and q2.

    Parameters
    ----------
    x
        The variable
    ddof
        The delta degrees of freedom used in the std computation
    q1
        The lower quantile
    q2
        The higher quantile
    """
    if q1 >= 1.0 or q1 <= 0.0 or q2 >= 1.0 or q2 <= 0.0 or q1 >= q2:
        raise ValueError("The quantiles q1, q2 must be within (0, 1) and q2 must be > q1.")

    xx = str_to_expr(x)
    return xx.std(ddof=ddof) / (xx.quantile(q2) - xx.quantile(q1))

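A small sketch of the quantile-based variant (illustrative only; the toy data and aliases are assumptions, not from this diff):

import polars as pl
from polars_ds.stats import query_std_over_quantiles

df = pl.DataFrame({"x": [1.0, 2.0, 2.5, 3.0, 100.0]})  # one large outlier
df.select(
    query_std_over_quantiles("x").alias("std_over_iqr"),  # default q1=0.25, q2=0.75
    query_std_over_quantiles("x", q1=0.1, q2=0.9).alias("std_over_q10_q90"),
)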

def query_longest_streak_above(x: StrOrExpr, value: Union[float, pl.Expr]) -> pl.Expr:
    """
    Finds the longest streak of values above (>=) the given value.

    Parameters
    ----------
    x
        The variable
    value
        Either a float or a scalar Polars expression. The output will be wrong if value is not a scalar.
    """
    s: pl.Expr = str_to_expr(x)
    v: pl.Expr = pl.lit(value) if isinstance(value, float) else value
    y = (s >= v).rle()
    return (
        y.filter(y.struct.field("values"))
        .struct.field("lengths")
        .max()
        .fill_null(0)
        .alias("longest_streak_above")
    )

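A rough usage sketch with a hand-checkable toy series (illustrative only; assumes the import path polars_ds.stats):

import polars as pl
from polars_ds.stats import query_longest_streak_above

df = pl.DataFrame({"temp": [1.0, 3.0, 3.5, 2.0, 4.0, 4.1, 4.2, 0.5]})
# Values >= 3.0 form runs of length 2 (3.0, 3.5) and 3 (4.0, 4.1, 4.2),
# so the longest streak above 3.0 is 3.
df.select(query_longest_streak_above("temp", 3.0))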

def query_longest_streak_below(x: StrOrExpr, value: Union[float, pl.Expr]) -> pl.Expr:
    """
    Finds the longest streak of values below (<=) the given value.

    Parameters
    ----------
    x
        The variable
    value
        Either a float or a scalar Polars expression. The output will be wrong if value is not a scalar.
    """
    s: pl.Expr = str_to_expr(x)
    v: pl.Expr = pl.lit(value) if isinstance(value, float) else value
    y = (s <= v).rle()
    return (
        y.filter(y.struct.field("values"))
        .struct.field("lengths")
        .max()
        .fill_null(0)
        .alias("longest_streak_below")
    )

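And the mirror image for streaks below a threshold; the combined select and the column aliases below are only a sketch:

import polars as pl
from polars_ds.stats import query_longest_streak_above, query_longest_streak_below

df = pl.DataFrame({"temp": [1.0, 3.0, 3.5, 2.0, 4.0, 4.1, 4.2, 0.5]})
df.select(
    query_longest_streak_above("temp", 3.0).alias("hot_streak"),   # 3, as above
    query_longest_streak_below("temp", 2.0).alias("cold_streak"),  # 1: no two consecutive values are <= 2.0
)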

def query_first_digit_cnt(var: StrOrExpr) -> pl.Expr:
"""
Finds the first digit count in the data. This is closely related to Benford's law,