Merge pull request #73 from abstractqqq/use_pool
abstractqqq authored Feb 9, 2024
2 parents 82f2760 + b394878 commit 7c97165
Showing 28 changed files with 1,807 additions and 1,082 deletions.
60 changes: 11 additions & 49 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
Expand Up @@ -25,7 +25,7 @@ realfft = "3.3.0"
rapidfuzz = "0.5.0"
inflections = "1.1.1"
kdtree = {git = "https://github.com/mrhooray/kdtree-rs.git"}
pathfinding = "4.8.2"
petgraph = "0.6.4"
ordered-float = "4.2.0"

[target.'cfg(any(not(target_os = "linux"), use_mimalloc))'.dependencies]
Expand Down
2 changes: 2 additions & 0 deletions README.md
Expand Up @@ -226,6 +226,8 @@ shape: (5, 3)

This package is not tested with Polars streaming mode and is not designed for data so big that it has to be streamed. The recommended usage is for datasets of 1k to 2-3 million rows; performance is only a priority for datasets in this range.

Str-knn and graph queries are suitable for smaller data, roughly 1-5k rows on common hardware.

# Credits

1. Rust Snowball Stemmer is taken from Tsoding's Seroost project (MIT). See [here](https://github.com/tsoding/seroost)
Expand Down
505 changes: 276 additions & 229 deletions examples/basics.ipynb

Large diffs are not rendered by default.

44 changes: 34 additions & 10 deletions python/polars_ds/graph.py
Expand Up @@ -11,9 +11,15 @@ class GraphExt:
"""
This class contains tools for working with graphs inside a dataframe. Graphs are represented by two columns:
one node column (index, u64), which is usually implicit, and one edge column (list[u64]). Most algorithms
here implicitly assume the nodes are indexed by row number starting from 0. For example, if the value of the
edge column on row 5 is [1, 6, 13], then there are connections from node 5 to nodes 1, 6, and 13.
This module will only focus on undirected graphs for now. Also note that this module is very slow and transient,
meaning that each query creates a graph and uses it only for the duration of that query; the graph is not
persisted. Running multiple graph methods will therefore be very slow. This module mainly provides
convenience. As a result, many graph algorithms will not be implemented and won't have the most versatile API.
To be decided: a separate, dedicated Graph module might be appropriate.
Polars Namespace: graph
Expand All @@ -25,10 +31,31 @@ def __init__(self, expr: pl.Expr):

def deg(self) -> pl.Expr:
"""
Computes the degree of each node. This treats self as a column of "edges" and considers the graph undirected.
Note that this will not sort the nodes for the user. It assumes that nodes are indexed by the natural
numbers: 0, 1, ... If the nodes are not sorted, or if the u64 values in the edge lists do not refer to
node indices, the result may be incorrect or an error may be thrown.
"""
return self._expr.register_plugin(
lib=_lib,
symbol="pl_graph_deg",
is_elementwise=True,
)

def in_out_deg(self) -> pl.Expr:
"""
Computes the in and out degree of each node. This treats self as a column of "edges" and considers the graph directed.
Note that this will not sort the nodes for the user. It assumes that nodes are indexed by the natural
numbers: 0, 1, ... If the nodes are not sorted, or if the u64 values in the edge lists do not refer to
node indices, the result may be incorrect or an error may be thrown.
"""
return self._expr.register_plugin(
lib=_lib,
symbol="pl_graph_in_out_deg",
is_elementwise=True,
)
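To illustrate the semantics these expressions rely on (not the plugin internals, which are in Rust), here is a plain-Python sketch of the in/out degree computation over a row-indexed edge column, where the list index plays the role of the implicit node index:

```python
def in_out_deg(edges):
    """Compute (in_degree, out_degree) per node from a row-indexed edge column.

    edges[i] lists the nodes that node i points to, mirroring the
    list[u64] edge column the graph namespace expects.
    """
    n = len(edges)
    in_deg = [0] * n
    out_deg = [len(nbrs) for nbrs in edges]  # out-degree is just the list length
    for nbrs in edges:
        for j in nbrs:
            in_deg[j] += 1  # every edge (i, j) adds one to j's in-degree
    return list(zip(in_deg, out_deg))

# Row 0 points to 1 and 2; row 1 points to 2; row 2 points to no one.
print(in_out_deg([[1, 2], [2], []]))  # [(0, 2), (1, 1), (2, 0)]
```

Note how an out-of-range value in an edge list would raise an IndexError here, which mirrors the warning above about edge values that do not refer to node indices.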

def eigen_centrality(self, n_iter: int = 15, normalize: bool = True) -> pl.Expr:
"""
Expand Down Expand Up @@ -61,8 +88,6 @@ def shortest_path_const_cost(
Treats self as a column of "edges" and computes the shortest path to the target assuming that
the cost of traveling every edge is constant.
Self must be a column of list[u64].
Note that this will not sort the nodes for the user. This assumes that nodes are indexed by the
row numbers: 0, 1, ...
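Since every edge has the same cost, this reduces to breadth-first search. A plain-Python sketch under the same row-indexed layout (the plugin's actual return shape is not shown here and may differ):

```python
from collections import deque

def shortest_path_const_cost(edges, target, start):
    """BFS shortest path when every edge has the same cost. edges[i] lists
    the neighbors of node i (the row-indexed list[u64] layout described above).
    Returns the node path from start to target, or None if unreachable."""
    parent = {start: None}
    queue = deque([start])
    while queue:
        u = queue.popleft()
        if u == target:
            path = []
            while u is not None:  # walk parents back to start
                path.append(u)
                u = parent[u]
            return path[::-1]
        for v in edges[u]:
            if v not in parent:  # first visit is always via a shortest route
                parent[v] = u
                queue.append(v)
    return None

print(shortest_path_const_cost([[1, 2], [3], [3], []], target=3, start=0))  # [0, 1, 3]
```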
Expand Down Expand Up @@ -94,9 +119,8 @@ def shortest_path(
) -> pl.Expr:
"""
Treats self as a column of "edges" and computes the shortest path to the target using the
cost provided. Self must be a column of list[u64]. The graph is treated as directed, and the edge
(i, j) may have a different cost than (j, i), depending on the data.
Note that this will not sort the nodes for the user. This assumes that nodes are indexed by the
row numbers: 0, 1, ...
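With per-edge costs, the standard approach is Dijkstra's algorithm. A plain-Python sketch of the semantics, where a parallel cost column gives the cost of each directed edge (the plugin's actual algorithm and return shape are assumptions here):

```python
import heapq

def shortest_path(edges, costs, target, start):
    """Dijkstra over a row-indexed directed graph: edges[i] lists the
    neighbors of node i, and costs[i][k] is the cost of the directed
    edge i -> edges[i][k]. Returns the total cost, or None if unreachable."""
    dist = {start: 0.0}
    heap = [(0.0, start)]
    while heap:
        d, u = heapq.heappop(heap)
        if u == target:
            return d
        if d > dist.get(u, float("inf")):
            continue  # stale heap entry, a shorter route to u was found
        for v, c in zip(edges[u], costs[u]):
            nd = d + c
            if nd < dist.get(v, float("inf")):
                dist[v] = nd
                heapq.heappush(heap, (nd, v))
    return None

edges = [[1, 2], [2], [0]]
costs = [[1.0, 5.0], [1.0], [1.0]]
print(shortest_path(edges, costs, target=2, start=0))  # 2.0 (via 0 -> 1 -> 2)
```

The direct edge 0 -> 2 costs 5.0, but the two-hop route through node 1 costs 2.0, so the latter wins; the edge (2, 0) costing 1.0 plays no role, which shows the directed-cost asymmetry noted above.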
Expand Down
56 changes: 26 additions & 30 deletions python/polars_ds/str2.py
Expand Up @@ -177,9 +177,7 @@ def merge_infreq(
.otherwise(self._expr)
)

def fuzz(self, other: Union[str, pl.Expr], parallel: bool = False) -> pl.Expr:
"""
A string similarity based on Longest Common Subsequence.
Expand All @@ -189,8 +187,6 @@ def fuzz(
If this is a string, then the entire column will be compared with this string. If this
is an expression, then perform element-wise fuzz computation between this column
and the other (given by the expression).
parallel
Whether to run the comparisons in parallel. Note that this is not always faster, especially
when used with other expressions or in group_by/over context.
Expand All @@ -200,11 +196,10 @@ def fuzz(
else:
other_ = other

return self._expr.register_plugin(
lib=_lib,
symbol="pl_fuzz",
args=[other_, cutoff_, pl.lit(parallel, pl.Boolean)],
args=[other_, pl.lit(parallel, pl.Boolean)],
is_elementwise=True,
)
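As a sketch of the kind of similarity this computes, here is a plain-Python LCS-based score. The normalization by the longer string's length is an illustrative assumption; the Rust implementation (backed by rapidfuzz) may normalize differently:

```python
from functools import lru_cache

def lcs_fuzz(s1: str, s2: str) -> float:
    """Similarity in [0, 1] based on the Longest Common Subsequence.
    Normalizing by the longer length is a choice made for illustration."""
    @lru_cache(maxsize=None)
    def lcs(i: int, j: int) -> int:
        # Length of the LCS of s1[i:] and s2[j:]
        if i == len(s1) or j == len(s2):
            return 0
        if s1[i] == s2[j]:
            return 1 + lcs(i + 1, j + 1)
        return max(lcs(i + 1, j), lcs(i, j + 1))

    if not s1 and not s2:
        return 1.0
    return lcs(0, 0) / max(len(s1), len(s2))

print(round(lcs_fuzz("polars", "polar"), 4))  # 0.8333
```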

Expand Down Expand Up @@ -540,19 +535,28 @@ def hamming(
else:
other_ = other

if pad:
return self._expr.register_plugin(
lib=_lib,
symbol="pl_hamming_padded",
args=[other_, pl.lit(parallel, pl.Boolean)],
is_elementwise=True,
)
else:
return self._expr.register_plugin(
lib=_lib,
symbol="pl_hamming",
args=[other_, pl.lit(parallel, pl.Boolean)],
is_elementwise=True,
)
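The padded/unpadded split above can be illustrated with a plain-Python sketch; returning None for unequal lengths in the unpadded case is an assumption about the plugin's null semantics:

```python
def hamming(s1: str, s2: str, pad: bool = False):
    """Hamming distance between two strings, mirroring the padded/unpadded
    split above. Returns None for unequal lengths when pad is False."""
    if len(s1) != len(s2):
        if not pad:
            return None
        # Pad the shorter string so every extra character counts as a mismatch
        width = max(len(s1), len(s2))
        s1, s2 = s1.ljust(width), s2.ljust(width)
    return sum(a != b for a, b in zip(s1, s2))

print(hamming("karolin", "kathrin"))     # 3
print(hamming("abc", "abcd"))            # None
print(hamming("abc", "abcd", pad=True))  # 1
```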

def hamming_filter(
self, other: Union[str, pl.Expr], bound: int, pad: bool = False, parallel: bool = False
) -> pl.Expr:
"""
Returns whether the hamming distance between self and other is <= bound. This is
faster than computing the hamming distance and then filtering. Note that this does not pad
the strings; if the lengths of the two strings do not match, the row is filtered out.
Parameters
----------
Expand All @@ -562,8 +566,6 @@ def hamming_filter(
and the other (given by the expression) will be performed.
bound
Closed upper bound. If distance <= bound, return true and false otherwise.
parallel
Whether to run the comparisons in parallel. Note that this is not always faster, especially
when used with other expressions or in group_by/over context.
Expand All @@ -579,7 +581,6 @@ def hamming_filter(
args=[
other_,
pl.lit(bound, dtype=pl.UInt32),
pl.lit(parallel, pl.Boolean),
],
is_elementwise=True,
Expand Down Expand Up @@ -721,9 +722,9 @@ def tokenize(self, pattern: str = r"(?u)\b\w\w+\b", stem: bool = False) -> pl.Ex
.register_plugin(
lib=_lib,
symbol="pl_snowball_stem",
args=[pl.lit(True, pl.Boolean)],
is_elementwise=True,
) # True to no stop word
.drop_nulls()
)
return out
Expand Down Expand Up @@ -758,22 +759,21 @@ def freq_removal(

return self._expr.list.set_difference(remove)

def snowball(self, no_stopwords: bool = True) -> pl.Expr:
"""
Applies the snowball stemmer to the column. The column is expected to contain single words.
Numbers will be stemmed to the empty string.
Parameters
----------
no_stopwords
If true, stopwords will be mapped to the empty string. If false, stopwords will remain. Removing
stopwords may impact performance.
"""
return self._expr.register_plugin(
lib=_lib,
symbol="pl_snowball_stem",
args=[pl.lit(no_stopwords, pl.Boolean)],
is_elementwise=True,
)

Expand All @@ -782,7 +782,6 @@ def to_camel_case(self) -> pl.Expr:
return self._expr.register_plugin(
lib=_lib,
symbol="pl_to_camel",
is_elementwise=True,
)

Expand All @@ -791,7 +790,6 @@ def to_snake_case(self) -> pl.Expr:
return self._expr.register_plugin(
lib=_lib,
symbol="pl_to_snake",
is_elementwise=True,
)

Expand All @@ -800,7 +798,6 @@ def to_pascal_case(self) -> pl.Expr:
return self._expr.register_plugin(
lib=_lib,
symbol="pl_to_pascal",
is_elementwise=True,
)

Expand All @@ -809,6 +806,5 @@ def to_constant_case(self) -> pl.Expr:
return self._expr.register_plugin(
lib=_lib,
symbol="pl_to_constant",
is_elementwise=True,
)