Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

2179 Restructure Directories #2187

Closed
wants to merge 33 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
d3b84a0
initial commit
dvreed77 Jul 19, 2022
7c7bbaa
added transform primitives
dvreed77 Jul 19, 2022
251f219
added binary transforms
dvreed77 Jul 19, 2022
e61c066
added cumulative transforms
dvreed77 Jul 19, 2022
d3adf29
added datetime transforms
dvreed77 Jul 19, 2022
7a933fc
added rolling
dvreed77 Jul 19, 2022
78bf8d2
fixed utils
dvreed77 Jul 19, 2022
a972a0c
moved cum transforms
dvreed77 Jul 19, 2022
bc72a8a
trying to reconcile imports
dvreed77 Jul 19, 2022
b4f21b0
added imports to init.py
dvreed77 Jul 19, 2022
96c4a97
fixed a bunch of imports
dvreed77 Jul 21, 2022
6424429
fixed imports
dvreed77 Jul 21, 2022
1d39f89
fixed imports
dvreed77 Jul 21, 2022
3a94143
fixed test
dvreed77 Jul 21, 2022
3541baf
fixed some doc tests
dvreed77 Jul 21, 2022
595a629
fixed release notes
dvreed77 Jul 21, 2022
1dc8fe6
fixed release notes
dvreed77 Jul 21, 2022
669d567
Merge remote-tracking branch 'origin/main' into iss2179-restructure-ft
dvreed77 Jul 21, 2022
0dd5785
fixed test
dvreed77 Jul 21, 2022
e8dfa6c
removed unused imports
dvreed77 Jul 21, 2022
ea57232
fixed imports
dvreed77 Jul 21, 2022
99fe286
added __all__
dvreed77 Jul 21, 2022
15773bb
added __all__
dvreed77 Jul 21, 2022
99aafe2
lint fix
dvreed77 Jul 21, 2022
a75c06f
fixed tests
dvreed77 Jul 22, 2022
c1e9343
fixed import
dvreed77 Jul 22, 2022
46f7de1
added deprecation warning
dvreed77 Jul 22, 2022
9334ed3
fix all flake8 issues
ozzieD Jul 22, 2022
6f38bbb
Merge remote-tracking branch 'origin/iss2179-restructure-ft' into iss…
ozzieD Jul 22, 2022
128b7f1
make utils for geo and datetime
ozzieD Jul 22, 2022
4c3bd56
fix rolling tests
ozzieD Jul 22, 2022
b44205d
fixed rolling primitives test
dvreed77 Jul 22, 2022
c38e999
adding back old modules with warnings
dvreed77 Jul 22, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Expand Up @@ -24,6 +24,7 @@ v1.12.0 Jul 19, 2022
* Add ``IsWorkingHours`` and ``IsLunchTime`` transform primitives (:pr:`2130`)
* Add periods parameter to ``Diff`` and add ``DiffDatetime`` primitive (:pr:`2155`)
* Add ``RollingTrend`` primitive (:pr:`2170`)
* Restructure all primitives into separate files (:pr:`2187`)
* Fixes
* Resolves Woodwork integration test failure and removes Python version check for codecov (:pr:`2182`)
* Changes
Expand Down
2 changes: 1 addition & 1 deletion featuretools/feature_base/feature_base.py
Expand Up @@ -5,7 +5,7 @@
from featuretools.entityset.relationship import Relationship, RelationshipPath
from featuretools.entityset.timedelta import Timedelta
from featuretools.feature_base.utils import is_valid_input
from featuretools.primitives.base import (
from featuretools.primitives.core import (
AggregationPrimitive,
PrimitiveBase,
TransformPrimitive,
Expand Down
12 changes: 11 additions & 1 deletion featuretools/primitives/__init__.py
Expand Up @@ -4,7 +4,17 @@
import pkg_resources
import traceback

from featuretools.primitives.api import * # noqa: F403
from featuretools.primitives.aggregation import *
from featuretools.primitives.core import *
from featuretools.primitives.transform import *
from featuretools.primitives.utils import (
get_aggregation_primitives,
get_default_aggregation_primitives,
get_default_transform_primitives,
get_transform_primitives,
list_primitives,
summarize_primitives,
)


def _load_primitives():
Expand Down
1 change: 1 addition & 0 deletions featuretools/primitives/aggregation/__init__.py
@@ -0,0 +1 @@
from .standard import *
7 changes: 7 additions & 0 deletions featuretools/primitives/aggregation/api.py
@@ -0,0 +1,7 @@
from featuretools.primitives.aggregation import *
from warnings import warn

warn(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is this? Users have to change their import statements just because we re-organized directories?
I don't think that is ideal

Copy link
Contributor Author

@dvreed77 dvreed77 Jul 25, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only if they happen to be importing that far down from featuretools.primitives.aggregation.api

"featuretools.base module will become deprecated. Use featuretools.core instead",
Warning,
)
47 changes: 47 additions & 0 deletions featuretools/primitives/aggregation/standard/__init__.py
@@ -0,0 +1,47 @@
# Standard aggregation primitives, re-exported so they can be imported
# directly from ``featuretools.primitives.aggregation.standard``.
# Imports and ``__all__`` are kept in matching alphabetical order.
from .all import All
from .any import Any
from .avg_time_between import AvgTimeBetween
from .count import Count
from .entropy import Entropy
from .first import First
from .last import Last
from .max import Max
from .mean import Mean
from .median import Median
from .min import Min
from .mode import Mode
from .n_most_common import NMostCommon
from .num_true import NumTrue
from .num_unique import NumUnique
from .percent_true import PercentTrue
from .skew import Skew
from .std import Std
from .sum import Sum
from .time_since_first import TimeSinceFirst
from .time_since_last import TimeSinceLast
from .trend import Trend

# Explicit public API for ``from ... import *`` consumers.
__all__ = [
    "All",
    "Any",
    "AvgTimeBetween",
    "Count",
    "Entropy",
    "First",
    "Last",
    "Max",
    "Mean",
    "Median",
    "Min",
    "Mode",
    "NMostCommon",
    "NumTrue",
    "NumUnique",
    "PercentTrue",
    "Skew",
    "Std",
    "Sum",
    "TimeSinceFirst",
    "TimeSinceLast",
    "Trend",
]
44 changes: 44 additions & 0 deletions featuretools/primitives/aggregation/standard/all.py
@@ -0,0 +1,44 @@
import numpy as np
from dask import dataframe as dd
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import Boolean, BooleanNullable

from featuretools.primitives.core.aggregation_primitive import AggregationPrimitive
from featuretools.utils.gen_utils import Library


class All(AggregationPrimitive):
    """Calculates if all values are 'True' in a list.

    Description:
        Given a list of booleans, return `True` if all
        of the values are `True`.

    Examples:
        >>> all = All()
        >>> all([False, False, False, True])
        False
    """

    name = "all"
    input_types = [
        [ColumnSchema(logical_type=Boolean)],
        [ColumnSchema(logical_type=BooleanNullable)],
    ]
    return_type = ColumnSchema(logical_type=Boolean)
    stack_on_self = False
    compatibility = [Library.PANDAS, Library.DASK]
    description_template = "whether all of {} are true"

    def get_function(self, agg_type=Library.PANDAS):
        # Pandas can use numpy's reduction directly.
        if agg_type != Library.DASK:
            return np.all

        # Dask requires an explicit two-stage Aggregation; the per-partition
        # chunk step and the cross-partition agg step are the same reduction.
        def _reduce_all(series):
            return series.agg(np.all)

        return dd.Aggregation(self.name, chunk=_reduce_all, agg=_reduce_all)
44 changes: 44 additions & 0 deletions featuretools/primitives/aggregation/standard/any.py
@@ -0,0 +1,44 @@
import numpy as np
from dask import dataframe as dd
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import Boolean, BooleanNullable

from featuretools.primitives.core.aggregation_primitive import AggregationPrimitive
from featuretools.utils.gen_utils import Library


class Any(AggregationPrimitive):
    """Determines if any value is 'True' in a list.

    Description:
        Given a list of booleans, return `True` if one or
        more of the values are `True`.

    Examples:
        >>> any = Any()
        >>> any([False, False, False, True])
        True
    """

    name = "any"
    input_types = [
        [ColumnSchema(logical_type=Boolean)],
        [ColumnSchema(logical_type=BooleanNullable)],
    ]
    return_type = ColumnSchema(logical_type=Boolean)
    stack_on_self = False
    compatibility = [Library.PANDAS, Library.DASK]
    description_template = "whether any of {} are true"

    def get_function(self, agg_type=Library.PANDAS):
        # Pandas can use numpy's reduction directly.
        if agg_type != Library.DASK:
            return np.any

        # Dask requires an explicit two-stage Aggregation; the per-partition
        # chunk step and the cross-partition agg step are the same reduction.
        def _reduce_any(series):
            return series.agg(np.any)

        return dd.Aggregation(self.name, chunk=_reduce_any, agg=_reduce_any)
75 changes: 75 additions & 0 deletions featuretools/primitives/aggregation/standard/avg_time_between.py
@@ -0,0 +1,75 @@
from datetime import datetime

import numpy as np
import pandas as pd
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import Datetime, Double

from featuretools.primitives.core.aggregation_primitive import AggregationPrimitive
from featuretools.utils import convert_time_units
from featuretools.utils.gen_utils import Library


class AvgTimeBetween(AggregationPrimitive):
    """Computes the average number of seconds between consecutive events.

    Description:
        Given a list of datetimes, return the average time (default in seconds)
        elapsed between consecutive events. If there are fewer
        than 2 non-null values, return `NaN`.

    Args:
        unit (str): Defines the unit of time.
            Defaults to seconds. Acceptable values:
            years, months, days, hours, minutes, seconds, milliseconds, nanoseconds

    Examples:
        >>> from datetime import datetime
        >>> avg_time_between = AvgTimeBetween()
        >>> times = [datetime(2010, 1, 1, 11, 45, 0),
        ...          datetime(2010, 1, 1, 11, 55, 15),
        ...          datetime(2010, 1, 1, 11, 57, 30)]
        >>> avg_time_between(times)
        375.0
        >>> avg_time_between = AvgTimeBetween(unit="minutes")
        >>> avg_time_between(times)
        6.25
    """

    name = "avg_time_between"
    input_types = [ColumnSchema(logical_type=Datetime, semantic_tags={"time_index"})]
    return_type = ColumnSchema(logical_type=Double, semantic_tags={"numeric"})
    description_template = "the average time between each of {}"

    def __init__(self, unit="seconds"):
        # Normalized here; convert_time_units expects a lowercase unit name.
        self.unit = unit.lower()

    def get_function(self, agg_type=Library.PANDAS):
        def pd_avg_time_between(x):
            """Assumes time scales are closer to order
            of seconds than to nanoseconds
            if times are much closer to nanoseconds
            we could get some floating point errors

            this can be fixed with another function
            that calculates the mean before converting
            to seconds
            """
            x = x.dropna()
            # Need at least two events to have a gap between them.
            if x.shape[0] < 2:
                return np.nan
            if isinstance(x.iloc[0], (pd.Timestamp, datetime)):
                # Reinterpret datetimes as int64 nanoseconds-since-epoch.
                # Series.view is deprecated (removed in pandas 3.0); astype
                # produces the same nanosecond integers for datetime64[ns].
                x = x.astype("int64")
            # use len(x)-1 because we care about difference
            # between values, len(x)-1 = len(diff(x))

            avg = (x.max() - x.min()) / (len(x) - 1)
            # Nanoseconds -> seconds before converting to the requested unit.
            avg = avg * 1e-9

            # long form:
            # diff_in_ns = x.diff().iloc[1:].astype('int64')
            # diff_in_seconds = diff_in_ns * 1e-9
            # avg = diff_in_seconds.mean()
            return convert_time_units(avg, self.unit)

        return pd_avg_time_between
40 changes: 40 additions & 0 deletions featuretools/primitives/aggregation/standard/count.py
@@ -0,0 +1,40 @@
import pandas as pd
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import IntegerNullable

from featuretools.primitives.core.aggregation_primitive import AggregationPrimitive
from featuretools.utils.gen_utils import Library


class Count(AggregationPrimitive):
    """Determines the total number of values, excluding `NaN`.

    Examples:
        >>> count = Count()
        >>> count([1, 2, 3, 4, 5, None])
        5
    """

    name = "count"
    input_types = [ColumnSchema(semantic_tags={"index"})]
    return_type = ColumnSchema(logical_type=IntegerNullable, semantic_tags={"numeric"})
    stack_on_self = False
    default_value = 0
    compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
    description_template = "the number"

    def get_function(self, agg_type=Library.PANDAS):
        # Dask and Spark both dispatch on their built-in "count" aggregation
        # by name rather than by callable.
        if agg_type in (Library.DASK, Library.SPARK):
            return "count"

        # pandas: count of non-NaN entries.
        return pd.Series.count

    def generate_name(
        self,
        base_feature_names,
        relationship_path_name,
        parent_dataframe_name,
        where_str,
        use_prev_str,
    ):
        # Count is base-feature-agnostic, so only the relationship path and
        # any where/use_previous qualifiers appear in the feature name.
        return "COUNT(%s%s%s)" % (relationship_path_name, where_str, use_prev_str)
44 changes: 44 additions & 0 deletions featuretools/primitives/aggregation/standard/entropy.py
@@ -0,0 +1,44 @@
from scipy import stats
from woodwork.column_schema import ColumnSchema

from featuretools.primitives.core.aggregation_primitive import AggregationPrimitive
from featuretools.utils.gen_utils import Library


class Entropy(AggregationPrimitive):
    """Calculates the entropy for a categorical column

    Description:
        Given a list of observations from a categorical
        column return the entropy of the distribution.
        NaN values can be treated as a category or
        dropped.

    Args:
        dropna (bool): Whether to consider NaN values as a separate category
            Defaults to False.
        base (float): The logarithmic base to use
            Defaults to e (natural logarithm)

    Examples:
        >>> pd_entropy = Entropy()
        >>> pd_entropy([1,2,3,4])
        1.3862943611198906
    """

    name = "entropy"
    input_types = [ColumnSchema(semantic_tags={"category"})]
    return_type = ColumnSchema(semantic_tags={"numeric"})
    stack_on_self = False
    description_template = "the entropy of {}"

    def __init__(self, dropna=False, base=None):
        self.dropna = dropna
        self.base = base

    def get_function(self, agg_type=Library.PANDAS):
        def _entropy(series):
            # Normalized value_counts gives the empirical probability of
            # each category; scipy computes entropy from those probabilities.
            probabilities = series.value_counts(normalize=True, dropna=self.dropna)
            return stats.entropy(probabilities, base=self.base)

        return _entropy
26 changes: 26 additions & 0 deletions featuretools/primitives/aggregation/standard/first.py
@@ -0,0 +1,26 @@
from woodwork.column_schema import ColumnSchema

from featuretools.primitives.core.aggregation_primitive import AggregationPrimitive
from featuretools.utils.gen_utils import Library


class First(AggregationPrimitive):
    """Determines the first value in a list.

    Examples:
        >>> first = First()
        >>> first([1, 2, 3, 4, 5, None])
        1.0
    """

    name = "first"
    input_types = [ColumnSchema()]
    return_type = None
    stack_on_self = False
    description_template = "the first instance of {}"

    def get_function(self, agg_type=Library.PANDAS):
        def _first_value(column):
            # Positional lookup: return row 0 as-is, even if it is NaN.
            return column.iloc[0]

        return _first_value