Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

2179 Restructure Directories #2187

Closed
wants to merge 33 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
d3b84a0
initial commit
dvreed77 Jul 19, 2022
7c7bbaa
added transform primitives
dvreed77 Jul 19, 2022
251f219
added binary transforms
dvreed77 Jul 19, 2022
e61c066
added cumulative transforms
dvreed77 Jul 19, 2022
d3adf29
added datetime transforms
dvreed77 Jul 19, 2022
7a933fc
added rolling
dvreed77 Jul 19, 2022
78bf8d2
fixed utils
dvreed77 Jul 19, 2022
a972a0c
moved cum transforms
dvreed77 Jul 19, 2022
bc72a8a
trying to reconcile imports
dvreed77 Jul 19, 2022
b4f21b0
added imports to init.py
dvreed77 Jul 19, 2022
96c4a97
fixed a bunch of imports
dvreed77 Jul 21, 2022
6424429
fixed imports
dvreed77 Jul 21, 2022
1d39f89
fixed imports
dvreed77 Jul 21, 2022
3a94143
fixed test
dvreed77 Jul 21, 2022
3541baf
fixed some doc tests
dvreed77 Jul 21, 2022
595a629
fixed release notes
dvreed77 Jul 21, 2022
1dc8fe6
fixed release notes
dvreed77 Jul 21, 2022
669d567
Merge remote-tracking branch 'origin/main' into iss2179-restructure-ft
dvreed77 Jul 21, 2022
0dd5785
fixed test
dvreed77 Jul 21, 2022
e8dfa6c
removed unused imports
dvreed77 Jul 21, 2022
ea57232
fixed imports
dvreed77 Jul 21, 2022
99fe286
added __all__
dvreed77 Jul 21, 2022
15773bb
added __all__
dvreed77 Jul 21, 2022
99aafe2
lint fix
dvreed77 Jul 21, 2022
a75c06f
fixed tests
dvreed77 Jul 22, 2022
c1e9343
fixed import
dvreed77 Jul 22, 2022
46f7de1
added deprecation warning
dvreed77 Jul 22, 2022
9334ed3
fix all flake8 issues
ozzieD Jul 22, 2022
6f38bbb
Merge remote-tracking branch 'origin/iss2179-restructure-ft' into iss…
ozzieD Jul 22, 2022
128b7f1
make utils for geo and datetime
ozzieD Jul 22, 2022
4c3bd56
fix rolling tests
ozzieD Jul 22, 2022
b44205d
fixed rolling primitives test
dvreed77 Jul 22, 2022
c38e999
adding back old modules with warnings
dvreed77 Jul 22, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Expand Up @@ -24,6 +24,7 @@ v1.12.0 Jul 19, 2022
* Add ``IsWorkingHours`` and ``IsLunchTime`` transform primitives (:pr:`2130`)
* Add periods parameter to ``Diff`` and add ``DiffDatetime`` primitive (:pr:`2155`)
* Add ``RollingTrend`` primitive (:pr:`2170`)
* Restructure all primitives into separate files (:pr:`2187`)
* Fixes
* Resolves Woodwork integration test failure and removes Python version check for codecov (:pr:`2182`)
* Changes
Expand Down
2 changes: 1 addition & 1 deletion featuretools/feature_base/feature_base.py
Expand Up @@ -5,7 +5,7 @@
from featuretools.entityset.relationship import Relationship, RelationshipPath
from featuretools.entityset.timedelta import Timedelta
from featuretools.feature_base.utils import is_valid_input
from featuretools.primitives.base import (
from featuretools.primitives.core import (
AggregationPrimitive,
PrimitiveBase,
TransformPrimitive,
Expand Down
12 changes: 11 additions & 1 deletion featuretools/primitives/__init__.py
Expand Up @@ -4,7 +4,17 @@
import pkg_resources
import traceback

from featuretools.primitives.api import * # noqa: F403
from featuretools.primitives.aggregation import *
from featuretools.primitives.core import *
from featuretools.primitives.transform import *
from featuretools.primitives.utils import (
get_aggregation_primitives,
get_default_aggregation_primitives,
get_default_transform_primitives,
get_transform_primitives,
list_primitives,
summarize_primitives,
)


def _load_primitives():
Expand Down
1 change: 1 addition & 0 deletions featuretools/primitives/aggregation/__init__.py
@@ -0,0 +1 @@
from .standard import *
7 changes: 7 additions & 0 deletions featuretools/primitives/aggregation/api.py
@@ -0,0 +1,7 @@
from featuretools.primitives.aggregation import *
from warnings import warn

warn(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is this? Users have to change their import statements just because we re-organized directories?
I don't think that is ideal

Copy link
Contributor Author

@dvreed77 dvreed77 Jul 25, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only if they happen to be importing that far down from featuretools.primitives.aggregation.api

"featuretools.base module will become deprecated. Use featuretools.core instead",
Warning,
)
47 changes: 47 additions & 0 deletions featuretools/primitives/aggregation/standard/__init__.py
@@ -0,0 +1,47 @@
# Standard aggregation primitives, re-exported so they can be imported
# directly from ``featuretools.primitives.aggregation.standard``.
# Imports and ``__all__`` are kept in matching alphabetical order.
from .all import All
from .any import Any
from .avg_time_between import AvgTimeBetween
from .count import Count
from .entropy import Entropy
from .first import First
from .last import Last
from .max import Max
from .mean import Mean
from .median import Median
from .min import Min
from .mode import Mode
from .n_most_common import NMostCommon
from .num_true import NumTrue
from .num_unique import NumUnique
from .percent_true import PercentTrue
from .skew import Skew
from .std import Std
from .sum import Sum
from .time_since_first import TimeSinceFirst
from .time_since_last import TimeSinceLast
from .trend import Trend

# Explicit public API for ``from ... import *`` consumers.
__all__ = [
    "All",
    "Any",
    "AvgTimeBetween",
    "Count",
    "Entropy",
    "First",
    "Last",
    "Max",
    "Mean",
    "Median",
    "Min",
    "Mode",
    "NMostCommon",
    "NumTrue",
    "NumUnique",
    "PercentTrue",
    "Skew",
    "Std",
    "Sum",
    "TimeSinceFirst",
    "TimeSinceLast",
    "Trend",
]
44 changes: 44 additions & 0 deletions featuretools/primitives/aggregation/standard/all.py
@@ -0,0 +1,44 @@
import numpy as np
from dask import dataframe as dd
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import Boolean, BooleanNullable

from featuretools.primitives.core.aggregation_primitive import AggregationPrimitive
from featuretools.utils.gen_utils import Library


class All(AggregationPrimitive):
    """Calculates if all values are 'True' in a list.

    Description:
        Given a list of booleans, return `True` if all
        of the values are `True`.

    Examples:
        >>> all = All()
        >>> all([False, False, False, True])
        False
    """

    name = "all"
    input_types = [
        [ColumnSchema(logical_type=Boolean)],
        [ColumnSchema(logical_type=BooleanNullable)],
    ]
    return_type = ColumnSchema(logical_type=Boolean)
    stack_on_self = False
    compatibility = [Library.PANDAS, Library.DASK]
    description_template = "whether all of {} are true"

    def get_function(self, agg_type=Library.PANDAS):
        # Pandas can use numpy's reduction directly.
        if agg_type != Library.DASK:
            return np.all

        # Dask requires an explicit two-stage Aggregation; the per-partition
        # chunk step and the cross-partition agg step are the same reduction.
        def _reduce_all(series):
            return series.agg(np.all)

        return dd.Aggregation(self.name, chunk=_reduce_all, agg=_reduce_all)
44 changes: 44 additions & 0 deletions featuretools/primitives/aggregation/standard/any.py
@@ -0,0 +1,44 @@
import numpy as np
from dask import dataframe as dd
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import Boolean, BooleanNullable

from featuretools.primitives.core.aggregation_primitive import AggregationPrimitive
from featuretools.utils.gen_utils import Library


class Any(AggregationPrimitive):
    """Determines if any value is 'True' in a list.

    Description:
        Given a list of booleans, return `True` if one or
        more of the values are `True`.

    Examples:
        >>> any = Any()
        >>> any([False, False, False, True])
        True
    """

    name = "any"
    input_types = [
        [ColumnSchema(logical_type=Boolean)],
        [ColumnSchema(logical_type=BooleanNullable)],
    ]
    return_type = ColumnSchema(logical_type=Boolean)
    stack_on_self = False
    compatibility = [Library.PANDAS, Library.DASK]
    description_template = "whether any of {} are true"

    def get_function(self, agg_type=Library.PANDAS):
        # Pandas can use numpy's reduction directly.
        if agg_type != Library.DASK:
            return np.any

        # Dask requires an explicit two-stage Aggregation; the per-partition
        # chunk step and the cross-partition agg step are the same reduction.
        def _reduce_any(series):
            return series.agg(np.any)

        return dd.Aggregation(self.name, chunk=_reduce_any, agg=_reduce_any)
75 changes: 75 additions & 0 deletions featuretools/primitives/aggregation/standard/avg_time_between.py
@@ -0,0 +1,75 @@
from datetime import datetime

import numpy as np
import pandas as pd
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import Datetime, Double

from featuretools.primitives.core.aggregation_primitive import AggregationPrimitive
from featuretools.utils import convert_time_units
from featuretools.utils.gen_utils import Library


class AvgTimeBetween(AggregationPrimitive):
    """Computes the average number of seconds between consecutive events.

    Description:
        Given a list of datetimes, return the average time (default in seconds)
        elapsed between consecutive events. If there are fewer
        than 2 non-null values, return `NaN`.

    Args:
        unit (str): Defines the unit of time.
            Defaults to seconds. Acceptable values:
            years, months, days, hours, minutes, seconds, milliseconds, nanoseconds

    Examples:
        >>> from datetime import datetime
        >>> avg_time_between = AvgTimeBetween()
        >>> times = [datetime(2010, 1, 1, 11, 45, 0),
        ...          datetime(2010, 1, 1, 11, 55, 15),
        ...          datetime(2010, 1, 1, 11, 57, 30)]
        >>> avg_time_between(times)
        375.0
        >>> avg_time_between = AvgTimeBetween(unit="minutes")
        >>> avg_time_between(times)
        6.25
    """

    name = "avg_time_between"
    input_types = [ColumnSchema(logical_type=Datetime, semantic_tags={"time_index"})]
    return_type = ColumnSchema(logical_type=Double, semantic_tags={"numeric"})
    description_template = "the average time between each of {}"

    def __init__(self, unit="seconds"):
        # Normalized here; convert_time_units expects a lowercase unit name.
        self.unit = unit.lower()

    def get_function(self, agg_type=Library.PANDAS):
        def pd_avg_time_between(x):
            """Assumes time scales are closer to order
            of seconds than to nanoseconds
            if times are much closer to nanoseconds
            we could get some floating point errors

            this can be fixed with another function
            that calculates the mean before converting
            to seconds
            """
            x = x.dropna()
            # Need at least two events to have a gap between them.
            if x.shape[0] < 2:
                return np.nan
            if isinstance(x.iloc[0], (pd.Timestamp, datetime)):
                # Reinterpret datetimes as int64 nanoseconds-since-epoch.
                # Series.view is deprecated (removed in pandas 3.0); astype
                # produces the same nanosecond integers for datetime64[ns].
                x = x.astype("int64")
            # use len(x)-1 because we care about difference
            # between values, len(x)-1 = len(diff(x))

            avg = (x.max() - x.min()) / (len(x) - 1)
            # Nanoseconds -> seconds before converting to the requested unit.
            avg = avg * 1e-9

            # long form:
            # diff_in_ns = x.diff().iloc[1:].astype('int64')
            # diff_in_seconds = diff_in_ns * 1e-9
            # avg = diff_in_seconds.mean()
            return convert_time_units(avg, self.unit)

        return pd_avg_time_between
40 changes: 40 additions & 0 deletions featuretools/primitives/aggregation/standard/count.py
@@ -0,0 +1,40 @@
import pandas as pd
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import IntegerNullable

from featuretools.primitives.core.aggregation_primitive import AggregationPrimitive
from featuretools.utils.gen_utils import Library


class Count(AggregationPrimitive):
    """Determines the total number of values, excluding `NaN`.

    Examples:
        >>> count = Count()
        >>> count([1, 2, 3, 4, 5, None])
        5
    """

    name = "count"
    input_types = [ColumnSchema(semantic_tags={"index"})]
    return_type = ColumnSchema(logical_type=IntegerNullable, semantic_tags={"numeric"})
    stack_on_self = False
    default_value = 0
    compatibility = [Library.PANDAS, Library.DASK, Library.SPARK]
    description_template = "the number"

    def get_function(self, agg_type=Library.PANDAS):
        # Dask and Spark both dispatch on their built-in "count" aggregation
        # by name rather than by callable.
        if agg_type in (Library.DASK, Library.SPARK):
            return "count"

        # pandas: count of non-NaN entries.
        return pd.Series.count

    def generate_name(
        self,
        base_feature_names,
        relationship_path_name,
        parent_dataframe_name,
        where_str,
        use_prev_str,
    ):
        # Count is base-feature-agnostic, so only the relationship path and
        # any where/use_previous qualifiers appear in the feature name.
        return "COUNT(%s%s%s)" % (relationship_path_name, where_str, use_prev_str)
44 changes: 44 additions & 0 deletions featuretools/primitives/aggregation/standard/entropy.py
@@ -0,0 +1,44 @@
from scipy import stats
from woodwork.column_schema import ColumnSchema

from featuretools.primitives.core.aggregation_primitive import AggregationPrimitive
from featuretools.utils.gen_utils import Library


class Entropy(AggregationPrimitive):
    """Calculates the entropy for a categorical column

    Description:
        Given a list of observations from a categorical
        column return the entropy of the distribution.
        NaN values can be treated as a category or
        dropped.

    Args:
        dropna (bool): Whether to consider NaN values as a separate category
            Defaults to False.
        base (float): The logarithmic base to use
            Defaults to e (natural logarithm)

    Examples:
        >>> pd_entropy = Entropy()
        >>> pd_entropy([1,2,3,4])
        1.3862943611198906
    """

    name = "entropy"
    input_types = [ColumnSchema(semantic_tags={"category"})]
    return_type = ColumnSchema(semantic_tags={"numeric"})
    stack_on_self = False
    description_template = "the entropy of {}"

    def __init__(self, dropna=False, base=None):
        self.dropna = dropna
        self.base = base

    def get_function(self, agg_type=Library.PANDAS):
        def _entropy(series):
            # Normalized value_counts gives the empirical probability of
            # each category; scipy computes entropy from those probabilities.
            probabilities = series.value_counts(normalize=True, dropna=self.dropna)
            return stats.entropy(probabilities, base=self.base)

        return _entropy
26 changes: 26 additions & 0 deletions featuretools/primitives/aggregation/standard/first.py
@@ -0,0 +1,26 @@
from woodwork.column_schema import ColumnSchema

from featuretools.primitives.core.aggregation_primitive import AggregationPrimitive
from featuretools.utils.gen_utils import Library


class First(AggregationPrimitive):
    """Determines the first value in a list.

    Examples:
        >>> first = First()
        >>> first([1, 2, 3, 4, 5, None])
        1.0
    """

    name = "first"
    input_types = [ColumnSchema()]
    return_type = None
    stack_on_self = False
    description_template = "the first instance of {}"

    def get_function(self, agg_type=Library.PANDAS):
        def _first_value(column):
            # Positional lookup: return row 0 as-is, even if it is NaN.
            return column.iloc[0]

        return _first_value