From 36bdc655ea1c72f4e0322ff4c5630eaf725f2159 Mon Sep 17 00:00:00 2001 From: Nate Parsons Date: Fri, 8 May 2020 14:47:30 -0500 Subject: [PATCH 1/2] update infer variable types --- featuretools/utils/entity_utils.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/featuretools/utils/entity_utils.py b/featuretools/utils/entity_utils.py index 3fb6e9499b..7a2aa890ea 100644 --- a/featuretools/utils/entity_utils.py +++ b/featuretools/utils/entity_utils.py @@ -27,13 +27,6 @@ def infer_variable_types(df, link_vars, variable_types, time_index, secondary_ti if len(list(secondary_time_index.keys())): vids_to_assume_datetime.append(list(secondary_time_index.keys())[0]) inferred_type = vtypes.Unknown - df_len = len(df) - if df_len: - sample_df = df.sample(frac=min(10000 / df_len, 1)) - if isinstance(df, dd.core.DataFrame): - sample_df = sample_df.compute() - else: - sample_df = df for variable in df.columns: if variable in variable_types: @@ -43,7 +36,7 @@ def infer_variable_types(df, link_vars, variable_types, time_index, secondary_ti 'use variable_types to provide type metadata for entity' raise ValueError(msg) elif variable in vids_to_assume_datetime: - if col_is_datetime(sample_df[variable]): + if col_is_datetime(df[variable]): inferred_type = vtypes.Datetime else: inferred_type = vtypes.Numeric @@ -52,15 +45,15 @@ def infer_variable_types(df, link_vars, variable_types, time_index, secondary_ti inferred_type = vtypes.Categorical elif df[variable].dtype == "object": - if not df_len: + if not len(df[variable]): inferred_type = vtypes.Categorical - elif col_is_datetime(sample_df[variable]): + elif col_is_datetime(df[variable]): inferred_type = vtypes.Datetime else: inferred_type = vtypes.Categorical # heuristics to predict this some other than categorical - sample = sample_df[variable] + sample = df[variable].sample(min(10000, len(df[variable]))) # catch cases where object dtype cannot be interpreted as a string try: @@ -79,10 +72,10 @@ def infer_variable_types(df, link_vars, variable_types, time_index, secondary_ti elif pdtypes.is_numeric_dtype(df[variable].dtype): inferred_type = vtypes.Numeric - elif col_is_datetime(sample_df[variable]): + elif col_is_datetime(df[variable]): inferred_type = vtypes.Datetime - elif df_len: + elif len(df[variable]): sample = df[variable] \ .sample(min(10000, df[variable].nunique(dropna=False))) From 0ea6e47ea49bd56f4b9d9ae33da30e814ea95066 Mon Sep 17 00:00:00 2001 From: Nate Parsons Date: Fri, 8 May 2020 14:52:51 -0500 Subject: [PATCH 2/2] remove unnecessary change --- featuretools/utils/entity_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/featuretools/utils/entity_utils.py b/featuretools/utils/entity_utils.py index 7a2aa890ea..495b5eeea4 100644 --- a/featuretools/utils/entity_utils.py +++ b/featuretools/utils/entity_utils.py @@ -27,7 +27,6 @@ def infer_variable_types(df, link_vars, variable_types, time_index, secondary_ti if len(list(secondary_time_index.keys())): vids_to_assume_datetime.append(list(secondary_time_index.keys())[0]) inferred_type = vtypes.Unknown - for variable in df.columns: if variable in variable_types: continue