diff --git a/featuretools/utils/entity_utils.py b/featuretools/utils/entity_utils.py index 3fb6e9499b..495b5eeea4 100644 --- a/featuretools/utils/entity_utils.py +++ b/featuretools/utils/entity_utils.py @@ -27,14 +27,6 @@ def infer_variable_types(df, link_vars, variable_types, time_index, secondary_ti if len(list(secondary_time_index.keys())): vids_to_assume_datetime.append(list(secondary_time_index.keys())[0]) inferred_type = vtypes.Unknown - df_len = len(df) - if df_len: - sample_df = df.sample(frac=min(10000 / df_len, 1)) - if isinstance(df, dd.core.DataFrame): - sample_df = sample_df.compute() - else: - sample_df = df - for variable in df.columns: if variable in variable_types: continue @@ -43,7 +35,7 @@ def infer_variable_types(df, link_vars, variable_types, time_index, secondary_ti 'use variable_types to provide type metadata for entity' raise ValueError(msg) elif variable in vids_to_assume_datetime: - if col_is_datetime(sample_df[variable]): + if col_is_datetime(df[variable]): inferred_type = vtypes.Datetime else: inferred_type = vtypes.Numeric @@ -52,15 +44,15 @@ def infer_variable_types(df, link_vars, variable_types, time_index, secondary_ti inferred_type = vtypes.Categorical elif df[variable].dtype == "object": - if not df_len: + if not len(df[variable]): inferred_type = vtypes.Categorical - elif col_is_datetime(sample_df[variable]): + elif col_is_datetime(df[variable]): inferred_type = vtypes.Datetime else: inferred_type = vtypes.Categorical # heuristics to predict this some other than categorical - sample = sample_df[variable] + sample = df[variable].sample(min(10000, len(df[variable]))) # catch cases where object dtype cannot be interpreted as a string try: @@ -79,10 +71,10 @@ def infer_variable_types(df, link_vars, variable_types, time_index, secondary_ti elif pdtypes.is_numeric_dtype(df[variable].dtype): inferred_type = vtypes.Numeric - elif col_is_datetime(sample_df[variable]): + elif col_is_datetime(df[variable]): inferred_type = vtypes.Datetime - elif df_len: + elif len(df[variable]): sample = df[variable] \ .sample(min(10000, df[variable].nunique(dropna=False)))