Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revert changes in infer_variable_types #957

Merged
merged 2 commits into from
May 11, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 6 additions & 14 deletions featuretools/utils/entity_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,6 @@ def infer_variable_types(df, link_vars, variable_types, time_index, secondary_ti
if len(list(secondary_time_index.keys())):
vids_to_assume_datetime.append(list(secondary_time_index.keys())[0])
inferred_type = vtypes.Unknown
df_len = len(df)
if df_len:
sample_df = df.sample(frac=min(10000 / df_len, 1))
if isinstance(df, dd.core.DataFrame):
sample_df = sample_df.compute()
else:
sample_df = df

for variable in df.columns:
if variable in variable_types:
continue
Expand All @@ -43,7 +35,7 @@ def infer_variable_types(df, link_vars, variable_types, time_index, secondary_ti
'use variable_types to provide type metadata for entity'
raise ValueError(msg)
elif variable in vids_to_assume_datetime:
if col_is_datetime(sample_df[variable]):
if col_is_datetime(df[variable]):
inferred_type = vtypes.Datetime
else:
inferred_type = vtypes.Numeric
Expand All @@ -52,15 +44,15 @@ def infer_variable_types(df, link_vars, variable_types, time_index, secondary_ti
inferred_type = vtypes.Categorical

elif df[variable].dtype == "object":
if not df_len:
if not len(df[variable]):
inferred_type = vtypes.Categorical
elif col_is_datetime(sample_df[variable]):
elif col_is_datetime(df[variable]):
inferred_type = vtypes.Datetime
else:
inferred_type = vtypes.Categorical

# heuristics to predict whether this is something other than categorical
sample = sample_df[variable]
sample = df[variable].sample(min(10000, len(df[variable])))

# catch cases where object dtype cannot be interpreted as a string
try:
Expand All @@ -79,10 +71,10 @@ def infer_variable_types(df, link_vars, variable_types, time_index, secondary_ti
elif pdtypes.is_numeric_dtype(df[variable].dtype):
inferred_type = vtypes.Numeric

elif col_is_datetime(sample_df[variable]):
elif col_is_datetime(df[variable]):
inferred_type = vtypes.Datetime

elif df_len:
elif len(df[variable]):
sample = df[variable] \
.sample(min(10000, df[variable].nunique(dropna=False)))

Expand Down