Conversation
Codecov Report
@@ Coverage Diff @@
## master #336 +/- ##
==========================================
+ Coverage 95.28% 95.34% +0.05%
==========================================
Files 74 74
Lines 7783 7792 +9
==========================================
+ Hits 7416 7429 +13
+ Misses 367 363 -4
Continue to review full report at Codecov.
|
featuretools/entityset/entity.py
Outdated
| inferred_variable_types = self.infer_variable_types(ignore=list(variable_types.keys()), | ||
| link_vars=link_vars) | ||
|
|
||
| for var_id, desired_type in variable_types.items(): |
There was a problem hiding this comment.
can simplify loop to inferred_variable_types.update(variable_types)
featuretools/entityset/entity.py
Outdated
| dataframe, and create a new column of that name using integers the (0, len(dataframe)). | ||
| Otherwise, assume index exists in dataframe. | ||
| """ | ||
| assert is_string(id), "Entity id must be a string" |
There was a problem hiding this comment.
maybe add a function from line 69 - 76 called _validate_entity_params(id, time_index, columns)
featuretools/entityset/entity.py
Outdated
| return False | ||
|
|
||
|
|
||
| def create_index(index, make_index, df): |
There was a problem hiding this comment.
add short comment describing function
something like "handles index creation logic"
There was a problem hiding this comment.
Maybe this should also be private
featuretools/entityset/entity.py
Outdated
| index = df.columns[0] | ||
| elif make_index and index in df.columns: | ||
| raise RuntimeError("Cannot make index: index variable already present") | ||
| elif make_index or index not in df.columns: |
There was a problem hiding this comment.
i think this last elif block can be simplified to
elif index not in df.columns:
if not make_index:
logger.warning("index %s not found in dataframe, creating new "
"integer column", index)
df.insert(0, index, range(0, len(df)))
created_index = indexat this point if index is in df.columns then make_index must be false and we should skip this block. this is because we handle the case where make_index and index in df.columns is True above
There was a problem hiding this comment.
if my logic is correct might be helpful to leave a comment about this entire if/elif block so people can quickly understand
featuretools/entityset/entity.py
Outdated
| if index not in variable_types: | ||
| variable_types[index] = vtypes.Index | ||
|
|
||
| self.data = {"df": df, |
There was a problem hiding this comment.
can we avoid setting the data like this? perhaps by just making df an argument to some of the other functions?
it feels wrong to set the data here and then call self.update_data(...) a second time at the end
There was a problem hiding this comment.
should we just call the relevant parts of self.update_data separately and skip calling update_data?. It already expects the indexes to be set and it's a little strange to be "updating" the dataframe on initialization
There was a problem hiding this comment.
ya, i agree with that approach
featuretools/entityset/entity.py
Outdated
| self.convert_all_variable_data(inferred_variable_types) | ||
| variable_types = variable_types or {} | ||
| secondary_time_index = secondary_time_index or {} | ||
| self.create_variables(variable_types, index, time_index, secondary_time_index) |
There was a problem hiding this comment.
i think this should be self._create_variables since a user should never need call it, right?
featuretools/entityset/entity.py
Outdated
| self.convert_all_variable_data(inferred_variable_types) | ||
| variable_types = variable_types or {} | ||
| secondary_time_index = secondary_time_index or {} | ||
| self.create_variables(variable_types, index, time_index, secondary_time_index) |
There was a problem hiding this comment.
i think secondary_time_index is getting modified by reference within this function (actually when it gets passed to infer_variable_types). if that's true, i'd say let's avoid that or at least leave a comment.
There was a problem hiding this comment.
This code here, right?
secondary_time_index = secondary_time_index or {}
for ti, cols in secondary_time_index.items():
if ti not in cols:
cols.append(ti)The first line should be thrown out and mabye the rest should go in _validate_entity_params with either a warning or an error if the secondary time index column isn't included.
featuretools/entityset/entity.py
Outdated
| if v.id != index] | ||
|
|
||
| def infer_variable_types(self, variable_types, time_index, secondary_time_index): | ||
| """Extracts the variables from a dataframe |
…y has no time index
|
Looks good to merge |
The idea behind this PR is to make it easier to find where in the code we handle data type and variable type conversion when creating a new entity. Currently it's handled in two places:
EntitySet._import_from_dataframeandEntity.__init__.This PR moves the relevant logic in
EntitySet._import_from_dataframeintoEntity.__init__and removes a couple statements that try to infer variable types that can be handled byEntity.infer_variable_typesinstead.