From 2281641dfe3d5600defed015521ca3c804dece55 Mon Sep 17 00:00:00 2001 From: rwedge <5392142+rwedge@users.noreply.github.com> Date: Tue, 23 Jan 2018 16:36:17 -0500 Subject: [PATCH 01/11] test for passing along extra cutoff_time columns --- .../test_calculate_feature_matrix.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/featuretools/tests/computational_backend/test_calculate_feature_matrix.py b/featuretools/tests/computational_backend/test_calculate_feature_matrix.py index d2096975a9..599c4cb523 100644 --- a/featuretools/tests/computational_backend/test_calculate_feature_matrix.py +++ b/featuretools/tests/computational_backend/test_calculate_feature_matrix.py @@ -490,3 +490,31 @@ def test_cutoff_time_naming(entityset): with pytest.raises(AttributeError): calculate_feature_matrix([dfeat], cutoff_time=cutoff_df_wrong_index_name) + + +def test_cutoff_time_extra_columns(entityset): + es = entityset + + agg_feat = Count(es['customers']['id'], es['regions']) + dfeat = DirectFeature(agg_feat, es['customers']) + + cutoff_df = pd.DataFrame({'time': [pd.Timestamp('2011-04-09 10:30:06'), + pd.Timestamp('2011-04-08 10:30:00')], + 'instance_id': [0, 0], + 'label': [True, False]}, + columns=['time', 'instance_id', 'label']) + fm = calculate_feature_matrix([dfeat], cutoff_time=cutoff_df) + # check column was added to end of matrix + assert 'label' == fm.columns[-1] + # check column was sorted by time labelike the rest of the feature matrix + true_series = pd.Series([False, True], index=[0, 0]) + assert (fm['label'] == true_series).all() + + fm_2 = calculate_feature_matrix([dfeat], + cutoff_time=cutoff_df, + approximate="2 days") + # check column was added to end of matrix + assert 'label' in fm_2.columns + # check column was sorted by time like the rest of the feature matrix + true_series = pd.Series([False, True], index=[0, 0]) + assert (fm_2['label'] == true_series).all() From d863b712e0c32e3d21914a267553d21f4aed9ba1 Mon Sep 17 00:00:00 2001 From: 
rwedge <5392142+rwedge@users.noreply.github.com> Date: Tue, 23 Jan 2018 17:15:49 -0500 Subject: [PATCH 02/11] test for approximate returning correct cutoff times --- .../test_calculate_feature_matrix.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/featuretools/tests/computational_backend/test_calculate_feature_matrix.py b/featuretools/tests/computational_backend/test_calculate_feature_matrix.py index 599c4cb523..e9b30c4789 100644 --- a/featuretools/tests/computational_backend/test_calculate_feature_matrix.py +++ b/featuretools/tests/computational_backend/test_calculate_feature_matrix.py @@ -518,3 +518,23 @@ def test_cutoff_time_extra_columns(entityset): # check column was sorted by time like the rest of the feature matrix true_series = pd.Series([False, True], index=[0, 0]) assert (fm_2['label'] == true_series).all() + + +def test_approximate_returns_original_time_indexes(entityset): + es = entityset + + agg_feat = Count(es['customers']['id'], es['regions']) + dfeat = DirectFeature(agg_feat, es['customers']) + cutoff_df = pd.DataFrame({'time': [pd.Timestamp('2011-04-09 10:30:06'), + pd.Timestamp('2011-04-08 10:30:00')], + 'instance_id': [0, 0]}) + + fm = calculate_feature_matrix([dfeat], + cutoff_time=cutoff_df, + approximate="2 days", + cutoff_time_in_index=True) + instance_level_vals = fm.index.get_level_values(0).values + time_level_vals = fm.index.get_level_values(1).values + cutoff_df.sort_values(['time'], inplace=True, kind='mergesort') + assert (instance_level_vals == cutoff_df['instance_id'].values).all() + assert (time_level_vals == cutoff_df['time'].values).all() From 3c62325cbb8123a374f0d5890971b9f7de461b43 Mon Sep 17 00:00:00 2001 From: rwedge <5392142+rwedge@users.noreply.github.com> Date: Tue, 23 Jan 2018 18:54:11 -0500 Subject: [PATCH 03/11] test non-approximated fm index, fix return times (wip) --- .../calculate_feature_matrix.py | 21 ++++++------ .../test_calculate_feature_matrix.py | 32 ++++++++++++++----- 2 files 
changed, 36 insertions(+), 17 deletions(-) diff --git a/featuretools/computational_backends/calculate_feature_matrix.py b/featuretools/computational_backends/calculate_feature_matrix.py index 16fc99331c..50b7295a03 100644 --- a/featuretools/computational_backends/calculate_feature_matrix.py +++ b/featuretools/computational_backends/calculate_feature_matrix.py @@ -242,29 +242,32 @@ def calc_results(time_last, ids, precalculated_features=None, training_window=No feature_matrix = [] for _time_last_to_calc, group in grouped: - time_last = group[cutoff_df_time_var].iloc[0] + # sort group by instance id ids = group['instance_id'].sort_values().values - + time_last = group[cutoff_df_time_var].iloc[0] if no_unapproximated_aggs and approximate is not None: window = None else: window = training_window + # calculate values for those instances at time _time_last_to_calc _feature_matrix = calc_results(_time_last_to_calc, ids, precalculated_features=precalculated_features, training_window=window) # this can occur when the features for an instance are calculated at # multiple cutoff times which were binned to the same frequency. 
- if len(_feature_matrix) != len(group): - indexer = group[['instance_id', cutoff_df_time_var]] + if approximate: + id_name = _feature_matrix.index.name + indexer = group[['instance_id', target_time]] + _feature_matrix = (indexer.merge(_feature_matrix, left_on=['instance_id'], right_index=True, how='left') - .set_index('instance_id') - .drop([cutoff_df_time_var], axis=1)) - - time_index = pd.DatetimeIndex([time_last] * _feature_matrix.shape[0], name='time') - _feature_matrix.set_index(time_index, append=True, inplace=True) + .set_index(['instance_id', target_time])) + _feature_matrix.index.set_names([id_name, 'time'], inplace=True) + else: + time_index = pd.DatetimeIndex([time_last] * _feature_matrix.shape[0], name='time') + _feature_matrix.set_index(time_index, append=True, inplace=True) feature_matrix.append(_feature_matrix) feature_matrix = pd.concat(feature_matrix) diff --git a/featuretools/tests/computational_backend/test_calculate_feature_matrix.py b/featuretools/tests/computational_backend/test_calculate_feature_matrix.py index e9b30c4789..125e1b5b99 100644 --- a/featuretools/tests/computational_backend/test_calculate_feature_matrix.py +++ b/featuretools/tests/computational_backend/test_calculate_feature_matrix.py @@ -520,21 +520,37 @@ def test_cutoff_time_extra_columns(entityset): assert (fm_2['label'] == true_series).all() -def test_approximate_returns_original_time_indexes(entityset): +def test_cfm_returns_original_time_indexes(entityset): es = entityset agg_feat = Count(es['customers']['id'], es['regions']) dfeat = DirectFeature(agg_feat, es['customers']) cutoff_df = pd.DataFrame({'time': [pd.Timestamp('2011-04-09 10:30:06'), + pd.Timestamp('2011-04-09 10:30:03'), pd.Timestamp('2011-04-08 10:30:00')], - 'instance_id': [0, 0]}) + 'instance_id': [0, 1, 0]}) + sorted_df = cutoff_df.sort_values(['time'], kind='mergesort') - fm = calculate_feature_matrix([dfeat], - cutoff_time=cutoff_df, - approximate="2 days", + # no approximate + fm = 
calculate_feature_matrix([dfeat], cutoff_time=cutoff_df, cutoff_time_in_index=True) instance_level_vals = fm.index.get_level_values(0).values time_level_vals = fm.index.get_level_values(1).values - cutoff_df.sort_values(['time'], inplace=True, kind='mergesort') - assert (instance_level_vals == cutoff_df['instance_id'].values).all() - assert (time_level_vals == cutoff_df['time'].values).all() + assert (instance_level_vals == sorted_df['instance_id'].values).all() + assert (time_level_vals == sorted_df['time'].values).all() + + # approximate, in different windows + fm = calculate_feature_matrix([dfeat], cutoff_time=cutoff_df, + cutoff_time_in_index=True, approximate="1 m") + instance_level_vals = fm.index.get_level_values(0).values + time_level_vals = fm.index.get_level_values(1).values + assert (instance_level_vals == sorted_df['instance_id'].values).all() + assert (time_level_vals == sorted_df['time'].values).all() + + # approximate, in same window + fm = calculate_feature_matrix([dfeat], cutoff_time=cutoff_df, + cutoff_time_in_index=True, approximate="2 d") + instance_level_vals = fm.index.get_level_values(0).values + time_level_vals = fm.index.get_level_values(1).values + assert (instance_level_vals == sorted_df['instance_id'].values).all() + assert (time_level_vals == sorted_df['time'].values).all() From b1ee424f6ac1ca9ddaf2b46f4f5a5eb91feba4e9 Mon Sep 17 00:00:00 2001 From: rwedge <5392142+rwedge@users.noreply.github.com> Date: Wed, 24 Jan 2018 11:23:30 -0500 Subject: [PATCH 04/11] sort feature matrix by time, then instance --- .../calculate_feature_matrix.py | 12 ++++++------ .../test_calculate_feature_matrix.py | 18 +++++++++--------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/featuretools/computational_backends/calculate_feature_matrix.py b/featuretools/computational_backends/calculate_feature_matrix.py index 50b7295a03..689ce7419c 100644 --- a/featuretools/computational_backends/calculate_feature_matrix.py +++ 
b/featuretools/computational_backends/calculate_feature_matrix.py @@ -258,13 +258,13 @@ def calc_results(time_last, ids, precalculated_features=None, training_window=No if approximate: id_name = _feature_matrix.index.name indexer = group[['instance_id', target_time]] - - _feature_matrix = (indexer.merge(_feature_matrix, - left_on=['instance_id'], - right_index=True, - how='left') - .set_index(['instance_id', target_time])) + _feature_matrix = indexer.merge(_feature_matrix, + left_on=['instance_id'], + right_index=True, + how='left') + _feature_matrix.set_index(['instance_id', target_time], inplace=True) _feature_matrix.index.set_names([id_name, 'time'], inplace=True) + _feature_matrix.sort_index(level=1, kind='mergesort', inplace=True) else: time_index = pd.DatetimeIndex([time_last] * _feature_matrix.shape[0], name='time') _feature_matrix.set_index(time_index, append=True, inplace=True) diff --git a/featuretools/tests/computational_backend/test_calculate_feature_matrix.py b/featuretools/tests/computational_backend/test_calculate_feature_matrix.py index 125e1b5b99..a46f538a38 100644 --- a/featuretools/tests/computational_backend/test_calculate_feature_matrix.py +++ b/featuretools/tests/computational_backend/test_calculate_feature_matrix.py @@ -529,7 +529,7 @@ def test_cfm_returns_original_time_indexes(entityset): pd.Timestamp('2011-04-09 10:30:03'), pd.Timestamp('2011-04-08 10:30:00')], 'instance_id': [0, 1, 0]}) - sorted_df = cutoff_df.sort_values(['time'], kind='mergesort') + sorted_df = cutoff_df.sort_values(['time', 'instance_id'], kind='mergesort') # no approximate fm = calculate_feature_matrix([dfeat], cutoff_time=cutoff_df, @@ -540,17 +540,17 @@ def test_cfm_returns_original_time_indexes(entityset): assert (time_level_vals == sorted_df['time'].values).all() # approximate, in different windows - fm = calculate_feature_matrix([dfeat], cutoff_time=cutoff_df, - cutoff_time_in_index=True, approximate="1 m") - instance_level_vals = 
fm.index.get_level_values(0).values - time_level_vals = fm.index.get_level_values(1).values + fm2 = calculate_feature_matrix([dfeat], cutoff_time=cutoff_df, + cutoff_time_in_index=True, approximate="1 m") + instance_level_vals = fm2.index.get_level_values(0).values + time_level_vals = fm2.index.get_level_values(1).values assert (instance_level_vals == sorted_df['instance_id'].values).all() assert (time_level_vals == sorted_df['time'].values).all() # approximate, in same window - fm = calculate_feature_matrix([dfeat], cutoff_time=cutoff_df, - cutoff_time_in_index=True, approximate="2 d") - instance_level_vals = fm.index.get_level_values(0).values - time_level_vals = fm.index.get_level_values(1).values + fm3 = calculate_feature_matrix([dfeat], cutoff_time=cutoff_df, + cutoff_time_in_index=True, approximate="2 d") + instance_level_vals = fm3.index.get_level_values(0).values + time_level_vals = fm3.index.get_level_values(1).values assert (instance_level_vals == sorted_df['instance_id'].values).all() assert (time_level_vals == sorted_df['time'].values).all() From d276b75fc3183c4d807b3f946118bad9a3c3b455 Mon Sep 17 00:00:00 2001 From: rwedge <5392142+rwedge@users.noreply.github.com> Date: Wed, 24 Jan 2018 13:39:05 -0500 Subject: [PATCH 05/11] take extra columns of cutoff_time and append them to calculated feature matrix --- .../calculate_feature_matrix.py | 28 +++++++++++---- .../test_calculate_feature_matrix.py | 9 ++--- .../tests/integration_data/log_int.csv | 36 +++++++++---------- 3 files changed, 45 insertions(+), 28 deletions(-) diff --git a/featuretools/computational_backends/calculate_feature_matrix.py b/featuretools/computational_backends/calculate_feature_matrix.py index 689ce7419c..0e45a4b158 100644 --- a/featuretools/computational_backends/calculate_feature_matrix.py +++ b/featuretools/computational_backends/calculate_feature_matrix.py @@ -99,6 +99,7 @@ def calculate_feature_matrix(features, cutoff_time=None, instance_ids=None, entityset = 
features[0].entityset target_entity = features[0].entity + pass_columns = [] if not isinstance(cutoff_time, pd.DataFrame): if cutoff_time is None: @@ -130,6 +131,7 @@ def calculate_feature_matrix(features, cutoff_time=None, instance_ids=None, # take the first column that isn't instance_id and assume it is time not_instance_id = [c for c in cutoff_time.columns if c != "instance_id"] cutoff_time.rename(columns={not_instance_id[0]: "time"}, inplace=True) + pass_columns = [column_name for column_name in cutoff_time.columns[2:]] # Get dictionary of features to approximate if approximate is not None: @@ -186,7 +188,7 @@ def calculate_feature_matrix(features, cutoff_time=None, instance_ids=None, training_window, profile, verbose, save_progress, backend, no_unapproximated_aggs, cutoff_df_time_var, - target_time) + target_time, pass_columns) feature_matrix.append(_feature_matrix) # Do a manual garbage collection in case objects from calculate_batch # weren't collected automatically @@ -202,9 +204,11 @@ def calculate_feature_matrix(features, cutoff_time=None, instance_ids=None, return feature_matrix -def calculate_batch(features, group, approximate, entityset, backend_verbose, training_window, - profile, verbose, save_progress, backend, - no_unapproximated_aggs, cutoff_df_time_var, target_time): +def calculate_batch(features, group, approximate, entityset, backend_verbose, + training_window, profile, verbose, save_progress, backend, + no_unapproximated_aggs, cutoff_df_time_var, target_time, + pass_columns): + # if approximating, calculate the approximate features if approximate is not None: precalculated_features, all_approx_feature_set = approximate_features(features, group, @@ -217,6 +221,8 @@ def calculate_batch(features, group, approximate, entityset, backend_verbose, tr precalculated_features = None all_approx_feature_set = None + # if backend verbose wasn't set explicitly, set to True if verbose is true + # and there is only 1 cutoff time if backend_verbose is None: 
one_cutoff_time = group[cutoff_df_time_var].nunique() == 1 backend_verbose = verbose and one_cutoff_time @@ -231,6 +237,7 @@ def calc_results(time_last, ids, precalculated_features=None, training_window=No verbose=backend_verbose) return matrix + # if all aggregations have been approximated, can calculate all together if no_unapproximated_aggs and approximate is not None: grouped = [[datetime.now(), group]] else: @@ -255,9 +262,10 @@ def calc_results(time_last, ids, precalculated_features=None, training_window=No # this can occur when the features for an instance are calculated at # multiple cutoff times which were binned to the same frequency. + id_name = _feature_matrix.index.name + if approximate: - id_name = _feature_matrix.index.name - indexer = group[['instance_id', target_time]] + indexer = group[['instance_id', target_time] + pass_columns] _feature_matrix = indexer.merge(_feature_matrix, left_on=['instance_id'], right_index=True, @@ -268,6 +276,14 @@ def calc_results(time_last, ids, precalculated_features=None, training_window=No else: time_index = pd.DatetimeIndex([time_last] * _feature_matrix.shape[0], name='time') _feature_matrix.set_index(time_index, append=True, inplace=True) + if pass_columns: + pass_through = group[['instance_id', cutoff_df_time_var] + pass_columns] + pass_through.rename(columns={'instance_id': id_name, + cutoff_df_time_var: 'time'}, + inplace=True) + pass_through.set_index([id_name, 'time'], inplace=True) + for col in pass_columns: + _feature_matrix[col] = pass_through[col] feature_matrix.append(_feature_matrix) feature_matrix = pd.concat(feature_matrix) diff --git a/featuretools/tests/computational_backend/test_calculate_feature_matrix.py b/featuretools/tests/computational_backend/test_calculate_feature_matrix.py index a46f538a38..e798ea7d92 100644 --- a/featuretools/tests/computational_backend/test_calculate_feature_matrix.py +++ b/featuretools/tests/computational_backend/test_calculate_feature_matrix.py @@ -499,15 +499,16 @@ def 
test_cutoff_time_extra_columns(entityset): dfeat = DirectFeature(agg_feat, es['customers']) cutoff_df = pd.DataFrame({'time': [pd.Timestamp('2011-04-09 10:30:06'), + pd.Timestamp('2011-04-09 10:30:03'), pd.Timestamp('2011-04-08 10:30:00')], - 'instance_id': [0, 0], - 'label': [True, False]}, + 'instance_id': [0, 1, 0], + 'label': [True, True, False]}, columns=['time', 'instance_id', 'label']) fm = calculate_feature_matrix([dfeat], cutoff_time=cutoff_df) # check column was added to end of matrix assert 'label' == fm.columns[-1] # check column was sorted by time labelike the rest of the feature matrix - true_series = pd.Series([False, True], index=[0, 0]) + true_series = pd.Series([False, True, True], index=[0, 1, 0]) assert (fm['label'] == true_series).all() fm_2 = calculate_feature_matrix([dfeat], @@ -516,7 +517,7 @@ def test_cutoff_time_extra_columns(entityset): # check column was added to end of matrix assert 'label' in fm_2.columns # check column was sorted by time like the rest of the feature matrix - true_series = pd.Series([False, True], index=[0, 0]) + true_series = pd.Series([False, True, True], index=[0, 1, 0]) assert (fm_2['label'] == true_series).all() diff --git a/featuretools/tests/integration_data/log_int.csv b/featuretools/tests/integration_data/log_int.csv index 38f943e2b2..f3845582fc 100644 --- a/featuretools/tests/integration_data/log_int.csv +++ b/featuretools/tests/integration_data/log_int.csv @@ -1,4 +1,4 @@ -comments,datetime,id,priority_level,product_id,purchased,session_id,value,value_2,value_many_nans +comments,datetime,id,latlong,latlong2,priority_level,product_id,purchased,session_id,value,value_2,value_many_nans " When it comes to Coca-Cola products, people tend to be die-hard fans. Many of us know someone who can't go a day without a Diet Coke (or two or three). 
And while Diet Coke has been a leading sugar-free soft drink since it was first released in 1982, it came to light that young adult males shied away from this beverage — identifying diet cola as a woman's drink. The company's answer to that predicament came in 2005 - in the form of a shiny black can - with the release of Coca-Cola Zero. @@ -40,9 +40,9 @@ Coca-Cola Zero: ""Has more of a sharply sweet aftertaste I associate with diet s Overall comments: ""That was a lot more difficult than I though it would be."" ""Both equally palatable."" A few people said Diet Coke tasted much better ... unbeknownst to them, they were actually referring to Coca-Cola Zero. IN SUMMARY: It is a real toss up. There is not one artificially-sweetened Coca-Cola beverage that outshines the other. So how do people choose between one or the other? It is either a matter of personal taste, or maybe the marketing campaigns will influence their choice. -",0,0,0,coke zero,True,0,0.0,0.0, -I loved it,1,1,0,coke zero,True,0,5.0,2.0, -I loved it,2,2,1,coke zero,True,0,10.0,4.0, +",0,0,"(0, 0)","(0, 0)",0,coke zero,True,0,0.0,0.0, +I loved it,1,1,"(5, 2)","(2, -5)",0,coke zero,True,0,5.0,2.0, +I loved it,2,2,"(10, 4)","(4, -10)",1,coke zero,True,0,10.0,4.0, " The full-size pickup truck and the V-8 engine were supposed to be inseparable, like the internet and cat videos. You can’t have one without the other—or so we thought. @@ -63,7 +63,7 @@ For the most part, though, the equipment in this particular Lariat lives up to t Middle-Child Syndrome In the F-150, Ford has a trifecta of engines (the fourth, a naturally aspirated 3.5-liter V-6, is best left to the fleet operators). The 2.7-liter twin-turbo V-6 delivers remarkable performance at an affordable price. The 3.5-liter twin-turbo V-6 is the workhorse, with power, torque, and hauling capability to spare. Compared with those two logical options, the middle-child 5.0-liter V-8 is the right-brain choice. 
Its strongest selling points may be its silky power delivery and the familiar V-8 rumble. That’s a flimsy argument when it comes to rationalizing a $50,000-plus purchase, though, so perhaps it’s no surprise that today’s boosted six-cylinders are now the engines of choice in the F-150. -",3,3,1,car,True,0,15.0,6.0, +",3,3,"(15, 6)","(6, -15)",1,car,True,0,15.0,6.0, " THE GOOD The Tesla Model S 90D's electric drivetrain is substantially more efficient than any internal combustion engine, and gives the car smooth and quick acceleration. All-wheel drive comes courtesy of a smart dual motor system. The new Autopilot feature eases the stress of stop-and-go traffic and long road trips. @@ -129,7 +129,7 @@ The 2016 Tesla Model S 90D adds features to keep it competitive against the inte Lengthy charging times mean longer trips are either out of the question or require more planning than with an internal combustion car. And while the infotainment system responds quickly to touch inputs and offers useful screens, it hasn't changed much in four years. Most notably, Tesla hasn't added any music apps beyond the ones it launched with. Along with new, useful apps, it would be nice to have some themes or other aesthetic changes to the infotainment interface. The Model S 90D's base price of $88,000 puts it out of reach of the average buyer, and the model I drove was optioned up to around $95,000. Against its Audi, BMW and Mercedes-Benz competition, however, it makes a compelling argument, especially for its uncomplicated nature. -",4,4,1,car,True,0,20.0,8.0, +",4,4,"(20, 8)","(8, -20)",1,car,True,0,20.0,8.0, " Toothpaste can do more harm than good @@ -221,12 +221,12 @@ But now I’m tired of talking about toothpaste. Next topic? I’m bringing pyorrhea back. 
- ",5,5,1,toothpaste,True,1,0.0,0.0,0.0 + ",5,5,"(0, 0)","(0, 0)",1,toothpaste,True,1,0.0,0.0,0.0 " I’ve been a user of Colgate Total Whitening Toothpaste for many years because I’ve always tried to maintain a healthy smile (I’m a receptionist so I need a white smile). But because I drink coffee at least twice a day (sometimes more!) and a lot of herbal teas, I’ve found that using just this toothpaste alone doesn’t really get my teeth white... The best way to get white teeth is to really try some professional products specifically for tooth whitening. I’ve tried a few products, like Crest White Strips and found that the strips are really not as good as the trays. Although the Crest White Strips are easy to use, they really DO NOT cover your teeth perfectly like some other professional dental whitening kits. This Product did cover my teeth well however because of their custom heat trays, and whitening my teeth A LOT. I would say if you really want white teeth, use the Colgate Toothpaste and least 2 times a day, along side a professional Gel product like Shine Whitening. - ",6,6,1,toothpaste,True,1,1.0,1.0,1.0 + ",6,6,"(1, 1)","(1, -1)",1,toothpaste,True,1,1.0,1.0,1.0 " The first feature is the price, and it is right. @@ -239,10 +239,10 @@ Whitening is important. This one is supposed ot whiten. After spending money to Avoiding all kinds of oral pathology is a major consideration. This toothpaste claims that it can help fight cavities, gingivitis, plaque, tartar, and bad breath. I hope this product stays on the market a long time and does not change. - ",7,7,0,toothpaste,True,1,2.0,2.0,2.0 + ",7,7,"(2, 2)","(2, -2)",0,toothpaste,True,1,2.0,2.0,2.0 " These bags looked exactly like I'd hoped, however, the handles broke off of almost every single bag as soon as items were placed in them! I used these as gift bags for out-of-town guests at my wedding, so imagine my embarassment as the handles broke off as I was handing them out. 
I would not recommend purchaing these bags unless you plan to fill them with nothing but paper! Anything heavier will cause the handles to snap right off. -",8,8,0,brown bag,True,1,3.0,3.0,3.0 +",8,8,"(3, 3)","(3, -3)",0,brown bag,True,1,3.0,3.0,3.0 " I purchased these in August 2014 from Big Blue Supplies. I have no problem with the seller, these arrived new condition, fine shape. @@ -257,7 +257,7 @@ Even the dollar store bags I normally purchase do not have that stamped on the b I do not think I would purchase again for all the reasons stated above. Another thing for those still wanting to purchase, the ones I received were: 12 3/4 inches high not including handle, 10 1/4 inches wide and a 5 1/4 inch depth. -",9,9,0,brown bag,True,2,0.0,0.0,0.0 +",9,9,"(0, 0)","(0, 0)",0,brown bag,True,2,0.0,0.0,0.0 " The place: BMO Harris Bradley Center The event: Bucks VS Spurs @@ -285,14 +285,14 @@ Not a word was said, but a diaper was thrown over the stall. I catch it, line my My son asks me, ""Daddy, why are we leaving early?"" ""Well son, I need to change my diaper"" -",9,10,0,Haribo sugar-free gummy bears,True,3,0.0,0.0, -I loved it,10,11,0,coke zero,False,3,5.0,2.0, -I loved it,11,12,0,coke zero,False,4,0.0,0.0,0.0 -I loved it,12,13,2,coke zero,False,4,7.0,3.0,3.0 -I loved it,13,14,2,coke zero,False,4,14.0,6.0,6.0 +",9,10,"(0, 0)","(0, 0)",0,Haribo sugar-free gummy bears,True,3,0.0,0.0, +I loved it,10,11,"(5, 2)","(2, -5)",0,coke zero,False,3,5.0,2.0, +I loved it,11,12,"(0, 0)","(0, 0)",0,coke zero,False,4,0.0,0.0,0.0 +I loved it,12,13,"(7, 3)","(3, -7)",2,coke zero,False,4,7.0,3.0,3.0 +I loved it,13,14,"(14, 6)","(6, -14)",2,coke zero,False,4,14.0,6.0,6.0 " This timer does what it is supposed to do. Setup is elementary. Replacing the old one (after 12 years) was relatively easy. It has performed flawlessly since. I'm delighted I could find an esoteric product like this at Amazon. Their service, and the customer reviews, are just excellent. 
-",14,15,1,taco clock,True,5,,, +",14,15,"(nan, nan)","(nan, nan)",1,taco clock,True,5,,, " Funny, cute clock. A little spendy for how light the clock is, but its hard to find a taco clock. -",15,16,1,taco clock,False,5,,, +",15,16,"(nan, nan)","(nan, nan)",1,taco clock,False,5,,, From bcb7578acecbac7effce44641d213ff387b4f6cf Mon Sep 17 00:00:00 2001 From: rwedge <5392142+rwedge@users.noreply.github.com> Date: Wed, 24 Jan 2018 15:09:52 -0500 Subject: [PATCH 06/11] linting --- .../calculate_feature_matrix.py | 31 ++++++++++++------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/featuretools/computational_backends/calculate_feature_matrix.py b/featuretools/computational_backends/calculate_feature_matrix.py index 0e45a4b158..9e2b8afcf1 100644 --- a/featuretools/computational_backends/calculate_feature_matrix.py +++ b/featuretools/computational_backends/calculate_feature_matrix.py @@ -210,13 +210,15 @@ def calculate_batch(features, group, approximate, entityset, backend_verbose, pass_columns): # if approximating, calculate the approximate features if approximate is not None: - precalculated_features, all_approx_feature_set = approximate_features(features, - group, - window=approximate, - entityset=entityset, - training_window=training_window, - verbose=backend_verbose, - profile=profile) + precalculated_features, all_approx_feature_set = approximate_features( + features, + group, + window=approximate, + entityset=entityset, + training_window=training_window, + verbose=backend_verbose, + profile=profile + ) else: precalculated_features = None all_approx_feature_set = None @@ -258,12 +260,15 @@ def calc_results(time_last, ids, precalculated_features=None, training_window=No window = training_window # calculate values for those instances at time _time_last_to_calc - _feature_matrix = calc_results(_time_last_to_calc, ids, precalculated_features=precalculated_features, training_window=window) + _feature_matrix = calc_results(_time_last_to_calc, + ids, + 
precalculated_features=precalculated_features, + training_window=window) - # this can occur when the features for an instance are calculated at - # multiple cutoff times which were binned to the same frequency. id_name = _feature_matrix.index.name + # if approximate, merge feature matrix with group frame to get original + # cutoff times and passed columns if approximate: indexer = group[['instance_id', target_time] + pass_columns] _feature_matrix = indexer.merge(_feature_matrix, @@ -274,9 +279,11 @@ def calc_results(time_last, ids, precalculated_features=None, training_window=No _feature_matrix.index.set_names([id_name, 'time'], inplace=True) _feature_matrix.sort_index(level=1, kind='mergesort', inplace=True) else: - time_index = pd.DatetimeIndex([time_last] * _feature_matrix.shape[0], name='time') + # all rows have same cutoff time. set time and add passed columns + num_rows = _feature_matrix.shape[0] + time_index = pd.DatetimeIndex([time_last] * num_rows, name='time') _feature_matrix.set_index(time_index, append=True, inplace=True) - if pass_columns: + if len(pass_columns) > 0: pass_through = group[['instance_id', cutoff_df_time_var] + pass_columns] pass_through.rename(columns={'instance_id': id_name, cutoff_df_time_var: 'time'}, From d8a33ea3816d144963783c485b747cb3d3d38b17 Mon Sep 17 00:00:00 2001 From: rwedge <5392142+rwedge@users.noreply.github.com> Date: Wed, 24 Jan 2018 16:59:33 -0500 Subject: [PATCH 07/11] added more cases to original time test --- .../test_calculate_feature_matrix.py | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/featuretools/tests/computational_backend/test_calculate_feature_matrix.py b/featuretools/tests/computational_backend/test_calculate_feature_matrix.py index e798ea7d92..4d5167e73f 100644 --- a/featuretools/tests/computational_backend/test_calculate_feature_matrix.py +++ b/featuretools/tests/computational_backend/test_calculate_feature_matrix.py @@ -526,6 +526,7 @@ def 
test_cfm_returns_original_time_indexes(entityset): agg_feat = Count(es['customers']['id'], es['regions']) dfeat = DirectFeature(agg_feat, es['customers']) + agg_feat_2 = Count(es['sessions']['id'], es['customers']) cutoff_df = pd.DataFrame({'time': [pd.Timestamp('2011-04-09 10:30:06'), pd.Timestamp('2011-04-09 10:30:03'), pd.Timestamp('2011-04-08 10:30:00')], @@ -540,7 +541,7 @@ def test_cfm_returns_original_time_indexes(entityset): assert (instance_level_vals == sorted_df['instance_id'].values).all() assert (time_level_vals == sorted_df['time'].values).all() - # approximate, in different windows + # approximate, in different windows, no unapproximated aggs fm2 = calculate_feature_matrix([dfeat], cutoff_time=cutoff_df, cutoff_time_in_index=True, approximate="1 m") instance_level_vals = fm2.index.get_level_values(0).values @@ -548,10 +549,26 @@ def test_cfm_returns_original_time_indexes(entityset): assert (instance_level_vals == sorted_df['instance_id'].values).all() assert (time_level_vals == sorted_df['time'].values).all() - # approximate, in same window + # approximate, in different windows, unapproximated aggs + fm2 = calculate_feature_matrix([dfeat, agg_feat_2], cutoff_time=cutoff_df, + cutoff_time_in_index=True, approximate="1 m") + instance_level_vals = fm2.index.get_level_values(0).values + time_level_vals = fm2.index.get_level_values(1).values + assert (instance_level_vals == sorted_df['instance_id'].values).all() + assert (time_level_vals == sorted_df['time'].values).all() + + # approximate, in same window, no unapproximated aggs fm3 = calculate_feature_matrix([dfeat], cutoff_time=cutoff_df, cutoff_time_in_index=True, approximate="2 d") instance_level_vals = fm3.index.get_level_values(0).values time_level_vals = fm3.index.get_level_values(1).values assert (instance_level_vals == sorted_df['instance_id'].values).all() assert (time_level_vals == sorted_df['time'].values).all() + + # approximate, in same window, unapproximated aggs + fm3 = 
calculate_feature_matrix([dfeat, agg_feat_2], cutoff_time=cutoff_df, + cutoff_time_in_index=True, approximate="2 d") + instance_level_vals = fm3.index.get_level_values(0).values + time_level_vals = fm3.index.get_level_values(1).values + assert (instance_level_vals == sorted_df['instance_id'].values).all() + assert (time_level_vals == sorted_df['time'].values).all() From 2c70c877a945c9ced2e99027f26f48d836e1dbe3 Mon Sep 17 00:00:00 2001 From: rwedge <5392142+rwedge@users.noreply.github.com> Date: Thu, 25 Jan 2018 11:31:09 -0500 Subject: [PATCH 08/11] changed circle ci image --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index be183666c3..9081fef518 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -3,7 +3,7 @@ jobs: build: working_directory: ~/featuretools docker: - - image: themattrix/tox + - image: painless/tox steps: - checkout - run: pyenv local 2.7.13 3.5.2 3.6.0 From 79de7d09001cb58a5d0f327f39af817dcdec8d40 Mon Sep 17 00:00:00 2001 From: Seth-Rothschild Date: Mon, 29 Jan 2018 14:34:00 -0500 Subject: [PATCH 09/11] Document additional_columns API --- .../automated_feature_engineering/handling_time.rst | 10 ++++++++++ .../computational_backends/calculate_feature_matrix.py | 3 ++- featuretools/synthesis/dfs.py | 3 ++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/docs/source/automated_feature_engineering/handling_time.rst b/docs/source/automated_feature_engineering/handling_time.rst index 71c75a67f5..398273ef8d 100644 --- a/docs/source/automated_feature_engineering/handling_time.rst +++ b/docs/source/automated_feature_engineering/handling_time.rst @@ -71,8 +71,18 @@ There is one row in the feature matrix corresponding to a row in ``cutoff_times` cutoff_time_in_index=True) feature_matrix +It is often the case that we want our labels in our calculated feature matrix so that the ordering is consistent between the labels and the rows of the feature 
matrix. However, adding labels to the initial dataframe means that you would have to explicitly prohibit ``dfs`` from building features with that column. To bypass this, we can provide additional columns to cutoff times which will be added directly the feature matrix. While the two columns will be used as an index and cutoff time, any additional columns will appear as features in the resulting feature matrix. +.. ipython:: python + + cutoff_times['labels'] = pd.Series([0, 0, 1, 0, 1]) + + feature_matrix, features = ft.dfs(entityset=es, + target_entity="customers", + cutoff_time=cutoff_times, + cutoff_time_in_index=True) + feature_matrix['labels'] Running DFS with training windows --------------------------------- diff --git a/featuretools/computational_backends/calculate_feature_matrix.py b/featuretools/computational_backends/calculate_feature_matrix.py index 9e2b8afcf1..e441d93868 100644 --- a/featuretools/computational_backends/calculate_feature_matrix.py +++ b/featuretools/computational_backends/calculate_feature_matrix.py @@ -43,7 +43,8 @@ def calculate_feature_matrix(features, cutoff_time=None, instance_ids=None, the features for each instance at. Can either be a DataFrame with 'instance_id' and 'time' columns, DataFrame with the name of the index variable in the target entity and a time column, a list of values, or a single - value to calculate for all instances. + value to calculate for all instances. If the dataframe has more than two columns, any additional + columns will be added to the resulting feature matrix. instance_ids (list): list of instances to calculate features on. Only used if cutoff_time is a single datetime. diff --git a/featuretools/synthesis/dfs.py b/featuretools/synthesis/dfs.py index eba930c1f1..13611fd5cc 100644 --- a/featuretools/synthesis/dfs.py +++ b/featuretools/synthesis/dfs.py @@ -51,7 +51,8 @@ def dfs(entities=None, the features for each instance at. 
Can either be a DataFrame with 'instance_id' and 'time' columns, DataFrame with the name of the index variable in the target entity and a time column, a list of values, or a single - value to calculate for all instances. + value to calculate for all instances. If the dataframe has more than two columns, any additional + columns will be added to the resulting feature matrix. instance_ids (list): list of instances to calculate features on. Only used if cutoff_time is a single datetime. From 6751d77def6d4e2bcf0f36d165bee1f1964b1497 Mon Sep 17 00:00:00 2001 From: Seth-Rothschild Date: Tue, 30 Jan 2018 10:22:01 -0500 Subject: [PATCH 10/11] labels -> label, update handling time phrasing --- docs/source/automated_feature_engineering/handling_time.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/automated_feature_engineering/handling_time.rst b/docs/source/automated_feature_engineering/handling_time.rst index 398273ef8d..70bc4e0030 100644 --- a/docs/source/automated_feature_engineering/handling_time.rst +++ b/docs/source/automated_feature_engineering/handling_time.rst @@ -71,18 +71,18 @@ There is one row in the feature matrix corresponding to a row in ``cutoff_times` cutoff_time_in_index=True) feature_matrix -It is often the case that we want our labels in our calculated feature matrix so that the ordering is consistent between the labels and the rows of the feature matrix. However, adding labels to the initial dataframe means that you would have to explicitly prohibit ``dfs`` from building features with that column. To bypass this, we can provide additional columns to cutoff times which will be added directly the feature matrix. While the two columns will be used as an index and cutoff time, any additional columns will appear as features in the resulting feature matrix. 
+It is often the case that we want our labels in our calculated feature matrix so that the ordering is consistent between the labels and the rows of the feature matrix. However, adding labels to the initial dataframe means that you would have to explicitly prohibit ``dfs`` from building features with that column. To bypass this, we can provide additional columns to cutoff times which will be added directly the feature matrix. While the first two columns will be used as an index and cutoff time regardless of their order in the dataframe, any additional columns will appear as features in the resulting feature matrix. .. ipython:: python - cutoff_times['labels'] = pd.Series([0, 0, 1, 0, 1]) + cutoff_times['label'] = pd.Series([0, 0, 1, 0, 1]) feature_matrix, features = ft.dfs(entityset=es, target_entity="customers", cutoff_time=cutoff_times, cutoff_time_in_index=True) - feature_matrix['labels'] + feature_matrix['label'] Running DFS with training windows --------------------------------- From e58a64614a2c600b1610f6f3be598edd235108c8 Mon Sep 17 00:00:00 2001 From: Seth-Rothschild Date: Tue, 30 Jan 2018 11:11:50 -0500 Subject: [PATCH 11/11] cutoff time column ordering in docs --- docs/source/automated_feature_engineering/handling_time.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/automated_feature_engineering/handling_time.rst b/docs/source/automated_feature_engineering/handling_time.rst index 70bc4e0030..94aa2eedf8 100644 --- a/docs/source/automated_feature_engineering/handling_time.rst +++ b/docs/source/automated_feature_engineering/handling_time.rst @@ -71,7 +71,7 @@ There is one row in the feature matrix corresponding to a row in ``cutoff_times` cutoff_time_in_index=True) feature_matrix -It is often the case that we want our labels in our calculated feature matrix so that the ordering is consistent between the labels and the rows of the feature matrix. 
However, adding labels to the initial dataframe means that you would have to explicitly prohibit ``dfs`` from building features with that column. To bypass this, we can provide additional columns to cutoff times which will be added directly the feature matrix. While the first two columns will be used as an index and cutoff time regardless of their order in the dataframe, any additional columns will appear as features in the resulting feature matrix. +It is often the case that we want our labels in our calculated feature matrix so that the ordering is consistent between the labels and the rows of the feature matrix. However, adding labels to the initial dataframe means that you would have to explicitly prohibit ``dfs`` from building features with that column. To bypass this, we can provide additional columns to cutoff times which will be added directly to the feature matrix. While the first column will be treated as the index and the second as a cutoff time, any additional columns will appear as features in the resulting feature matrix.

.. ipython:: python