Skip to content

Commit

Permalink
Check dtypes match when adding relationship (#159)
Browse files Browse the repository at this point in the history
* attempt to cast categorical dtype to int before object; check dtypes match when adding relationship

* remove unused Entity.attempt_cast_index_to_int

* test dfs on mock_customer demo dataset

* test dtype mismatch

* linting

* fix assert for python 3
  • Loading branch information
rwedge committed May 30, 2018
1 parent 1794707 commit 70591f3
Show file tree
Hide file tree
Showing 14 changed files with 137 additions and 102 deletions.
2 changes: 1 addition & 1 deletion featuretools/demo/mock_customer.py
Expand Up @@ -20,7 +20,7 @@ def load_mock_customer(n_customers=5, n_products=5, n_sessions=35, n_transaction
customers_df["zip_code"] = choice(["60091", "02139"], n_customers,)
customers_df["join_date"] = pd.date_range('1/1/2008', periods=n_customers, freq='50d') # todo make these less regular

products_df = pd.DataFrame({"product_id": range(1, n_products + 1)})
products_df = pd.DataFrame({"product_id": pd.Categorical(range(1, n_products + 1))})
products_df["brand"] = choice(["A", "B", "C"], n_products)

sessions_df = pd.DataFrame({"session_id": range(1, n_sessions + 1)})
Expand Down
20 changes: 10 additions & 10 deletions featuretools/entityset/base_entityset.py
@@ -1,8 +1,7 @@
import logging
from builtins import object

import pandas as pd
from pandas.api.types import is_numeric_dtype, is_object_dtype, is_string_dtype
from pandas.api.types import is_dtype_equal

from featuretools import variable_types as vtypes

Expand Down Expand Up @@ -198,14 +197,15 @@ def add_relationship(self, relationship):
parent_e.convert_variable_type(variable_id=parent_v,
new_type=vtypes.Index,
convert_data=False)
if ((is_object_dtype(parent_e.df[parent_v]) or
is_string_dtype(parent_e.df[parent_v])) and
is_numeric_dtype(child_e.df[child_v])):
parent_e.df[parent_v] = pd.to_numeric(parent_e.df[parent_v])
if ((is_object_dtype(child_e.df[child_v]) or
is_string_dtype(child_e.df[child_v])) and
is_numeric_dtype(parent_e.df[parent_v])):
child_e.df[child_v] = pd.to_numeric(child_e.df[child_v])

parent_dtype = parent_e.df[parent_v].dtype
child_dtype = child_e.df[child_v].dtype
msg = "Unable to add relationship because {} in {} is Pandas dtype {}"\
" and {} in {} is Pandas dtype {}."
if not is_dtype_equal(parent_dtype, child_dtype):
raise ValueError(msg.format(parent_v, parent_e.name, parent_dtype,
child_v, child_e.name, child_dtype))

self.relationships.append(relationship)
self.index_data(relationship)
return self
Expand Down
11 changes: 0 additions & 11 deletions featuretools/entityset/entity.py
Expand Up @@ -65,7 +65,6 @@ def __init__(self, id, df, entityset, variable_types=None, name=None,
self._verbose = verbose
self.created_index = created_index
self.convert_variable_types(variable_types)
self.attempt_cast_index_to_int(index)
super(Entity, self).__init__(id, entityset, variable_types, name, index,
time_index, secondary_time_index, relationships, already_sorted)

Expand Down Expand Up @@ -97,16 +96,6 @@ def indexed_by(self):
def indexed_by(self, idx):
self.data["indexed_by"] = idx

def attempt_cast_index_to_int(self, index_var):
    """Best-effort cast of an object/categorical column of ``self.df`` to int.

    The column is converted in place only when its dtype name marks it as an
    object or categorical column and its first value is already an integer.
    If the conversion cannot be performed the column is left untouched.

    Args:
        index_var: Name of the column in ``self.df`` to try to convert.
    """
    column = self.df[index_var]

    # An empty column has no first value to inspect; nothing to cast.
    if len(column) == 0:
        return

    dtype_name = column.dtype.name
    # Parenthesized to make explicit how ``and`` binds tighter than ``or``.
    is_candidate = (('int' not in dtype_name and 'object' in dtype_name) or
                    'categ' in dtype_name)
    if not is_candidate:
        return

    if not isinstance(column.iloc[0], (int, np.int32, np.int64)):
        return

    try:
        self.df[index_var] = column.astype(int)
    except (ValueError, TypeError):
        # Deliberately best-effort: values such as NaN (ValueError) or None
        # (TypeError) cannot become ints, so keep the original column.
        pass

def convert_variable_types(self, variable_types):
for var_id, desired_type in variable_types.items():
type_args = {}
Expand Down
9 changes: 8 additions & 1 deletion featuretools/entityset/entityset.py
Expand Up @@ -562,10 +562,17 @@ def _import_from_dataframe(self,
df = dataframe
for c in df.columns:
if df[c].dtype.name.find('category') > -1:
df[c] = df[c].astype(object)
try:
df[c] = df[c].astype(int)
except ValueError:
df[c] = df[c].astype(object)
if c not in variable_types:
variable_types[c] = vtypes.Categorical
if df.index.dtype.name.find('category') > -1:
try:
df[c] = df[c].astype(int)
except ValueError:
df[c] = df[c].astype(object)
df.index = df.index.astype(object)

self.add_entity(entity_id,
Expand Down
10 changes: 9 additions & 1 deletion featuretools/tests/demo_tests/test_demo_data.py
@@ -1,7 +1,8 @@
import os

from featuretools.demo import load_retail
from featuretools.demo import load_mock_customer, load_retail
from featuretools.demo.retail import make_retail_pathname
from featuretools.synthesis import dfs


def test_load_retail_save():
Expand All @@ -24,3 +25,10 @@ def test_load_retail_diff():
assert es_second['order_products'].df.shape[0] == nrows_second
os.remove(make_retail_pathname(nrows))
os.remove(make_retail_pathname(nrows_second))


def test_mock_customer():
    """Run DFS on the mock customer entityset and check the feature matrix.

    Every feature definition returned by ``dfs`` must have a column of the
    same name in the returned feature matrix.
    """
    entityset = load_mock_customer(return_entityset=True)
    feature_matrix, feature_defs = dfs(entityset=entityset,
                                       target_entity="customers",
                                       max_depth=3)
    matrix_columns = feature_matrix.columns
    for feat in feature_defs:
        assert feat.get_name() in matrix_columns
31 changes: 31 additions & 0 deletions featuretools/tests/entityset_tests/test_es.py
Expand Up @@ -30,6 +30,37 @@ def test_add_relationships_convert_type(es):
assert parent_e.df[r.parent_variable.id].dtype == child_e.df[r.child_variable.id].dtype


def test_add_relationship_errors_on_dtype_mismatch(es):
    """Adding a relationship across mismatched pandas dtypes raises ValueError.

    A copy of the ``log`` entity is registered as ``log2``; relating
    ``regions.id`` to ``log2.session_id`` is expected to fail with a message
    naming both columns and their dtypes.
    """
    log_copy = es['log'].df.copy()
    log2_types = {
        'id': variable_types.Categorical,
        'session_id': variable_types.Id,
        'product_id': variable_types.Id,
        'datetime': variable_types.Datetime,
        'value': variable_types.Numeric,
        'value_2': variable_types.Numeric,
        'latlong': variable_types.LatLong,
        'latlong2': variable_types.LatLong,
        'value_many_nans': variable_types.Numeric,
        'priority_level': variable_types.Ordinal,
        'purchased': variable_types.Boolean,
        'comments': variable_types.Text,
    }
    es.entity_from_dataframe(entity_id='log2',
                             dataframe=log_copy,
                             index='id',
                             variable_types=log2_types,
                             time_index='datetime',
                             encoding='utf-8')

    expected_message = ("Unable to add relationship because id in regions "
                        "is Pandas dtype object and session_id in log2 "
                        "is Pandas dtype int64.")

    with pytest.raises(ValueError) as excinfo:
        bad_relationship = Relationship(es['regions']['id'],
                                        es['log2']['session_id'])
        es.add_relationship(bad_relationship)

    # Checked after the context manager exits, once the exception is captured.
    assert str(excinfo.value) == expected_message


def test_get_forward_entities(es):
entities = es.get_forward_entities('log')
assert entities == set(['sessions', 'products'])
Expand Down
8 changes: 4 additions & 4 deletions featuretools/tests/integration_data/customers.csv
@@ -1,4 +1,4 @@
id,age,region_id,cohort,cohort_name,loves_ice_cream,favorite_quote,signup_date,upgrade_date,cancel_date,cancel_reason,date_of_birth,engagement_level
0,33,United States,0,Early Adopters,True,The proletariat have nothing to lose but their chains,2011-04-08,2011-04-10,2011-06-08,reason_1,1993-03-08,1
1,25,United States,1,Late Adopters,False,Capitalism deprives us all of self-determination,2011-04-09,2011-04-11,2011-10-09,reason_2,1926-08-02,3
2,56,United States,0,Early Adopters,True,All members of the working classes must seize the means of production.,2011-04-06,2011-04-07,2012-01-06,reason_1,1993-04-20,2
age,cancel_date,cancel_reason,cohort,cohort_name,date_of_birth,engagement_level,favorite_quote,id,loves_ice_cream,region_id,signup_date,upgrade_date
33,2011-06-08,reason_1,0,Early Adopters,1993-03-08,1,The proletariat have nothing to lose but their chains,0,True,United States,2011-04-08,2011-04-10
25,2011-10-09,reason_2,1,Late Adopters,1926-08-02,3,Capitalism deprives us all of self-determination,1,False,United States,2011-04-09,2011-04-11
56,2012-01-06,reason_1,0,Early Adopters,1993-04-20,2,All members of the working classes must seize the means of production.,2,True,United States,2011-04-06,2011-04-07
8 changes: 4 additions & 4 deletions featuretools/tests/integration_data/customers_int.csv
@@ -1,4 +1,4 @@
id,age,region_id,cohort,cohort_name,loves_ice_cream,favorite_quote,signup_date,upgrade_date,cancel_date,cancel_reason,date_of_birth,engagement_level
0,33,United States,0,Early Adopters,True,The proletariat have nothing to lose but their chains,6,18,27,reason_1,2,1
1,25,United States,1,Late Adopters,False,Capitalism deprives us all of self-determination,7,26,28,reason_2,1,3
2,56,United States,0,Early Adopters,True,All members of the working classes must seize the means of production.,4,5,29,reason_1,3,2
age,cancel_date,cancel_reason,cohort,cohort_name,date_of_birth,engagement_level,favorite_quote,id,loves_ice_cream,region_id,signup_date,upgrade_date
33,27,reason_1,0,Early Adopters,2,1,The proletariat have nothing to lose but their chains,0,True,United States,6,18
25,28,reason_2,1,Late Adopters,1,3,Capitalism deprives us all of self-determination,1,False,United States,7,26
56,29,reason_1,0,Early Adopters,3,2,All members of the working classes must seize the means of production.,2,True,United States,4,5
44 changes: 22 additions & 22 deletions featuretools/tests/integration_data/log.csv
@@ -1,5 +1,5 @@
id,session_id,product_id,datetime,value,value_2,latlong,latlong2,value_many_nans,priority_level,purchased,comments
0,0,coke zero,2011-04-09 10:30:00,0.0,0.0,"(0, 0)","(0, 0)",,0,True,"
comments,datetime,id,latlong,latlong2,priority_level,product_id,purchased,session_id,value,value_2,value_many_nans
"
When it comes to Coca-Cola products, people tend to be die-hard fans. Many of us know someone who can't go a day without a Diet Coke (or two or three). And while Diet Coke has been a leading sugar-free soft drink since it was first released in 1982, it came to light that young adult males shied away from this beverage — identifying diet cola as a woman's drink. The company's answer to that predicament came in 2005 - in the form of a shiny black can - with the release of Coca-Cola Zero.

While Diet Coke was created with its own flavor profile and not as a sugar-free version of the original, Coca-Cola Zero aims to taste just like the ""real Coke flavor."" Despite their polar opposite advertising campaigns, the contents and nutritional information of the two sugar-free colas is nearly identical. With that information in hand we at HuffPost Taste needed to know: Which of these two artificially-sweetened Coca-Cola beverages actually tastes better? And can you even tell the difference between them?
Expand Down Expand Up @@ -40,10 +40,10 @@ Coca-Cola Zero: ""Has more of a sharply sweet aftertaste I associate with diet s
Overall comments: ""That was a lot more difficult than I though it would be."" ""Both equally palatable."" A few people said Diet Coke tasted much better ... unbeknownst to them, they were actually referring to Coca-Cola Zero.

IN SUMMARY: It is a real toss up. There is not one artificially-sweetened Coca-Cola beverage that outshines the other. So how do people choose between one or the other? It is either a matter of personal taste, or maybe the marketing campaigns will influence their choice.
",2011-04-09 10:30:00,0,"(0, 0)","(0, 0)",0,coke zero,True,0,0.0,0.0,
I loved it,2011-04-09 10:30:06,1,"(5, 2)","(2, -5)",0,coke zero,True,0,5.0,2.0,
I loved it,2011-04-09 10:30:12,2,"(10, 4)","(4, -10)",1,coke zero,True,0,10.0,4.0,
"
1,0,coke zero,2011-04-09 10:30:06,5.0,2.0,"(5, 2)","(2, -5)",,0,True,I loved it
2,0,coke zero,2011-04-09 10:30:12,10.0,4.0,"(10, 4)","(4, -10)",,1,True,I loved it
3,0,car,2011-04-09 10:30:18,15.0,6.0,"(15, 6)","(6, -15)",,1,True,"
The full-size pickup truck and the V-8 engine were supposed to be inseparable, like the internet and cat videos. You can’t have one without the other—or so we thought.

In America’s most popular vehicle, the Ford F-150, two turbocharged six-cylinder engines marketed under the EcoBoost name have dethroned the naturally aspirated V-8. Ford’s new 2.7-liter twin-turbo V-6 is the popular choice, while the 3.5-liter twin-turbo V-6 is the top performer. The larger six allows for greater hauling capacity, accelerates the truck more quickly, and swills less gas in EPA testing than the V-8 alternative. It’s enough to make even old-school truck buyers acknowledge that there actually is a replacement for displacement.
Expand All @@ -63,8 +63,8 @@ For the most part, though, the equipment in this particular Lariat lives up to t
Middle-Child Syndrome

In the F-150, Ford has a trifecta of engines (the fourth, a naturally aspirated 3.5-liter V-6, is best left to the fleet operators). The 2.7-liter twin-turbo V-6 delivers remarkable performance at an affordable price. The 3.5-liter twin-turbo V-6 is the workhorse, with power, torque, and hauling capability to spare. Compared with those two logical options, the middle-child 5.0-liter V-8 is the right-brain choice. Its strongest selling points may be its silky power delivery and the familiar V-8 rumble. That’s a flimsy argument when it comes to rationalizing a $50,000-plus purchase, though, so perhaps it’s no surprise that today’s boosted six-cylinders are now the engines of choice in the F-150.
",2011-04-09 10:30:18,3,"(15, 6)","(6, -15)",1,car,True,0,15.0,6.0,
"
4,0,car,2011-04-09 10:30:24,20.0,8.0,"(20, 8)","(8, -20)",,1,True,"
THE GOOD
The Tesla Model S 90D's electric drivetrain is substantially more efficient than any internal combustion engine, and gives the car smooth and quick acceleration. All-wheel drive comes courtesy of a smart dual motor system. The new Autopilot feature eases the stress of stop-and-go traffic and long road trips.

Expand Down Expand Up @@ -129,8 +129,8 @@ The 2016 Tesla Model S 90D adds features to keep it competitive against the inte
Lengthy charging times mean longer trips are either out of the question or require more planning than with an internal combustion car. And while the infotainment system responds quickly to touch inputs and offers useful screens, it hasn't changed much in four years. Most notably, Tesla hasn't added any music apps beyond the ones it launched with. Along with new, useful apps, it would be nice to have some themes or other aesthetic changes to the infotainment interface.

The Model S 90D's base price of $88,000 puts it out of reach of the average buyer, and the model I drove was optioned up to around $95,000. Against its Audi, BMW and Mercedes-Benz competition, however, it makes a compelling argument, especially for its uncomplicated nature.
",2011-04-09 10:30:24,4,"(20, 8)","(8, -20)",1,car,True,0,20.0,8.0,
"
5,1,toothpaste,2011-04-09 10:31:00,0.0,0.0,"(0, 0)","(0, 0)",0.0,1,True,"
Toothpaste can do more harm than good

The next time a patient innocently asks me, “What’s the best toothpaste to use?” I’m going to unleash a whole Chunky Soup can of “You Want The Truth? You CAN’T HANDLE THE TRUTH!!!” Gosh, that’s such an overused movie quote. Sorry about that, but still.
Expand Down Expand Up @@ -221,13 +221,13 @@ But now I’m tired of talking about toothpaste.
Next topic?

I’m bringing pyorrhea back.
"
6,1,toothpaste,2011-04-09 10:31:09,1.0,1.0,"(1, 1)","(1, -1)",1.0,1,True,"
",2011-04-09 10:31:00,5,"(0, 0)","(0, 0)",1,toothpaste,True,1,0.0,0.0,0.0
"
I’ve been a user of Colgate Total Whitening Toothpaste for many years because I’ve always tried to maintain a healthy smile (I’m a receptionist so I need a white smile). But because I drink coffee at least twice a day (sometimes more!) and a lot of herbal teas, I’ve found that using just this toothpaste alone doesn’t really get my teeth white...

The best way to get white teeth is to really try some professional products specifically for tooth whitening. I’ve tried a few products, like Crest White Strips and found that the strips are really not as good as the trays. Although the Crest White Strips are easy to use, they really DO NOT cover your teeth perfectly like some other professional dental whitening kits. This Product did cover my teeth well however because of their custom heat trays, and whitening my teeth A LOT. I would say if you really want white teeth, use the Colgate Toothpaste and least 2 times a day, along side a professional Gel product like Shine Whitening.
"
7,1,toothpaste,2011-04-09 10:31:18,2.0,2.0,"(2, 2)","(2, -2)",2.0,0,True,"
",2011-04-09 10:31:09,6,"(1, 1)","(1, -1)",1,toothpaste,True,1,1.0,1.0,1.0
"
The first feature is the price, and it is right.

Next, I consider whether it will be neat to use. It is. Sometimes when I buy those new hard plastic containers, they actually get messy. Also I cannot get all the toothpaste out. It is easy to get the paste out of Colgate Total Whitening Paste without spraying it all over the cabinet.
Expand All @@ -239,11 +239,11 @@ Whitening is important. This one is supposed ot whiten. After spending money to
Avoiding all kinds of oral pathology is a major consideration. This toothpaste claims that it can help fight cavities, gingivitis, plaque, tartar, and bad breath.

I hope this product stays on the market a long time and does not change.
"
8,1,brown bag,2011-04-09 10:31:27,3.0,3.0,"(3, 3)","(3, -3)",3.0,0,True,"
",2011-04-09 10:31:18,7,"(2, 2)","(2, -2)",0,toothpaste,True,1,2.0,2.0,2.0
"
These bags looked exactly like I'd hoped, however, the handles broke off of almost every single bag as soon as items were placed in them! I used these as gift bags for out-of-town guests at my wedding, so imagine my embarassment as the handles broke off as I was handing them out. I would not recommend purchaing these bags unless you plan to fill them with nothing but paper! Anything heavier will cause the handles to snap right off.
",2011-04-09 10:31:27,8,"(3, 3)","(3, -3)",0,brown bag,True,1,3.0,3.0,3.0
"
9,2,brown bag,2011-04-09 10:40:00,0.0,0.0,"(0, 0)","(0, 0)",0.0,0,True,"
I purchased these in August 2014 from Big Blue Supplies. I have no problem with the seller, these arrived new condition, fine shape.

I do have a slight problem with the bags. In case someone might want to know, the handles on these bags are set inside against the top. Then a piece of Kraft type packing tape is placed over the handles to hold them in place. On some of the bags, the tape is already starting to peel off. I would be really hesitant about using these bags unless I reinforced the current tape with a different adhesive.
Expand All @@ -257,8 +257,8 @@ Even the dollar store bags I normally purchase do not have that stamped on the b
I do not think I would purchase again for all the reasons stated above.

Another thing for those still wanting to purchase, the ones I received were: 12 3/4 inches high not including handle, 10 1/4 inches wide and a 5 1/4 inch depth.
",2011-04-09 10:40:00,9,"(0, 0)","(0, 0)",0,brown bag,True,2,0.0,0.0,0.0
"
10,3,Haribo sugar-free gummy bears,2011-04-10 10:40:00,0.0,0.0,"(0, 0)","(0, 0)",,0,True,"
The place: BMO Harris Bradley Center
The event: Bucks VS Spurs
The snack: Satan's Diarrhea Hate Bears made by Haribo
Expand All @@ -285,14 +285,14 @@ Not a word was said, but a diaper was thrown over the stall. I catch it, line my

My son asks me, ""Daddy, why are we leaving early?""
""Well son, I need to change my diaper""
",2011-04-10 10:40:00,10,"(0, 0)","(0, 0)",0,Haribo sugar-free gummy bears,True,3,0.0,0.0,
I loved it,2011-04-10 10:40:01,11,"(5, 2)","(2, -5)",0,coke zero,False,3,5.0,2.0,
I loved it,2011-04-10 10:41:00,12,"(0, 0)","(0, 0)",0,coke zero,False,4,0.0,0.0,0.0
I loved it,2011-04-10 10:41:03,13,"(7, 3)","(3, -7)",2,coke zero,False,4,7.0,3.0,3.0
I loved it,2011-04-10 10:41:06,14,"(14, 6)","(6, -14)",2,coke zero,False,4,14.0,6.0,6.0
"
11,3,coke zero,2011-04-10 10:40:01,5.0,2.0,"(5, 2)","(2, -5)",,0,False,I loved it
12,4,coke zero,2011-04-10 10:41:00,0.0,0.0,"(0, 0)","(0, 0)",0.0,0,False,I loved it
13,4,coke zero,2011-04-10 10:41:03,7.0,3.0,"(7, 3)","(3, -7)",3.0,2,False,I loved it
14,4,coke zero,2011-04-10 10:41:06,14.0,6.0,"(14, 6)","(6, -14)",6.0,2,False,I loved it
15,5,taco clock,2011-04-10 11:10:00,,,"(nan, nan)","(nan, nan)",,1,True,"
This timer does what it is supposed to do. Setup is elementary. Replacing the old one (after 12 years) was relatively easy. It has performed flawlessly since. I'm delighted I could find an esoteric product like this at Amazon. Their service, and the customer reviews, are just excellent.
",2011-04-10 11:10:00,15,"(nan, nan)","(nan, nan)",1,taco clock,True,5,,,
"
16,5,taco clock,2011-04-10 11:10:03,,,"(nan, nan)","(nan, nan)",,1,False,"
Funny, cute clock. A little spendy for how light the clock is, but its hard to find a taco clock.
"
",2011-04-10 11:10:03,16,"(nan, nan)","(nan, nan)",1,taco clock,False,5,,,

0 comments on commit 70591f3

Please sign in to comment.