Skip to content

Commit

Permalink
Merge pull request #76 from bjherger/bool
Browse files Browse the repository at this point in the history
Taking out booleans
  • Loading branch information
bjherger committed Oct 18, 2018
2 parents a3725d8 + 5f9954a commit 1ef4ac2
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 12 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ board games with my partner in Seattle.

### Development

- Nothing here yet
- Boolean support is deprecated: boolean (bool) variables are now treated as a special case of categorical variables

### 2.0.2

Expand Down
4 changes: 2 additions & 2 deletions keras_pandas/Automater.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ def __init__(self, numerical_vars=list(), categorical_vars=list(), boolean_vars=
# Set up variable type dict, with entries <variable_type, list of variables>
self._variable_type_dict = dict()
self._variable_type_dict['numerical_vars'] = numerical_vars
self._variable_type_dict['categorical_vars'] = categorical_vars
self._variable_type_dict['boolean_vars'] = boolean_vars
# Categorical variables include both categorical and boolean
self._variable_type_dict['categorical_vars'] = categorical_vars + boolean_vars
self._variable_type_dict['datetime_vars'] = datetime_vars
self._variable_type_dict['text_vars'] = text_vars
self._variable_type_dict['non_transformed_vars'] = non_transformed_vars
Expand Down
8 changes: 3 additions & 5 deletions keras_pandas/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,23 +7,21 @@
from keras.layers import Embedding, Flatten, Bidirectional, LSTM
from sklearn.preprocessing import Imputer, StandardScaler

from keras_pandas.transformations import EmbeddingVectorizer, CategoricalImputer, LabelEncoder
from keras_pandas.transformations import EmbeddingVectorizer, CategoricalImputer, LabelEncoder, StringEncoder

default_sklearn_mapper_pipelines = defaultdict(lambda: list())

default_sklearn_mapper_pipelines.update({
'numerical_vars': [Imputer(strategy='mean'), StandardScaler()],
'categorical_vars': [CategoricalImputer(strategy='constant', fill_value='UNK', fill_unknown_labels=True),
'categorical_vars': [StringEncoder(), CategoricalImputer(strategy='constant', fill_value='UNK', fill_unknown_labels=True),
LabelEncoder()],
'boolean_vars': [LabelEncoder()],
'text_vars': [EmbeddingVectorizer()],
'text_vars': [StringEncoder(), EmbeddingVectorizer()],
'non_transformed_vars': []
})

default_suggested_losses = {
'numerical_vars': losses.mean_squared_error,
'categorical_vars': losses.sparse_categorical_crossentropy,
'boolean_vars': losses.sparse_categorical_crossentropy
}


Expand Down
13 changes: 12 additions & 1 deletion keras_pandas/transformations.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,4 +402,15 @@ def inverse_transform(self, y):
if diff:
raise ValueError("y contains new labels: %s" % str(diff))
y = numpy.asarray(y)
return self.classes_[y]
return self.classes_[y]

class StringEncoder(BaseEstimator, TransformerMixin):
    """Stateless transformer that casts its input with ``astype(str)``.

    Nothing is learned during ``fit``; ``transform`` simply returns the
    string-typed version of whatever it is given.
    """

    def __init__(self):
        # No hyper-parameters to configure.
        pass

    def fit(self, X, y=None):
        """Return this transformer unchanged; ``X`` and ``y`` are not inspected."""
        return self

    def transform(self, X):
        """Return the result of ``X.astype(str)``."""
        as_strings = X.astype(str)
        return as_strings
4 changes: 1 addition & 3 deletions tests/testautomater.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def test_initializer(self):
# Base case: No variables
auto = Automater()
self.assertEqual({'numerical_vars': list(), 'categorical_vars': list(),
'boolean_vars': list(), 'datetime_vars': list(), 'text_vars': list(),
'datetime_vars': list(), 'text_vars': list(),
'non_transformed_vars': list()}, auto._variable_type_dict, )
self.assertCountEqual(list(), auto._user_provided_variables)

Expand All @@ -96,7 +96,6 @@ def test_initializer(self):
}

response = copy.deepcopy(data)
response['boolean_vars'] = list()
response['non_transformed_vars'] = list()
response['text_vars'] = list()

Expand All @@ -116,7 +115,6 @@ def test_initializer(self):
}

response = copy.deepcopy(data)
response['boolean_vars'] = list()
response['non_transformed_vars'] = list()

self.assertRaises(ValueError, Automater().__init__(), numerical_vars=data['numerical_vars'],
Expand Down
39 changes: 39 additions & 0 deletions tests/testcategorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,45 @@ def test_create_input_nub_numerical(self):
_create_input_nub(variable_type_dict, train_df)
self.assertEqual(3, len(input_layers))

def test_boolean(self):
    """Smoke test: a boolean column passed via ``boolean_vars`` can be fit,
    transformed, trained on and predicted with, end to end."""
    # Build the data set and derive a boolean column from 'population'
    mushroom_df = lib.load_mushroom()
    mushroom_df['population_bool'] = mushroom_df['population'] == 's'

    # Random split: rows where the draw is below 0.95 go to train, the rest to test
    train_mask = numpy.random.rand(len(mushroom_df)) < 0.95
    train_df = mushroom_df[train_mask]
    holdout_df = mushroom_df[~train_mask]

    auto = Automater(categorical_vars=['odor', 'habitat', 'class'],
                     boolean_vars=['population_bool'],
                     response_var='class')

    auto.fit(train_df)
    X_train, y_train = auto.transform(train_df)

    # Pull the input and output nubs out of the fitted automater
    input_nub, output_nub = auto.input_nub, auto.output_nub

    # Assemble the network: input nub -> Dense(30) -> output nub
    hidden = Dense(30)(input_nub)
    network_output = output_nub(hidden)

    model = Model(inputs=auto.input_layers, outputs=network_output)
    model.compile(optimizer='Adam', loss=auto.loss)

    # Train on the training split
    model.fit(X_train, y_train)

    # Drop the response column from the held-out split, then transform and predict
    holdout_features = holdout_df.drop('class', axis=1)
    X_test, _ = auto.transform(holdout_features)
    model.predict(X_test)


def test_categorical_whole(self):
# Set up data set
mushroom_df = lib.load_mushroom()
Expand Down

0 comments on commit 1ef4ac2

Please sign in to comment.