Skip to content

Commit

Permalink
Merge pull request #71 from bjherger/inverse_transform
Browse files Browse the repository at this point in the history
Inverse transform
  • Loading branch information
bjherger committed Oct 11, 2018
2 parents a1eccb7 + 06cedc1 commit 632ec9a
Show file tree
Hide file tree
Showing 3 changed files with 152 additions and 20 deletions.
46 changes: 28 additions & 18 deletions keras_pandas/Automater.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,24 +191,7 @@ def fit_transform(self, input_dataframe):
"""
return self.fit(input_dataframe).transform(input_dataframe)

def get_transformers(self):
# Unimplemented placeholder: the body is only `pass`, so a call does nothing and returns None.
# TODO
pass

def get_transformer(self, variable):
# Unimplemented placeholder: `variable` is accepted but unused; the body is only `pass`,
# so a call does nothing and returns None.
# TODO
pass

def list_default_transformation_pipelines(self):
# Unimplemented placeholder: the body is only `pass`, so a call does nothing and returns None.
# TODO
pass

def _check_input_dataframe_columns_(self, input_dataframe):
# Unimplemented placeholder: `input_dataframe` is accepted but not inspected; the body is
# only `pass`, so no check is performed and the call returns None.
# TODO
pass

def _check_output_dataframe_columns_(self, output_dataframe):
# TODO
def _get_variable_type(self, variable_type_dict, variable):
# Unimplemented placeholder: both arguments are accepted but unused; the body is only
# `pass`, so a call returns None rather than a variable type.
pass

def _create_input_nub(self, variable_type_dict, input_dataframe):
Expand Down Expand Up @@ -399,3 +382,30 @@ def _suggest_loss(self, variable_type_dict, y):

return suggested_loss

def _inverse_transform_output(self, y):
    """
    Map values for the response variable from the scaler's space back to natural units.

    Looks up the response variable's type and its fitted transformation pipeline in
    ``self.output_mapper.built_features``, then applies the inverse of that pipeline's
    ``standardscaler`` step. Only ``'numerical_vars'`` responses are supported.

    :param y: Scaled values for the response variable (e.g. model predictions). Passed
        unchanged to the scaler's ``inverse_transform``; presumably a 2D array-like with
        one column -- TODO confirm against callers.
    :return: The result of the response scaler's ``inverse_transform(y)``
    :raises ValueError: If the response variable's type is not ``'numerical_vars'``
    :raises IndexError: If no built feature in ``self.output_mapper`` matches
        ``self.response_var``
    """
    # Find response variable's variable type
    response_variable_types = lib.get_variable_type(self.response_var, self._variable_type_dict,
                                                    self.response_var)
    response_variable_type = response_variable_types[0]
    logging.info('Found response variable type: {}'.format(response_variable_type))

    # Get transformation pipeline for response variable. Each built feature is indexed as
    # feature[0][0] (first column name) and feature[1] (the fitted pipeline).
    matching_features = [feature for feature in self.output_mapper.built_features
                         if feature[0][0] == self.response_var]
    response_transform_pipeline = matching_features[0][1]
    logging.info('response_transform_pipeline" {}'.format(response_transform_pipeline))

    # Compare by value, not identity: `is` against a string literal only works when the two
    # strings happen to be the same interned object, which is not guaranteed.
    if response_variable_type != 'numerical_vars':
        raise ValueError("Unable to perform inverse transform for response variable's data type: {}".format(
            response_variable_type))

    # Numerical responses: undo the standard scaling fitted on the response variable
    response_scaler = response_transform_pipeline.named_steps['standardscaler']
    logging.info('Standard scaler trained for response_var. scale_: {}, mean_: {}, var_: {}'.
                 format(response_scaler.scale_, response_scaler.mean_, response_scaler.var_))

    natural_scaled_vars = response_scaler.inverse_transform(y)
    return natural_scaled_vars

1 change: 1 addition & 0 deletions keras_pandas/lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def check_variable_list_are_valid(variable_type_dict):


def get_variable_type(variable_name, variable_type_dict, response_var):
# TODO This seems unnecessary. We should be able to get the variable type for any variable with this function
if variable_name is not response_var:
raise KeyError('Provided variable: {} not in response variable: {}'.format(variable_name,
response_var))
Expand Down
125 changes: 123 additions & 2 deletions tests/testautomater.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,16 @@
import logging
import unittest

import numpy
import pandas
from keras import Model
from keras.layers import Dense

from keras_pandas import lib
from keras_pandas.Automater import Automater
from tests.testbase import TestBase

# Root logger at DEBUG so the tests' debug-level output (e.g. logged predictions) is emitted.
# A preceding setLevel(logging.INFO) call was dropped: it was overridden immediately by this one.
logging.getLogger().setLevel(logging.DEBUG)


class TestAutomater(TestBase):
Expand Down Expand Up @@ -120,4 +125,120 @@ def test_initializer(self):
categorical_vars=data['categorical_vars'],
datetime_vars=data['datetime_vars'])

# TODO Test that df_out is captured correctly

def test_inverse_transform_numerical_response(self):
    """
    Train a small model on a numerical response and check that inverse-transformed
    predictions are no longer on a standardized scale (mean not ~0, std not ~1).

    NOTE(review): another method with this exact name is defined later in this file. If both
    are in the same class, the later definition replaces this one and only one test runs.
    """

    # Load data
    observations = lib.load_lending_club()

    # Set to test run
    observations = observations.sample(n=100)

    # Declare variable types
    categorical_vars = ['term', 'grade', 'sub_grade', 'emp_length', 'home_ownership', 'verification_status',
                        'issue_d',
                        'pymnt_plan', 'purpose', 'addr_state', 'initial_list_status', 'application_type',
                        'disbursement_method', 'loan_status']
    numerical_vars = ['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'annual_inc', 'installment', 'dti',
                      'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'total_acc', 'pub_rec_bankruptcies',
                      'int_rate', 'revol_util']

    text_vars = ['desc', 'title']

    # Manual null filling
    for categorical_var in categorical_vars:
        observations[categorical_var] = observations[categorical_var].fillna('None')
        observations[categorical_var] = observations[categorical_var].apply(str)

    auto = Automater(categorical_vars=categorical_vars, numerical_vars=numerical_vars, text_vars=text_vars,
                     response_var='funded_amnt')

    X, y = auto.fit_transform(observations)

    # Start model with provided input nub
    x = auto.input_nub

    # Fill in your own hidden layers
    x = Dense(8)(x)
    x = Dense(16, activation='relu')(x)
    x = Dense(8)(x)

    # End model with provided output nub
    x = auto.output_nub(x)

    model = Model(inputs=auto.input_layers, outputs=x)
    model.compile(optimizer='Adam', loss=auto.loss, metrics=['accuracy'])

    # Train model
    logging.warning('Settle in! This training normally takes about 5-20 minutes on CPU')
    model.fit(X, y, epochs=1, validation_split=.2)

    # Raw model output, i.e. before the response variable's inverse transform is applied
    unscaled_preds = model.predict(X)

    logging.debug('unscaled_preds: {}'.format(list(unscaled_preds)))

    # Predictions after the response variable's inverse transform
    scaled_preds = auto._inverse_transform_output(unscaled_preds)

    logging.debug('scaled_preds: {}'.format(list(scaled_preds)))

    # assertNotAlmostEqual is the supported name; the plural alias is deprecated
    self.assertNotAlmostEqual(0, numpy.mean(scaled_preds))

    self.assertNotAlmostEqual(1, numpy.std(scaled_preds))

def test_inverse_transform_numerical_response(self):
    """
    Train a small model on a numerical response and check that inverse-transformed
    predictions are no longer on a standardized scale (mean not ~0, std not ~1).

    NOTE(review): an earlier method with this exact name and the same body exists in this
    file. If both are in the same class, this definition replaces the earlier one; consider
    removing one copy or renaming it to cover a different case.
    """

    # Load data
    observations = lib.load_lending_club()

    # Set to test run
    observations = observations.sample(n=100)

    # Declare variable types
    categorical_vars = ['term', 'grade', 'sub_grade', 'emp_length', 'home_ownership', 'verification_status',
                        'issue_d',
                        'pymnt_plan', 'purpose', 'addr_state', 'initial_list_status', 'application_type',
                        'disbursement_method', 'loan_status']
    numerical_vars = ['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'annual_inc', 'installment', 'dti',
                      'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'total_acc', 'pub_rec_bankruptcies',
                      'int_rate', 'revol_util']

    text_vars = ['desc', 'title']

    # Manual null filling
    for categorical_var in categorical_vars:
        observations[categorical_var] = observations[categorical_var].fillna('None')
        observations[categorical_var] = observations[categorical_var].apply(str)

    auto = Automater(categorical_vars=categorical_vars, numerical_vars=numerical_vars, text_vars=text_vars,
                     response_var='funded_amnt')

    X, y = auto.fit_transform(observations)

    # Start model with provided input nub
    x = auto.input_nub

    # Fill in your own hidden layers
    x = Dense(8)(x)
    x = Dense(16, activation='relu')(x)
    x = Dense(8)(x)

    # End model with provided output nub
    x = auto.output_nub(x)

    model = Model(inputs=auto.input_layers, outputs=x)
    model.compile(optimizer='Adam', loss=auto.loss, metrics=['accuracy'])

    # Train model
    logging.warning('Settle in! This training normally takes about 5-20 minutes on CPU')
    model.fit(X, y, epochs=1, validation_split=.2)

    # Raw model output, i.e. before the response variable's inverse transform is applied
    unscaled_preds = model.predict(X)

    logging.debug('unscaled_preds: {}'.format(list(unscaled_preds)))

    # Predictions after the response variable's inverse transform
    scaled_preds = auto._inverse_transform_output(unscaled_preds)

    logging.debug('scaled_preds: {}'.format(list(scaled_preds)))

    # assertNotAlmostEqual is the supported name; the plural alias is deprecated
    self.assertNotAlmostEqual(0, numpy.mean(scaled_preds))

    self.assertNotAlmostEqual(1, numpy.std(scaled_preds))

0 comments on commit 632ec9a

Please sign in to comment.