Merge pull request #71 from bjherger/inverse_transform

Inverse transform
bjherger · Oct 11, 2018 · 632ec9a · 632ec9a
2 parents a1eccb7 + 06cedc1
commit 632ec9a
Show file tree

Hide file tree

Showing 3 changed files with 152 additions and 20 deletions.
diff --git a/keras_pandas/Automater.py b/keras_pandas/Automater.py
@@ -191,24 +191,7 @@ def fit_transform(self, input_dataframe):
         """
         return self.fit(input_dataframe).transform(input_dataframe)
 
-    def get_transformers(self):
-        # TODO
-        pass
-
-    def get_transformer(self, variable):
-        # TODO
-        pass
-
-    def list_default_transformation_pipelines(self):
-        # TODO
-        pass
-
-    def _check_input_dataframe_columns_(self, input_dataframe):
-        # TODO
-        pass
-
-    def _check_output_dataframe_columns_(self, output_dataframe):
-        # TODO
+    def _get_variable_type(self, variable_type_dict, variable):
         pass
 
     def _create_input_nub(self, variable_type_dict, input_dataframe):
@@ -399,3 +382,30 @@ def _suggest_loss(self, variable_type_dict, y):
 
         return suggested_loss
 
+    def _inverse_transform_output(self, y):
+        """
+        Map values of the response variable from the transformed (scaled) space back to the natural scale.
+
+        Only numerical response variables are supported. The `standardscaler` step of the response variable's
+        transformation pipeline (looked up in `self.output_mapper.built_features`) is used to undo the scaling.
+
+        :param y: Values to inverse transform; passed unchanged to the scaler's `inverse_transform` (presumably
+            model predictions for the response variable -- confirm against callers)
+        :return: The result of the response scaler's `inverse_transform` applied to `y`
+        :raises ValueError: If the response variable's type is not `numerical_vars`
+        """
+        # Find response variable's variable type
+        response_variable_types = lib.get_variable_type(self.response_var, self._variable_type_dict, self.response_var)
+        response_variable_type = response_variable_types[0]
+        logging.info('Found response variable type: {}'.format(response_variable_type))
+
+        # Get transformation pipeline for response variable
+        response_transform_tuple = list(filter(lambda x: x[0][0] == self.response_var, self.output_mapper.built_features))[0]
+        response_transform_pipeline = response_transform_tuple[1]
+        logging.info('response_transform_pipeline" {}'.format(response_transform_pipeline))
+
+        # Compare by value: `is` tests object identity, which is not guaranteed to hold for two equal strings
+        if response_variable_type != 'numerical_vars':
+            raise ValueError("Unable to perform inverse transform for response variable's data type: {}".format(
+                response_variable_type))
+
+        # Numerical responses are un-scaled with the scaler step of their pipeline
+        response_scaler = response_transform_pipeline.named_steps['standardscaler']
+        logging.info('Standard scaler trained for response_var. scale_: {}, mean_: {}, var_: {}'.
+                     format(response_scaler.scale_, response_scaler.mean_, response_scaler.var_))
+
+        natural_scaled_vars = response_scaler.inverse_transform(y)
+        return natural_scaled_vars
+
diff --git a/keras_pandas/lib.py b/keras_pandas/lib.py
@@ -34,6 +34,7 @@ def check_variable_list_are_valid(variable_type_dict):
 
 
 def get_variable_type(variable_name, variable_type_dict, response_var):
+    # TODO This seems unnecessary. We should be able to get the variable type for any variable with this function
     if variable_name is not response_var:
         raise KeyError('Provided variable: {} not in response variable: {}'.format(variable_name,
                                                                                    response_var))

diff --git a/tests/testautomater.py b/tests/testautomater.py
@@ -2,11 +2,16 @@
 import logging
 import unittest
 
+import numpy
+import pandas
+from keras import Model
+from keras.layers import Dense
+
 from keras_pandas import lib
 from keras_pandas.Automater import Automater
 from tests.testbase import TestBase
 
-logging.getLogger().setLevel(logging.INFO)
+logging.getLogger().setLevel(logging.DEBUG)
 
 
 class TestAutomater(TestBase):
@@ -120,4 +125,120 @@ def test_initializer(self):
                           categorical_vars=data['categorical_vars'],
                           datetime_vars=data['datetime_vars'])
 
-        # TODO Test that df_out is captured correctly
+
+    def test_inverse_transform_numerical_response(self):
+
+        # Load data
+        observations = lib.load_lending_club()
+
+        # Set to test run
+        observations = observations.sample(n=100)
+
+        # Declare variable types
+        categorical_vars = ['term', 'grade', 'sub_grade', 'emp_length', 'home_ownership', 'verification_status',
+                            'issue_d',
+                            'pymnt_plan', 'purpose', 'addr_state', 'initial_list_status', 'application_type',
+                            'disbursement_method', 'loan_status']
+        numerical_vars = ['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'annual_inc', 'installment', 'dti',
+                          'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'total_acc', 'pub_rec_bankruptcies',
+                          'int_rate', 'revol_util']
+
+        text_vars = ['desc', 'title']
+
+        # Manual null filling
+        for categorical_var in categorical_vars:
+            observations[categorical_var] = observations[categorical_var].fillna('None')
+            observations[categorical_var] = observations[categorical_var].apply(str)
+
+        auto = Automater(categorical_vars=categorical_vars, numerical_vars=numerical_vars, text_vars=text_vars,
+                         response_var='funded_amnt')
+
+        X, y = auto.fit_transform(observations)
+
+        # Start model with provided input nub
+        x = auto.input_nub
+
+        # Fill in your own hidden layers
+        x = Dense(8)(x)
+        x = Dense(16, activation='relu')(x)
+        x = Dense(8)(x)
+
+        # End model with provided output nub
+        x = auto.output_nub(x)
+
+        model = Model(inputs=auto.input_layers, outputs=x)
+        model.compile(optimizer='Adam', loss=auto.loss, metrics=['accuracy'])
+
+        # Train model
+        logging.warning('Settle in! This training normally takes about 5-20 minutes on CPU')
+        model.fit(X, y, epochs=1, validation_split=.2)
+        unscaled_preds = model.predict(X)
+
+        logging.debug('unscaled_preds: {}'.format(list(unscaled_preds)))
+
+        scaled_preds = auto._inverse_transform_output(unscaled_preds)
+
+        logging.debug('scaled_preds: {}'.format(list(scaled_preds)))
+
+        self.assertNotAlmostEquals(0, numpy.mean(scaled_preds))
+
+        self.assertNotAlmostEquals(1, numpy.std(scaled_preds))
+
+    def test_inverse_transform_numerical_response(self):
+
+        # Load data
+        observations = lib.load_lending_club()
+
+        # Set to test run
+        observations = observations.sample(n=100)
+
+        # Declare variable types
+        categorical_vars = ['term', 'grade', 'sub_grade', 'emp_length', 'home_ownership', 'verification_status',
+                            'issue_d',
+                            'pymnt_plan', 'purpose', 'addr_state', 'initial_list_status', 'application_type',
+                            'disbursement_method', 'loan_status']
+        numerical_vars = ['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'annual_inc', 'installment', 'dti',
+                          'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'total_acc', 'pub_rec_bankruptcies',
+                          'int_rate', 'revol_util']
+
+        text_vars = ['desc', 'title']
+
+        # Manual null filling
+        for categorical_var in categorical_vars:
+            observations[categorical_var] = observations[categorical_var].fillna('None')
+            observations[categorical_var] = observations[categorical_var].apply(str)
+
+        auto = Automater(categorical_vars=categorical_vars, numerical_vars=numerical_vars, text_vars=text_vars,
+                         response_var='funded_amnt')
+
+        X, y = auto.fit_transform(observations)
+
+        # Start model with provided input nub
+        x = auto.input_nub
+
+        # Fill in your own hidden layers
+        x = Dense(8)(x)
+        x = Dense(16, activation='relu')(x)
+        x = Dense(8)(x)
+
+        # End model with provided output nub
+        x = auto.output_nub(x)
+
+        model = Model(inputs=auto.input_layers, outputs=x)
+        model.compile(optimizer='Adam', loss=auto.loss, metrics=['accuracy'])
+
+        # Train model
+        logging.warning('Settle in! This training normally takes about 5-20 minutes on CPU')
+        model.fit(X, y, epochs=1, validation_split=.2)
+        unscaled_preds = model.predict(X)
+
+        logging.debug('unscaled_preds: {}'.format(list(unscaled_preds)))
+
+        scaled_preds = auto._inverse_transform_output(unscaled_preds)
+
+        logging.debug('scaled_preds: {}'.format(list(scaled_preds)))
+
+        self.assertNotAlmostEquals(0, numpy.mean(scaled_preds))
+
+        self.assertNotAlmostEquals(1, numpy.std(scaled_preds))
+