Add LSA Component #1022

Merged: 13 commits merged on Aug 11, 2020
3 changes: 3 additions & 0 deletions docs/source/release_notes.rst
@@ -5,7 +5,10 @@ Release Notes
* Enhancements
* Split `fill_value` into `categorical_fill_value` and `numeric_fill_value` for Imputer :pr:`1019`
* Added `explain_predictions` and `explain_predictions_best_worst` for explaining multiple predictions with SHAP :pr:`1016`
* Added new LSA component for text featurization :pr:`1022`
* Fixes
* Updated TextFeaturizer component to no longer require an internet connection to run :pr:`1022`
* Fixed non-deterministic element of TextFeaturizer transformations :pr:`1022`
* Changes
* Documentation Changes
* Update setup.py URL to point to the github repo :pr:`1037`
3 changes: 2 additions & 1 deletion evalml/pipelines/components/__init__.py
@@ -32,5 +32,6 @@
DropNullColumns,
DateTimeFeaturizer,
SelectColumns,
TextFeaturizer
TextFeaturizer,
LSA,
)
2 changes: 1 addition & 1 deletion evalml/pipelines/components/transformers/__init__.py
@@ -5,4 +5,4 @@
from .imputers import PerColumnImputer, SimpleImputer, Imputer
from .scalers import StandardScaler
from .column_selectors import DropColumns, SelectColumns
from .preprocessing import DateTimeFeaturizer, DropNullColumns, TextFeaturizer
from .preprocessing import DateTimeFeaturizer, DropNullColumns, LSA, TextFeaturizer
1 change: 1 addition & 0 deletions evalml/pipelines/components/transformers/preprocessing/__init__.py
@@ -1,4 +1,5 @@
# flake8:noqa
from .datetime_featurizer import DateTimeFeaturizer
from .drop_null_columns import DropNullColumns
from .lsa import LSA
from .text_featurizer import TextFeaturizer
75 changes: 75 additions & 0 deletions evalml/pipelines/components/transformers/preprocessing/lsa.py
@@ -0,0 +1,75 @@
import logging

import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

from evalml.pipelines.components.transformers import Transformer

logger = logging.getLogger()


class LSA(Transformer):
"""Transformer to calculate the Latent Semantic Analysis Values of text input"""
name = "LSA Transformer"
hyperparameter_ranges = {}

def __init__(self, text_columns=None, random_state=0, **kwargs):
"""Creates a transformer to perform TF-IDF transformation and Singular Value Decomposition for text columns.

Arguments:
text_columns (list): list of feature names which should be treated as text features.
random_state (int, np.random.RandomState): Seed for the random number generator.
"""
parameters = {'text_columns': text_columns}
text_columns = text_columns or []
parameters.update(kwargs)

self._text_col_names = text_columns
self._lsa_pipeline = make_pipeline(TfidfVectorizer(), TruncatedSVD(random_state=random_state))
super().__init__(parameters=parameters,
component_obj=None,
random_state=random_state)

def _verify_col_names(self, col_names):
missing_cols = [col for col in self._text_col_names if col not in col_names]

if len(missing_cols) > 0:
if len(missing_cols) == len(self._text_col_names):
raise RuntimeError("None of the provided text column names match the columns in the given DataFrame")
for col in missing_cols:
self._text_col_names.remove(col)
logger.warn("Columns {} were not found in the given DataFrame, ignoring".format(missing_cols))

def fit(self, X, y=None):
if len(self._text_col_names) == 0:
return self
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
self._verify_col_names(X.columns)

corpus = X[self._text_col_names].values.flatten()
self._lsa_pipeline.fit(corpus)
return self

def transform(self, X, y=None):
"""Transforms data X by applying the LSA pipeline.
Arguments:
X (pd.DataFrame): Data to transform
y (pd.Series, optional): Targets
Returns:
pd.DataFrame: Transformed X. The original column is removed and replaced with two columns of the
format `LSA(original_column_name)[feature_number]`, where `feature_number` is 0 or 1.
"""
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
X_t = X

for col in self._text_col_names:
transformed = self._lsa_pipeline.transform(X[col])

X_t['LSA({})[0]'.format(col)] = pd.Series(transformed[:, 0])
X_t['LSA({})[1]'.format(col)] = pd.Series(transformed[:, 1])
Contributor: Not blocking: what do you think of doing this for the naming: LSA(my_feature, 0) and LSA(my_feature, 1)?

Contributor Author: I like it! I only kept this formatting to mirror what the primitives' generated column names look like, but I can change this if you'd prefer.

X_t = X_t.drop(columns=self._text_col_names)
return X_t
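
For reviewers who want to try the new component locally, here is a minimal usage sketch. It is a hedged example, not part of the PR: the DataFrame, column name, and strings are hypothetical, and it assumes the import path exposed in the `__init__.py` changes above.

```python
import pandas as pd

from evalml.pipelines.components import LSA

# Hypothetical toy data: a single text column named "review"
X = pd.DataFrame({"review": [
    "I loved this product and would buy it again",
    "Terrible quality, arrived broken",
    "It was okay, nothing special",
]})

# Fit the TF-IDF + TruncatedSVD pipeline on the text column
lsa = LSA(text_columns=["review"], random_state=0)
lsa.fit(X)

# The original column is dropped and replaced by two LSA features
X_t = lsa.transform(X)
print(X_t.columns.tolist())  # ['LSA(review)[0]', 'LSA(review)[1]']
```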
evalml/pipelines/components/transformers/preprocessing/text_featurizer.py
@@ -1,11 +1,14 @@
import logging
import string
import warnings

import pandas as pd

from evalml.pipelines.components.transformers import Transformer
from evalml.pipelines.components.transformers.preprocessing import LSA
from evalml.utils import import_or_raise

logger = logging.getLogger()


class TextFeaturizer(Transformer):
"""Transformer that can automatically featurize text columns."""
@@ -16,24 +19,20 @@ def __init__(self, text_columns=None, random_state=0, **kwargs):
"""Extracts features from text columns using featuretools' nlp_primitives

Arguments:
text_colums (list): list of `pd.DataFrame` column names that contain text.
text_columns (list): list of feature names which should be treated as text features.
random_state (int, np.random.RandomState): Seed for the random number generator.

"""
self._ft = import_or_raise("featuretools", error_msg="Package featuretools is not installed. Please install using `pip install featuretools[nlp_primitives].`")
self._nlp_primitives = import_or_raise("nlp_primitives", error_msg="Package nlp_primitives is not installed. Please install using `pip install featuretools[nlp_primitives].`")

text_columns = text_columns or []
parameters = {'text_columns': text_columns}
text_columns = text_columns or []
parameters.update(kwargs)

if len(text_columns) == 0:
warnings.warn("No text columns were given to TextFeaturizer, component will have no effect", RuntimeWarning)
Comment on lines -30 to -31 (Contributor Author): Moved this warning from __init__ to fit to temporarily resolve #1017

for i, col_name in enumerate(text_columns):
if not isinstance(col_name, str):
text_columns[i] = str(col_name)
self.text_col_names = text_columns
self._features = None
self._lsa = LSA(text_columns=text_columns, random_state=random_state)
self._text_col_names = text_columns
super().__init__(parameters=parameters,
component_obj=None,
random_state=random_state)
@@ -44,50 +43,47 @@ def normalize(text):
text = text.translate(str.maketrans('', '', string.punctuation))
return text.lower()

for text_col in self.text_col_names:
for text_col in self._text_col_names:
X[text_col] = X[text_col].apply(normalize)
return X

def _verify_col_names(self, col_names):
missing_cols = []
for col in self.text_col_names:
if col not in col_names:
missing_cols.append(col)
missing_cols = [col for col in self._text_col_names if col not in col_names]

if len(missing_cols) > 0:
if len(missing_cols) == len(self.text_col_names):
if len(missing_cols) == len(self._text_col_names):
raise RuntimeError("None of the provided text column names match the columns in the given DataFrame")
for col in missing_cols:
self.text_col_names.remove(col)
warnings.warn("Columns {} were not found in the given DataFrame, ignoring".format(missing_cols), RuntimeWarning)
self._text_col_names.remove(col)
logger.warn("Columns {} were not found in the given DataFrame, ignoring".format(missing_cols))

def _verify_col_types(self, entity_set):
var_types = entity_set.entities[0].variable_types
for col in self.text_col_names:
if var_types[col] is not self._ft.variable_types.variable.Text:
for col in self._text_col_names:
if var_types[str(col)] is not self._ft.variable_types.variable.Text:
raise ValueError("Column {} is not a text column, cannot apply TextFeaturizer component".format(col))

def fit(self, X, y=None):
if len(self.text_col_names) == 0:
if len(self._text_col_names) == 0:
self._features = []
return self
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X).rename(columns=str)
X = pd.DataFrame(X)
self._verify_col_names(X.columns)
X_text = X[self.text_col_names]
X_text = X[self._text_col_names]
X_text['index'] = range(len(X_text))

es = self._ft.EntitySet()
es = es.entity_from_dataframe(entity_id='X', dataframe=X_text, index='index')
es = es.entity_from_dataframe(entity_id='X', dataframe=X_text.rename(columns=str), index='index')
self._verify_col_types(es)
es.df = self._clean_text(X)

trans = [self._nlp_primitives.DiversityScore,
self._nlp_primitives.LSA,
self._nlp_primitives.MeanCharactersPerWord,
self._nlp_primitives.PartOfSpeechCount,
self._nlp_primitives.PolarityScore]

self._lsa.fit(X)
self._features = self._ft.dfs(entityset=es,
target_entity='X',
trans_primitives=trans,
@@ -108,20 +104,21 @@ def transform(self, X, y=None):
X = pd.DataFrame(X)
if self._features is None or len(self._features) == 0:
return X
X = X.rename(columns=str)
self._verify_col_names(X.columns)

X_text = X[self.text_col_names]
X_text = X[self._text_col_names]
X_lsa = self._lsa.transform(X_text)

X_text['index'] = range(len(X_text))
X_t = X.drop(self.text_col_names, axis=1)
X_t = X.drop(self._text_col_names, axis=1)

es = self._ft.EntitySet()
es = es.entity_from_dataframe(entity_id='X', dataframe=X_text, index='index')
es = es.entity_from_dataframe(entity_id='X', dataframe=X_text.rename(columns=str), index='index')
self._verify_col_types(es)
es.df = self._clean_text(X)

feature_matrix = self._ft.calculate_feature_matrix(features=self._features,
entityset=es,
verbose=True)
X_t = pd.concat([X_t, feature_matrix.reindex(X.index)], axis=1)
X_t = pd.concat([X_t, feature_matrix.reindex(X.index), X_lsa], axis=1)
Contributor: @eccabay why is the feature_matrix.reindex(X.index) necessary? I know that's not part of this PR, I'm just poking around finding ways we can simplify our row/column indexing across the board.

Contributor Author: If I recall correctly, it was because the outputted feature_matrix sets its own indices, so it helped to reset to what was originally given.

return X_t
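
As a generic aside on the reindex question above, the sketch below shows why index alignment matters when a computed feature matrix is concatenated back onto the original frame. The frames and column names are hypothetical pandas examples, not evalml internals.

```python
import pandas as pd

# Hypothetical original frame whose index is not the default RangeIndex
X = pd.DataFrame({"age": [25, 31, 47]}, index=[10, 11, 12])

# Hypothetical computed features that came back with their own 0-based index
feature_matrix = pd.DataFrame({"word_count(text)": [4, 7, 3]})

# pd.concat(axis=1) aligns rows by index label, not by position,
# so mismatched indices NaN-pad both sides instead of pairing rows up
print(pd.concat([X, feature_matrix], axis=1))

# Giving the computed features the original index restores the row pairing
print(pd.concat([X, feature_matrix.set_index(X.index)], axis=1))
```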