alteryx · eccabay · Aug 11, 2020 · Aug 4, 2020 · Aug 5, 2020 · Aug 5, 2020
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -3,7 +3,10 @@ Release Notes
 
 **Future Releases**
     * Enhancements
+        * Added new LSA component for text featurization :pr:`1022`
     * Fixes
+        * Updated TextFeaturizer component to no longer require an internet connection to run :pr:`1022`
+        * Fixed non-deterministic element of TextFeaturizer transformations :pr:`1022`
     * Changes
         * Removed DeprecationWarning for SimpleImputer :pr:`1018`
     * Documentation Changes

diff --git a/evalml/pipelines/components/__init__.py b/evalml/pipelines/components/__init__.py
@@ -32,5 +32,6 @@
     DropNullColumns,
     DateTimeFeaturizer,
     SelectColumns,
-    TextFeaturizer
+    TextFeaturizer,
+    LSA,
     )
diff --git a/evalml/pipelines/components/transformers/__init__.py b/evalml/pipelines/components/transformers/__init__.py
@@ -5,4 +5,4 @@
 from .imputers import PerColumnImputer, SimpleImputer, Imputer
 from .scalers import StandardScaler
 from .column_selectors import DropColumns, SelectColumns
-from .preprocessing import DateTimeFeaturizer, DropNullColumns, TextFeaturizer
+from .preprocessing import DateTimeFeaturizer, DropNullColumns, LSA, TextFeaturizer
diff --git a/evalml/pipelines/components/transformers/preprocessing/__init__.py b/evalml/pipelines/components/transformers/preprocessing/__init__.py
@@ -1,4 +1,5 @@
 # flake8:noqa
 from .datetime_featurizer import DateTimeFeaturizer
 from .drop_null_columns import DropNullColumns
+from .lsa import LSA
 from .text_featurizer import TextFeaturizer
diff --git a/evalml/pipelines/components/transformers/preprocessing/lsa.py b/evalml/pipelines/components/transformers/preprocessing/lsa.py
@@ -0,0 +1,86 @@
+import warnings
+
+import pandas as pd
+from sklearn.decomposition import TruncatedSVD
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.pipeline import make_pipeline
+
+from evalml.pipelines.components.transformers import Transformer
+
+
+class LSA(Transformer):
+    """Transformer to calculate the Latent Semantic Analysis Values of text input"""
+    name = "LSA Transformer"
+    hyperparameter_ranges = {}
+
+    def __init__(self, text_columns=None, random_state=0, **kwargs):
+        """Creates a transformer to perform TF-IDF transformation and Singular Value Decomposition for text columns.
+
+        Arguments:
+            text_columns (list): list of `pd.DataFrame` column names that contain text.
+            random_state (int, np.random.RandomState): Seed for the random number generator.
+        """
+        text_columns = text_columns or []
+        parameters = {'text_columns': text_columns}
+        parameters.update(kwargs)
+
+        for i, col_name in enumerate(text_columns):
+            if not isinstance(col_name, str):
+                text_columns[i] = str(col_name)
+        self.text_col_names = text_columns
+        self.lsa_pipeline = make_pipeline(TfidfVectorizer(), TruncatedSVD(random_state=random_state))
+        super().__init__(parameters=parameters,
+                         component_obj=None,
+                         random_state=random_state)
+
+    def _verify_col_names(self, col_names):
+        missing_cols = []
+        for col in self.text_col_names:
+            if col not in col_names:
+                missing_cols.append(col)
+
+        if len(missing_cols) > 0:
+            if len(missing_cols) == len(self.text_col_names):
+                raise RuntimeError("None of the provided text column names match the columns in the given DataFrame")
+            for col in missing_cols:
+                self.text_col_names.remove(col)
+            warnings.warn("Columns {} were not found in the given DataFrame, ignoring".format(missing_cols), RuntimeWarning)
+
+    def fit(self, X, y=None):
+        if len(self.text_col_names) == 0:
+            warnings.warn("No text columns were given to LSA, component has no effect", RuntimeWarning)
+            return self
+        if not isinstance(X, pd.DataFrame):
+            X = pd.DataFrame(X).rename(columns=str)
+        self._verify_col_names(X.columns)
+
+        corpus = []
+        for col in self.text_col_names:
+            corpus.extend(X[col].values.tolist())
+
+        self.lsa_pipeline.fit(corpus)
+        return self
+
+    def transform(self, X, y=None):
+        """Transforms data X by applying the LSA pipeline.
+        Arguments:
+            X (pd.DataFrame): Data to transform
+            y (pd.Series, optional): Targets
+        Returns:
+            pd.DataFrame: Transformed X
+        """
+        if not isinstance(X, pd.DataFrame):
+            X = pd.DataFrame(X)
+        X_t = X
+
+        for col in self.text_col_names:
+            try:
+                transformed = self.lsa_pipeline.transform(X[col])
+                X_t = X_t.drop(labels=col, axis=1)
+            except KeyError:
+                transformed = self.lsa_pipeline.transform(X[int(col)])
+                X_t = X_t.drop(labels=int(col), axis=1)
+
+            X_t['LSA({})[0]'.format(col)] = pd.Series(transformed[:, 0])
+            X_t['LSA({})[1]'.format(col)] = pd.Series(transformed[:, 1])
+        return X_t
diff --git a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py
@@ -4,6 +4,7 @@
 import pandas as pd
 
 from evalml.pipelines.components.transformers import Transformer
+from evalml.pipelines.components.transformers.preprocessing import LSA
 from evalml.utils import import_or_raise
 
 
@@ -27,13 +28,12 @@ def __init__(self, text_columns=None, random_state=0, **kwargs):
         parameters = {'text_columns': text_columns}
         parameters.update(kwargs)
 
-        if len(text_columns) == 0:
-            warnings.warn("No text columns were given to TextFeaturizer, component will have no effect", RuntimeWarning)
         for i, col_name in enumerate(text_columns):
             if not isinstance(col_name, str):
                 text_columns[i] = str(col_name)
         self.text_col_names = text_columns
         self._features = None
+        self._lsa = LSA(text_columns=text_columns, random_state=random_state)
         super().__init__(parameters=parameters,
                          component_obj=None,
                          random_state=random_state)
@@ -69,6 +69,7 @@ def _verify_col_types(self, entity_set):
 
     def fit(self, X, y=None):
         if len(self.text_col_names) == 0:
+            warnings.warn("No text columns were given to TextFeaturizer, component has no effect", RuntimeWarning)
             self._features = []
             return self
         if not isinstance(X, pd.DataFrame):
@@ -83,11 +84,11 @@ def fit(self, X, y=None):
         es.df = self._clean_text(X)
 
         trans = [self._nlp_primitives.DiversityScore,
-                 self._nlp_primitives.LSA,
                  self._nlp_primitives.MeanCharactersPerWord,
                  self._nlp_primitives.PartOfSpeechCount,
                  self._nlp_primitives.PolarityScore]
 
+        self._lsa.fit(X)
         self._features = self._ft.dfs(entityset=es,
                                       target_entity='X',
                                       trans_primitives=trans,
@@ -112,6 +113,8 @@ def transform(self, X, y=None):
         self._verify_col_names(X.columns)
 
         X_text = X[self.text_col_names]
+        X_lsa = self._lsa.transform(X_text)
+
         X_text['index'] = range(len(X_text))
         X_t = X.drop(self.text_col_names, axis=1)
 
@@ -123,5 +126,5 @@ def transform(self, X, y=None):
         feature_matrix = self._ft.calculate_feature_matrix(features=self._features,
                                                            entityset=es,
                                                            verbose=True)
-        X_t = pd.concat([X_t, feature_matrix.reindex(X.index)], axis=1)
+        X_t = pd.concat([X_t, feature_matrix.reindex(X.index), X_lsa], axis=1)
         return X_t
diff --git a/evalml/tests/component_tests/test_lsa.py b/evalml/tests/component_tests/test_lsa.py
@@ -0,0 +1,128 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from evalml.pipelines.components import LSA
+
+
+@pytest.fixture()
+def text_df():
+    df = pd.DataFrame(
+        {'col_1': ['I\'m singing in the rain! Just singing in the rain, what a glorious feeling, I\'m happy again!',
+                   'In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.',
+                   'I\'m gonna be the main event, like no king was before! I\'m brushing up on looking down, I\'m working on my ROAR!'],
+         'col_2': ['do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!',
+                   'I dreamed a dream in days gone by, when hope was high and life worth living',
+                   'Red, the blood of angry men - black, the dark of ages past']
+         })
+    yield df
+
+
+def test_lsa_only_text(text_df):
+    X = text_df
+    lsa = LSA(text_columns=['col_1', 'col_2'])
+    lsa.fit(X)
+
+    expected_col_names = set(['LSA(col_1)[0]',
+                              'LSA(col_1)[1]',
+                              'LSA(col_2)[0]',
+                              'LSA(col_2)[1]'])
+    X_t = lsa.transform(X)
+    assert set(X_t.columns) == expected_col_names
+    assert len(X_t.columns) == 4
+    assert X_t.dtypes.all() == np.float64
+
+
+def test_lsa_with_nontext(text_df):
+    X = text_df
+    X['col_3'] = [73.7, 67.213, 92]
+    lsa = LSA(text_columns=['col_1', 'col_2'])
+
+    lsa.fit(X)
+    expected_col_names = set(['LSA(col_1)[0]',
+                              'LSA(col_1)[1]',
+                              'LSA(col_2)[0]',
+                              'LSA(col_2)[1]',
+                              'col_3'])
+    X_t = lsa.transform(X)
+    assert set(X_t.columns) == expected_col_names
+    assert len(X_t.columns) == 5
+    assert X_t.dtypes.all() == np.float64
+
+
+def test_lsa_no_text():
+    X = pd.DataFrame({'col_1': [1, 2, 3], 'col_2': [4, 5, 6]})
+    warn_msg = "No text columns were given to LSA, component has no effect"
+    lsa = LSA()
+
+    with pytest.warns(RuntimeWarning, match=warn_msg):
+        lsa.fit(X)
+    X_t = lsa.transform(X)
+    assert len(X_t.columns) == 2
+
+
+def test_some_missing_col_names(text_df):
+    X = text_df
+    lsa = LSA(text_columns=['col_1', 'col_2', 'col_3'])
+
+    with pytest.warns(RuntimeWarning, match="not found in the given DataFrame"):
+        lsa.fit(X)
+
+    expected_col_names = set(['LSA(col_1)[0]',
+                              'LSA(col_1)[1]',
+                              'LSA(col_2)[0]',
+                              'LSA(col_2)[1]'])
+    X_t = lsa.transform(X)
+    assert set(X_t.columns) == expected_col_names
+    assert len(X_t.columns) == 4
+    assert X_t.dtypes.all() == np.float64
+
+
+def test_all_missing_col_names(text_df):
+    X = text_df
+    lsa = LSA(text_columns=['col_3', 'col_4'])
+
+    error_msg = "None of the provided text column names match the columns in the given DataFrame"
+    with pytest.raises(RuntimeError, match=error_msg):
+        lsa.fit(X)
+
+
+def test_empty_text_column():
+    X = pd.DataFrame({'col_1': []})
+    lsa = LSA(text_columns=['col_1'])
+    with pytest.raises(ValueError, match="empty vocabulary"):
+        lsa.fit(X)
+
+
+def test_index_col_names():
+    X = np.array([['I\'m singing in the rain!$%^ do do do do do da do', 'do you hear the people sing?////////////////////////////////////'],
+                  ['just singing in the rain.................. \n', 'singing the songs of angry men\n'],
+                  ['\t\n\n\n\nWhat a glorious feelinggggggggggg, I\'m happy again!!! lalalalalalalalalalala', '\tIt is the music of a people who will NOT be slaves again!!!!!!!!!!!']])
+    lsa = LSA(text_columns=[0, 1])
+
+    lsa.fit(X)
+    expected_col_names = set(['LSA(0)[0]',
+                              'LSA(0)[1]',
+                              'LSA(1)[0]',
+                              'LSA(1)[1]'])
+    X_t = lsa.transform(X)
+    assert set(X_t.columns) == expected_col_names
+    assert len(X_t.columns) == 4
+    assert X_t.dtypes.all() == np.float64
+
+
+def test_lsa_output():
+    X = pd.DataFrame(
+        {'lsa': ['do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!',
+                 'I dreamed a dream in days gone by, when hope was high and life worth living',
+                 'Red, the blood of angry men - black, the dark of ages past']})
+    lsa = LSA(text_columns=['lsa'])
+    lsa.fit(X)
+
+    expected_features = [[0.832, 0.],
+                         [0., 1.],
+                         [0.832, 0.]]
+    X_t = lsa.transform(X)
+    cols = [col for col in X_t.columns if 'LSA' in col]
+    features = X_t[cols]
+    np.testing.assert_almost_equal(features, expected_features, decimal=3)
diff --git a/evalml/tests/component_tests/test_text_featurizer.py b/evalml/tests/component_tests/test_text_featurizer.py
@@ -73,11 +73,11 @@ def test_featurizer_with_nontext(text_df):
 
 def test_featurizer_no_text():
     X = pd.DataFrame({'col_1': [1, 2, 3], 'col_2': [4, 5, 6]})
-    warn_msg = "No text columns were given to TextFeaturizer, component will have no effect"
-    with pytest.warns(RuntimeWarning, match=warn_msg):
-        tf = TextFeaturizer()
+    warn_msg = "No text columns were given to TextFeaturizer, component has no effect"
+    tf = TextFeaturizer()
 
-    tf.fit(X)
+    with pytest.warns(RuntimeWarning, match=warn_msg):
+        tf.fit(X)
     X_t = tf.transform(X)
     assert len(X_t.columns) == 2
 
@@ -182,9 +182,9 @@ def test_lsa_primitive_output():
     tf = TextFeaturizer(text_columns=['lsa'])
     tf.fit(X)
 
-    expected_features = [[0.0200961, 0.002976],
-                         [0.0223392, 0.0058817],
-                         [0.0186072, -0.0006121]]
+    expected_features = [[0.832, 0.],
+                         [0., 1.],
+                         [0.832, 0.]]
     X_t = tf.transform(X)
     cols = [col for col in X_t.columns if 'LSA' in col]
     features = X_t[cols]

diff --git a/evalml/tests/component_tests/test_utils.py b/evalml/tests/component_tests/test_utils.py
@@ -12,9 +12,9 @@
 
 def test_all_components(has_minimal_dependencies):
     if has_minimal_dependencies:
-        assert len(all_components) == 22
+        assert len(all_components) == 23
     else:
-        assert len(all_components) == 26
+        assert len(all_components) == 27
 
 
 def test_handle_component_class_names():