From 0ba5627109d00d29549b01de430a0783a832efdc Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Tue, 4 Aug 2020 16:04:58 -0400 Subject: [PATCH 01/10] Add LSA component --- evalml/pipelines/components/__init__.py | 3 +- .../components/transformers/__init__.py | 2 +- .../transformers/preprocessing/__init__.py | 1 + .../transformers/preprocessing/lsa.py | 86 ++++++++++++ evalml/tests/component_tests/test_lsa.py | 128 ++++++++++++++++++ 5 files changed, 218 insertions(+), 2 deletions(-) create mode 100644 evalml/pipelines/components/transformers/preprocessing/lsa.py create mode 100644 evalml/tests/component_tests/test_lsa.py diff --git a/evalml/pipelines/components/__init__.py b/evalml/pipelines/components/__init__.py index edf5e14a13..79f627cfa2 100644 --- a/evalml/pipelines/components/__init__.py +++ b/evalml/pipelines/components/__init__.py @@ -32,5 +32,6 @@ DropNullColumns, DateTimeFeaturizer, SelectColumns, - TextFeaturizer + TextFeaturizer, + LSA ) diff --git a/evalml/pipelines/components/transformers/__init__.py b/evalml/pipelines/components/transformers/__init__.py index 81f20cb123..7fa863f738 100644 --- a/evalml/pipelines/components/transformers/__init__.py +++ b/evalml/pipelines/components/transformers/__init__.py @@ -5,4 +5,4 @@ from .imputers import PerColumnImputer, SimpleImputer, Imputer from .scalers import StandardScaler from .column_selectors import DropColumns, SelectColumns -from .preprocessing import DateTimeFeaturizer, DropNullColumns, TextFeaturizer +from .preprocessing import DateTimeFeaturizer, DropNullColumns, LSA, TextFeaturizer diff --git a/evalml/pipelines/components/transformers/preprocessing/__init__.py b/evalml/pipelines/components/transformers/preprocessing/__init__.py index a680656d10..2ad5a2f9dc 100644 --- a/evalml/pipelines/components/transformers/preprocessing/__init__.py +++ b/evalml/pipelines/components/transformers/preprocessing/__init__.py @@ -1,4 +1,5 @@ # flake8:noqa from .datetime_featurizer import DateTimeFeaturizer from .drop_null_columns import DropNullColumns +from .lsa import LSA from .text_featurizer import TextFeaturizer diff --git a/evalml/pipelines/components/transformers/preprocessing/lsa.py b/evalml/pipelines/components/transformers/preprocessing/lsa.py new file mode 100644 index 0000000000..a0d2dd0b5e --- /dev/null +++ b/evalml/pipelines/components/transformers/preprocessing/lsa.py @@ -0,0 +1,86 @@ +import warnings + +import pandas as pd +from sklearn.decomposition import TruncatedSVD +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.pipeline import make_pipeline + +from evalml.pipelines.components.transformers import Transformer + + +class LSA(Transformer): + """Transformer to calculate the Latent Semantic Analysis Values of text input""" + name = "LSA Transformer" + hyperparameter_ranges = {} + + def __init__(self, text_columns=None, random_state=0, **kwargs): + """Initalizes an transformer to perform TF-IDF transformation and Singular Value Decomposition. + + Arguments: + training_corpus(iterable): The collection of documents to fit this component on. Any iterable + that yields str or unicode objects can be passed in, the simplest format being a 1-dimensional + list, numpy array, or pandas Series. If no document is passed in, the component will be trained + on (nltk's brown sentence corpus.)[https://www.nltk.org/book/ch02.html#brown-corpus]. + random_state(int): A seed for the random state. 
+ """ + text_columns = text_columns or [] + parameters = {'text_columns': text_columns} + parameters.update(kwargs) + + if len(text_columns) == 0: + warnings.warn("No text columns were given to LSA, component will have no effect", RuntimeWarning) + for i, col_name in enumerate(text_columns): + if not isinstance(col_name, str): + text_columns[i] = str(col_name) + self.text_col_names = text_columns + self.lsa_pipeline = make_pipeline(TfidfVectorizer(), TruncatedSVD(random_state=random_state)) + super().__init__(parameters=parameters, + component_obj=None, + random_state=random_state) + + def _verify_col_names(self, col_names): + missing_cols = [] + for col in self.text_col_names: + if col not in col_names: + missing_cols.append(col) + + if len(missing_cols) > 0: + if len(missing_cols) == len(self.text_col_names): + raise RuntimeError("None of the provided text column names match the columns in the given DataFrame") + for col in missing_cols: + self.text_col_names.remove(col) + warnings.warn("Columns {} were not found in the given DataFrame, ignoring".format(missing_cols), RuntimeWarning) + + def fit(self, X, y=None): + if len(self.text_col_names) == 0: + return self + if not isinstance(X, pd.DataFrame): + X = pd.DataFrame(X).rename(columns=str) + self._verify_col_names(X.columns) + + corpus = [] + for col in self.text_col_names: + corpus.extend(X[col].values.tolist()) + + self.lsa_pipeline.fit(corpus) + return self + + def transform(self, X, y=None): + """Transforms data X by applying the LSA pipeline. + Arguments: + X (pd.DataFrame): Data to transform + y (pd.Series, optional): Targets + Returns: + pd.DataFrame: Transformed X + """ + if not isinstance(X, pd.DataFrame): + X = pd.DataFrame(X).rename(columns=str) + X_t = X + + for col in self.text_col_names: + transformed = self.lsa_pipeline.transform(X[col]) + X_t = X_t.drop(labels=col, axis=1) + + X_t['LSA({})[0]'.format(col)] = pd.Series(transformed[:, 0]) + X_t['LSA({})[1]'.format(col)] = pd.Series(transformed[:, 1]) + return X_t diff --git a/evalml/tests/component_tests/test_lsa.py b/evalml/tests/component_tests/test_lsa.py new file mode 100644 index 0000000000..cd39d93fb3 --- /dev/null +++ b/evalml/tests/component_tests/test_lsa.py @@ -0,0 +1,128 @@ +import numpy as np +import pandas as pd +import pytest + +from evalml.pipelines.components import LSA + + +@pytest.fixture() +def text_df(): + df = pd.DataFrame( + {'col_1': ['I\'m singing in the rain! Just singing in the rain, what a glorious feeling, I\'m happy again!', + 'In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.', + 'I\'m gonna be the main event, like no king was before! I\'m brushing up on looking down, I\'m working on my ROAR!'], + 'col_2': ['do you hear the people sing? 
Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!', + 'I dreamed a dream in days gone by, when hope was high and life worth living', + 'Red, the blood of angry men - black, the dark of ages past'] + }) + yield df + + +def test_lsa_only_text(text_df): + X = text_df + lsa = LSA(text_columns=['col_1', 'col_2']) + lsa.fit(X) + + expected_col_names = set(['LSA(col_1)[0]', + 'LSA(col_1)[1]', + 'LSA(col_2)[0]', + 'LSA(col_2)[1]']) + X_t = lsa.transform(X) + assert set(X_t.columns) == expected_col_names + assert len(X_t.columns) == 4 + assert X_t.dtypes.all() == np.float64 + + +def test_lsa_with_nontext(text_df): + X = text_df + X['col_3'] = [73.7, 67.213, 92] + lsa = LSA(text_columns=['col_1', 'col_2']) + + lsa.fit(X) + expected_col_names = set(['LSA(col_1)[0]', + 'LSA(col_1)[1]', + 'LSA(col_2)[0]', + 'LSA(col_2)[1]', + 'col_3']) + X_t = lsa.transform(X) + assert set(X_t.columns) == expected_col_names + assert len(X_t.columns) == 5 + assert X_t.dtypes.all() == np.float64 + + +def test_featurizer_no_text(): + X = pd.DataFrame({'col_1': [1, 2, 3], 'col_2': [4, 5, 6]}) + warn_msg = "No text columns were given to LSA, component will have no effect" + with pytest.warns(RuntimeWarning, match=warn_msg): + lsa = LSA() + + lsa.fit(X) + X_t = lsa.transform(X) + assert len(X_t.columns) == 2 + + +def test_some_missing_col_names(text_df): + X = text_df + lsa = LSA(text_columns=['col_1', 'col_2', 'col_3']) + + with pytest.warns(RuntimeWarning, match="not found in the given DataFrame"): + lsa.fit(X) + + expected_col_names = set(['LSA(col_1)[0]', + 'LSA(col_1)[1]', + 'LSA(col_2)[0]', + 'LSA(col_2)[1]']) + X_t = lsa.transform(X) + assert set(X_t.columns) == expected_col_names + assert len(X_t.columns) == 4 + assert X_t.dtypes.all() == np.float64 + + +def test_all_missing_col_names(text_df): + X = text_df + lsa = LSA(text_columns=['col_3', 'col_4']) + + error_msg = "None of the provided text column names match the columns in the given DataFrame" + with pytest.raises(RuntimeError, match=error_msg): + lsa.fit(X) + + +def test_empty_text_column(): + X = pd.DataFrame({'col_1': []}) + lsa = LSA(text_columns=['col_1']) + with pytest.raises(ValueError, match="empty vocabulary"): + lsa.fit(X) + + +def test_index_col_names(): + X = np.array([['I\'m singing in the rain!$%^ do do do do do da do', 'do you hear the people sing?////////////////////////////////////'], + ['just singing in the rain.................. \n', 'singing the songs of angry men\n'], + ['\t\n\n\n\nWhat a glorious feelinggggggggggg, I\'m happy again!!! lalalalalalalalalalala', '\tIt is the music of a people who will NOT be slaves again!!!!!!!!!!!']]) + lsa = LSA(text_columns=[0, 1]) + + lsa.fit(X) + expected_col_names = set(['LSA(0)[0]', + 'LSA(0)[1]', + 'LSA(1)[0]', + 'LSA(1)[1]']) + X_t = lsa.transform(X) + assert set(X_t.columns) == expected_col_names + assert len(X_t.columns) == 4 + assert X_t.dtypes.all() == np.float64 + + +def test_lsa_output(): + X = pd.DataFrame( + {'lsa': ['do you hear the people sing? 
Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!', + 'I dreamed a dream in days gone by, when hope was high and life worth living', + 'Red, the blood of angry men - black, the dark of ages past']}) + lsa = LSA(text_columns=['lsa']) + lsa.fit(X) + + expected_features = [[0.832, 0.], + [0., 1.], + [0.832, 0.]] + X_t = lsa.transform(X) + cols = [col for col in X_t.columns if 'LSA' in col] + features = X_t[cols] + np.testing.assert_almost_equal(features, expected_features, decimal=3) From da832057cd5fdf0b4bd85fcc0420dfb12559f479 Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Wed, 5 Aug 2020 08:36:43 -0400 Subject: [PATCH 02/10] Integrate LSA component into TextFeaturizer --- evalml/pipelines/components/__init__.py | 2 +- .../transformers/preprocessing/text_featurizer.py | 8 ++++++-- evalml/tests/component_tests/test_text_featurizer.py | 6 +++--- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/evalml/pipelines/components/__init__.py b/evalml/pipelines/components/__init__.py index 79f627cfa2..b42de8ebe6 100644 --- a/evalml/pipelines/components/__init__.py +++ b/evalml/pipelines/components/__init__.py @@ -33,5 +33,5 @@ DateTimeFeaturizer, SelectColumns, TextFeaturizer, - LSA + LSA, ) diff --git a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py index ce244062bf..7e1bedc118 100644 --- a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py @@ -4,6 +4,7 @@ import pandas as pd from evalml.pipelines.components.transformers import Transformer +from evalml.pipelines.components.transformers.preprocessing import LSA from evalml.utils import import_or_raise @@ -34,6 +35,7 @@ def __init__(self, text_columns=None, random_state=0, **kwargs): text_columns[i] = str(col_name) self.text_col_names = text_columns self._features = None + self._lsa = LSA(text_columns=text_columns, random_state=random_state) super().__init__(parameters=parameters, component_obj=None, random_state=random_state) @@ -83,11 +85,11 @@ def fit(self, X, y=None): es.df = self._clean_text(X) trans = [self._nlp_primitives.DiversityScore, - self._nlp_primitives.LSA, self._nlp_primitives.MeanCharactersPerWord, self._nlp_primitives.PartOfSpeechCount, self._nlp_primitives.PolarityScore] + self._lsa.fit(X) self._features = self._ft.dfs(entityset=es, target_entity='X', trans_primitives=trans, @@ -112,6 +114,8 @@ def transform(self, X, y=None): self._verify_col_names(X.columns) X_text = X[self.text_col_names] + X_lsa = self._lsa.transform(X_text) + X_text['index'] = range(len(X_text)) X_t = X.drop(self.text_col_names, axis=1) @@ -123,5 +127,5 @@ def transform(self, X, y=None): feature_matrix = self._ft.calculate_feature_matrix(features=self._features, entityset=es, verbose=True) - X_t = pd.concat([X_t, feature_matrix.reindex(X.index)], axis=1) + X_t = pd.concat([X_t, feature_matrix.reindex(X.index), X_lsa], axis=1) return X_t diff --git a/evalml/tests/component_tests/test_text_featurizer.py b/evalml/tests/component_tests/test_text_featurizer.py index ed24a497bd..a6e18b9c9f 100644 --- a/evalml/tests/component_tests/test_text_featurizer.py +++ b/evalml/tests/component_tests/test_text_featurizer.py @@ -182,9 +182,9 @@ def test_lsa_primitive_output(): tf = TextFeaturizer(text_columns=['lsa']) tf.fit(X) - expected_features = [[0.0200961, 0.002976], - [0.0223392, 0.0058817], - [0.0186072, 
-0.0006121]] + expected_features = [[0.832, 0.], + [0., 1.], + [0.832, 0.]] X_t = tf.transform(X) cols = [col for col in X_t.columns if 'LSA' in col] features = X_t[cols] From 19c355edafd84250b920a1c62c4f1afb52b5ca63 Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Wed, 5 Aug 2020 10:30:53 -0400 Subject: [PATCH 03/10] Standardize LSA transform output --- .../components/transformers/preprocessing/lsa.py | 10 +++++++--- evalml/tests/component_tests/test_utils.py | 4 ++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/lsa.py b/evalml/pipelines/components/transformers/preprocessing/lsa.py index a0d2dd0b5e..b8b136fbb9 100644 --- a/evalml/pipelines/components/transformers/preprocessing/lsa.py +++ b/evalml/pipelines/components/transformers/preprocessing/lsa.py @@ -74,12 +74,16 @@ def transform(self, X, y=None): pd.DataFrame: Transformed X """ if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X).rename(columns=str) + X = pd.DataFrame(X) X_t = X for col in self.text_col_names: - transformed = self.lsa_pipeline.transform(X[col]) - X_t = X_t.drop(labels=col, axis=1) + try: + transformed = self.lsa_pipeline.transform(X[col]) + X_t = X_t.drop(labels=col, axis=1) + except KeyError: + transformed = self.lsa_pipeline.transform(X[int(col)]) + X_t = X_t.drop(labels=int(col), axis=1) X_t['LSA({})[0]'.format(col)] = pd.Series(transformed[:, 0]) X_t['LSA({})[1]'.format(col)] = pd.Series(transformed[:, 1]) diff --git a/evalml/tests/component_tests/test_utils.py b/evalml/tests/component_tests/test_utils.py index bf960e5141..5a7294df6b 100644 --- a/evalml/tests/component_tests/test_utils.py +++ b/evalml/tests/component_tests/test_utils.py @@ -12,9 +12,9 @@ def test_all_components(has_minimal_dependencies): if has_minimal_dependencies: - assert len(all_components) == 22 + assert len(all_components) == 23 else: - assert len(all_components) == 26 + assert len(all_components) == 27 def test_handle_component_class_names(): From 6d1b5c80d00ca36d38056193bc5ff163b0e833a9 Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Wed, 5 Aug 2020 10:40:11 -0400 Subject: [PATCH 04/10] Update release notes --- docs/source/release_notes.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index d30dd078f6..ea582b6402 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -3,7 +3,10 @@ Release Notes **Future Releases** * Enhancements + * Added new LSA component for text featurization :pr:`1022` * Fixes + * Updated TextFeaturizer component to no longer require an internet connection to run :pr:`1022` + * Fixed non-deterministic element of TextFeaturizer transformations :pr:`1022` * Changes * Documentation Changes * Testing Changes From c9fe512dd2934d06d452baaa898967597454d8d5 Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Wed, 5 Aug 2020 10:53:34 -0400 Subject: [PATCH 05/10] Fix outdated docstring --- .../components/transformers/preprocessing/lsa.py | 9 +++------ evalml/tests/component_tests/test_lsa.py | 2 +- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/lsa.py b/evalml/pipelines/components/transformers/preprocessing/lsa.py index b8b136fbb9..93fba8972a 100644 --- a/evalml/pipelines/components/transformers/preprocessing/lsa.py +++ b/evalml/pipelines/components/transformers/preprocessing/lsa.py @@ -14,14 +14,11 @@ class LSA(Transformer): hyperparameter_ranges = {} def __init__(self, 
text_columns=None, random_state=0, **kwargs): - """Initalizes an transformer to perform TF-IDF transformation and Singular Value Decomposition. + """Creates a transformer to perform TF-IDF transformation and Singular Value Decomposition for text columns. Arguments: - training_corpus(iterable): The collection of documents to fit this component on. Any iterable - that yields str or unicode objects can be passed in, the simplest format being a 1-dimensional - list, numpy array, or pandas Series. If no document is passed in, the component will be trained - on (nltk's brown sentence corpus.)[https://www.nltk.org/book/ch02.html#brown-corpus]. - random_state(int): A seed for the random state. + text_colums (list): list of `pd.DataFrame` column names that contain text. + random_state (int, np.random.RandomState): Seed for the random number generator. """ text_columns = text_columns or [] parameters = {'text_columns': text_columns} diff --git a/evalml/tests/component_tests/test_lsa.py b/evalml/tests/component_tests/test_lsa.py index cd39d93fb3..c41ceac330 100644 --- a/evalml/tests/component_tests/test_lsa.py +++ b/evalml/tests/component_tests/test_lsa.py @@ -50,7 +50,7 @@ def test_lsa_with_nontext(text_df): assert X_t.dtypes.all() == np.float64 -def test_featurizer_no_text(): +def test_lsa_no_text(): X = pd.DataFrame({'col_1': [1, 2, 3], 'col_2': [4, 5, 6]}) warn_msg = "No text columns were given to LSA, component will have no effect" with pytest.warns(RuntimeWarning, match=warn_msg): From 9bf7427e5082f5527188e4a8d0297a34564953e6 Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Thu, 6 Aug 2020 09:55:57 -0400 Subject: [PATCH 06/10] Remove runtime warnings from init functions --- .../components/transformers/preprocessing/lsa.py | 3 +-- .../transformers/preprocessing/text_featurizer.py | 3 +-- evalml/tests/component_tests/test_lsa.py | 8 ++++---- evalml/tests/component_tests/test_text_featurizer.py | 8 ++++---- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/lsa.py b/evalml/pipelines/components/transformers/preprocessing/lsa.py index 93fba8972a..eefb4a1c26 100644 --- a/evalml/pipelines/components/transformers/preprocessing/lsa.py +++ b/evalml/pipelines/components/transformers/preprocessing/lsa.py @@ -24,8 +24,6 @@ def __init__(self, text_columns=None, random_state=0, **kwargs): parameters = {'text_columns': text_columns} parameters.update(kwargs) - if len(text_columns) == 0: - warnings.warn("No text columns were given to LSA, component will have no effect", RuntimeWarning) for i, col_name in enumerate(text_columns): if not isinstance(col_name, str): text_columns[i] = str(col_name) @@ -50,6 +48,7 @@ def _verify_col_names(self, col_names): def fit(self, X, y=None): if len(self.text_col_names) == 0: + warnings.warn("No text columns were given to LSA, component has no effect", RuntimeWarning) return self if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X).rename(columns=str) diff --git a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py index 7e1bedc118..7663053509 100644 --- a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py @@ -28,8 +28,6 @@ def __init__(self, text_columns=None, random_state=0, **kwargs): parameters = {'text_columns': text_columns} parameters.update(kwargs) - if len(text_columns) == 0: - warnings.warn("No 
text columns were given to TextFeaturizer, component will have no effect", RuntimeWarning) for i, col_name in enumerate(text_columns): if not isinstance(col_name, str): text_columns[i] = str(col_name) @@ -71,6 +69,7 @@ def _verify_col_types(self, entity_set): def fit(self, X, y=None): if len(self.text_col_names) == 0: + warnings.warn("No text columns were given to TextFeaturizer, component has no effect", RuntimeWarning) self._features = [] return self if not isinstance(X, pd.DataFrame): diff --git a/evalml/tests/component_tests/test_lsa.py b/evalml/tests/component_tests/test_lsa.py index c41ceac330..c6aaec9c7d 100644 --- a/evalml/tests/component_tests/test_lsa.py +++ b/evalml/tests/component_tests/test_lsa.py @@ -52,11 +52,11 @@ def test_lsa_with_nontext(text_df): def test_lsa_no_text(): X = pd.DataFrame({'col_1': [1, 2, 3], 'col_2': [4, 5, 6]}) - warn_msg = "No text columns were given to LSA, component will have no effect" - with pytest.warns(RuntimeWarning, match=warn_msg): - lsa = LSA() + warn_msg = "No text columns were given to LSA, component has no effect" + lsa = LSA() - lsa.fit(X) + with pytest.warns(RuntimeWarning, match=warn_msg): + lsa.fit(X) X_t = lsa.transform(X) assert len(X_t.columns) == 2 diff --git a/evalml/tests/component_tests/test_text_featurizer.py b/evalml/tests/component_tests/test_text_featurizer.py index a6e18b9c9f..0aee8a5a8f 100644 --- a/evalml/tests/component_tests/test_text_featurizer.py +++ b/evalml/tests/component_tests/test_text_featurizer.py @@ -73,11 +73,11 @@ def test_featurizer_with_nontext(text_df): def test_featurizer_no_text(): X = pd.DataFrame({'col_1': [1, 2, 3], 'col_2': [4, 5, 6]}) - warn_msg = "No text columns were given to TextFeaturizer, component will have no effect" - with pytest.warns(RuntimeWarning, match=warn_msg): - tf = TextFeaturizer() + warn_msg = "No text columns were given to TextFeaturizer, component has no effect" + tf = TextFeaturizer() - tf.fit(X) + with pytest.warns(RuntimeWarning, match=warn_msg): + tf.fit(X) X_t = tf.transform(X) assert len(X_t.columns) == 2 From cb5617ae97135c1b59a1f81595821fae3297a1d7 Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Thu, 6 Aug 2020 15:34:04 -0400 Subject: [PATCH 07/10] Clean up unnecessary code --- .../transformers/preprocessing/lsa.py | 20 +++++++------------ .../preprocessing/text_featurizer.py | 12 +++-------- 2 files changed, 10 insertions(+), 22 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/lsa.py b/evalml/pipelines/components/transformers/preprocessing/lsa.py index eefb4a1c26..3116c6d211 100644 --- a/evalml/pipelines/components/transformers/preprocessing/lsa.py +++ b/evalml/pipelines/components/transformers/preprocessing/lsa.py @@ -20,24 +20,18 @@ def __init__(self, text_columns=None, random_state=0, **kwargs): text_colums (list): list of `pd.DataFrame` column names that contain text. random_state (int, np.random.RandomState): Seed for the random number generator. 
""" - text_columns = text_columns or [] parameters = {'text_columns': text_columns} + text_columns = text_columns or [] parameters.update(kwargs) - for i, col_name in enumerate(text_columns): - if not isinstance(col_name, str): - text_columns[i] = str(col_name) - self.text_col_names = text_columns - self.lsa_pipeline = make_pipeline(TfidfVectorizer(), TruncatedSVD(random_state=random_state)) + self.text_col_names = [str(col_name) for col_name in text_columns] + self._lsa_pipeline = make_pipeline(TfidfVectorizer(), TruncatedSVD(random_state=random_state)) super().__init__(parameters=parameters, component_obj=None, random_state=random_state) def _verify_col_names(self, col_names): - missing_cols = [] - for col in self.text_col_names: - if col not in col_names: - missing_cols.append(col) + missing_cols = [col for col in self.text_col_names if col not in col_names] if len(missing_cols) > 0: if len(missing_cols) == len(self.text_col_names): @@ -58,7 +52,7 @@ def fit(self, X, y=None): for col in self.text_col_names: corpus.extend(X[col].values.tolist()) - self.lsa_pipeline.fit(corpus) + self._lsa_pipeline.fit(corpus) return self def transform(self, X, y=None): @@ -75,10 +69,10 @@ def transform(self, X, y=None): for col in self.text_col_names: try: - transformed = self.lsa_pipeline.transform(X[col]) + transformed = self._lsa_pipeline.transform(X[col]) X_t = X_t.drop(labels=col, axis=1) except KeyError: - transformed = self.lsa_pipeline.transform(X[int(col)]) + transformed = self._lsa_pipeline.transform(X[int(col)]) X_t = X_t.drop(labels=int(col), axis=1) X_t['LSA({})[0]'.format(col)] = pd.Series(transformed[:, 0]) diff --git a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py index 7663053509..639995fb8f 100644 --- a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py @@ -24,14 +24,11 @@ def __init__(self, text_columns=None, random_state=0, **kwargs): self._ft = import_or_raise("featuretools", error_msg="Package featuretools is not installed. Please install using `pip install featuretools[nlp_primitives].`") self._nlp_primitives = import_or_raise("nlp_primitives", error_msg="Package nlp_primitives is not installed. 
Please install using `pip install featuretools[nlp_primitives].`") - text_columns = text_columns or [] parameters = {'text_columns': text_columns} + text_columns = text_columns or [] parameters.update(kwargs) - for i, col_name in enumerate(text_columns): - if not isinstance(col_name, str): - text_columns[i] = str(col_name) - self.text_col_names = text_columns + self.text_col_names = [str(col_name) for col_name in text_columns] self._features = None self._lsa = LSA(text_columns=text_columns, random_state=random_state) super().__init__(parameters=parameters, @@ -49,10 +46,7 @@ def normalize(text): return X def _verify_col_names(self, col_names): - missing_cols = [] - for col in self.text_col_names: - if col not in col_names: - missing_cols.append(col) + missing_cols = [col for col in self.text_col_names if col not in col_names] if len(missing_cols) > 0: if len(missing_cols) == len(self.text_col_names): From a1d9a98b76158836f4321b26a26332a1622cd9be Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Fri, 7 Aug 2020 13:45:25 -0400 Subject: [PATCH 08/10] Address PR comments --- .../transformers/preprocessing/lsa.py | 20 +++++++--------- .../preprocessing/text_featurizer.py | 23 +++++++++---------- evalml/tests/component_tests/test_lsa.py | 5 +--- .../component_tests/test_text_featurizer.py | 5 +--- 4 files changed, 21 insertions(+), 32 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/lsa.py b/evalml/pipelines/components/transformers/preprocessing/lsa.py index 3116c6d211..8dba3fca86 100644 --- a/evalml/pipelines/components/transformers/preprocessing/lsa.py +++ b/evalml/pipelines/components/transformers/preprocessing/lsa.py @@ -17,41 +17,37 @@ def __init__(self, text_columns=None, random_state=0, **kwargs): """Creates a transformer to perform TF-IDF transformation and Singular Value Decomposition for text columns. Arguments: - text_colums (list): list of `pd.DataFrame` column names that contain text. + text_columns (list): list of feature names which should be treated as text features. random_state (int, np.random.RandomState): Seed for the random number generator. 
""" parameters = {'text_columns': text_columns} text_columns = text_columns or [] parameters.update(kwargs) - self.text_col_names = [str(col_name) for col_name in text_columns] + self._text_col_names = [str(col_name) for col_name in text_columns] self._lsa_pipeline = make_pipeline(TfidfVectorizer(), TruncatedSVD(random_state=random_state)) super().__init__(parameters=parameters, component_obj=None, random_state=random_state) def _verify_col_names(self, col_names): - missing_cols = [col for col in self.text_col_names if col not in col_names] + missing_cols = [col for col in self._text_col_names if col not in col_names] if len(missing_cols) > 0: - if len(missing_cols) == len(self.text_col_names): + if len(missing_cols) == len(self._text_col_names): raise RuntimeError("None of the provided text column names match the columns in the given DataFrame") for col in missing_cols: - self.text_col_names.remove(col) + self._text_col_names.remove(col) warnings.warn("Columns {} were not found in the given DataFrame, ignoring".format(missing_cols), RuntimeWarning) def fit(self, X, y=None): - if len(self.text_col_names) == 0: - warnings.warn("No text columns were given to LSA, component has no effect", RuntimeWarning) + if len(self._text_col_names) == 0: return self if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X).rename(columns=str) self._verify_col_names(X.columns) - corpus = [] - for col in self.text_col_names: - corpus.extend(X[col].values.tolist()) - + corpus = X[self._text_col_names].values.flatten() self._lsa_pipeline.fit(corpus) return self @@ -67,7 +63,7 @@ def transform(self, X, y=None): X = pd.DataFrame(X) X_t = X - for col in self.text_col_names: + for col in self._text_col_names: try: transformed = self._lsa_pipeline.transform(X[col]) X_t = X_t.drop(labels=col, axis=1) diff --git a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py index 639995fb8f..e4dcccb92a 100644 --- a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py @@ -17,7 +17,7 @@ def __init__(self, text_columns=None, random_state=0, **kwargs): """Extracts features from text columns using featuretools' nlp_primitives Arguments: - text_colums (list): list of `pd.DataFrame` column names that contain text. + text_columns (list): list of feature names which should be treated as text features. random_state (int, np.random.RandomState): Seed for the random number generator. 
""" @@ -28,7 +28,7 @@ def __init__(self, text_columns=None, random_state=0, **kwargs): text_columns = text_columns or [] parameters.update(kwargs) - self.text_col_names = [str(col_name) for col_name in text_columns] + self._text_col_names = [str(col_name) for col_name in text_columns] self._features = None self._lsa = LSA(text_columns=text_columns, random_state=random_state) super().__init__(parameters=parameters, @@ -41,35 +41,34 @@ def normalize(text): text = text.translate(str.maketrans('', '', string.punctuation)) return text.lower() - for text_col in self.text_col_names: + for text_col in self._text_col_names: X[text_col] = X[text_col].apply(normalize) return X def _verify_col_names(self, col_names): - missing_cols = [col for col in self.text_col_names if col not in col_names] + missing_cols = [col for col in self._text_col_names if col not in col_names] if len(missing_cols) > 0: - if len(missing_cols) == len(self.text_col_names): + if len(missing_cols) == len(self._text_col_names): raise RuntimeError("None of the provided text column names match the columns in the given DataFrame") for col in missing_cols: - self.text_col_names.remove(col) + self._text_col_names.remove(col) warnings.warn("Columns {} were not found in the given DataFrame, ignoring".format(missing_cols), RuntimeWarning) def _verify_col_types(self, entity_set): var_types = entity_set.entities[0].variable_types - for col in self.text_col_names: + for col in self._text_col_names: if var_types[col] is not self._ft.variable_types.variable.Text: raise ValueError("Column {} is not a text column, cannot apply TextFeaturizer component".format(col)) def fit(self, X, y=None): - if len(self.text_col_names) == 0: - warnings.warn("No text columns were given to TextFeaturizer, component has no effect", RuntimeWarning) + if len(self._text_col_names) == 0: self._features = [] return self if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X).rename(columns=str) self._verify_col_names(X.columns) - X_text = X[self.text_col_names] + X_text = X[self._text_col_names] X_text['index'] = range(len(X_text)) es = self._ft.EntitySet() @@ -106,11 +105,11 @@ def transform(self, X, y=None): X = X.rename(columns=str) self._verify_col_names(X.columns) - X_text = X[self.text_col_names] + X_text = X[self._text_col_names] X_lsa = self._lsa.transform(X_text) X_text['index'] = range(len(X_text)) - X_t = X.drop(self.text_col_names, axis=1) + X_t = X.drop(self._text_col_names, axis=1) es = self._ft.EntitySet() es = es.entity_from_dataframe(entity_id='X', dataframe=X_text, index='index') diff --git a/evalml/tests/component_tests/test_lsa.py b/evalml/tests/component_tests/test_lsa.py index c6aaec9c7d..1540bc31e0 100644 --- a/evalml/tests/component_tests/test_lsa.py +++ b/evalml/tests/component_tests/test_lsa.py @@ -52,11 +52,8 @@ def test_lsa_with_nontext(text_df): def test_lsa_no_text(): X = pd.DataFrame({'col_1': [1, 2, 3], 'col_2': [4, 5, 6]}) - warn_msg = "No text columns were given to LSA, component has no effect" lsa = LSA() - - with pytest.warns(RuntimeWarning, match=warn_msg): - lsa.fit(X) + lsa.fit(X) X_t = lsa.transform(X) assert len(X_t.columns) == 2 diff --git a/evalml/tests/component_tests/test_text_featurizer.py b/evalml/tests/component_tests/test_text_featurizer.py index 0aee8a5a8f..68f8728646 100644 --- a/evalml/tests/component_tests/test_text_featurizer.py +++ b/evalml/tests/component_tests/test_text_featurizer.py @@ -73,11 +73,8 @@ def test_featurizer_with_nontext(text_df): def test_featurizer_no_text(): X = pd.DataFrame({'col_1': [1, 2, 3], 
'col_2': [4, 5, 6]}) - warn_msg = "No text columns were given to TextFeaturizer, component has no effect" tf = TextFeaturizer() - - with pytest.warns(RuntimeWarning, match=warn_msg): - tf.fit(X) + tf.fit(X) X_t = tf.transform(X) assert len(X_t.columns) == 2 From e383731644328d16d24048327bd18c1fdc1d2785 Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Mon, 10 Aug 2020 10:36:19 -0400 Subject: [PATCH 09/10] Raise warnings using logger instead of warnings package --- .../components/transformers/preprocessing/lsa.py | 9 ++++++--- .../transformers/preprocessing/text_featurizer.py | 6 ++++-- evalml/tests/component_tests/test_lsa.py | 7 +++++-- evalml/tests/component_tests/test_text_featurizer.py | 7 +++++-- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/lsa.py b/evalml/pipelines/components/transformers/preprocessing/lsa.py index 8dba3fca86..028f615671 100644 --- a/evalml/pipelines/components/transformers/preprocessing/lsa.py +++ b/evalml/pipelines/components/transformers/preprocessing/lsa.py @@ -1,4 +1,4 @@ -import warnings +import logging import pandas as pd from sklearn.decomposition import TruncatedSVD @@ -7,6 +7,8 @@ from evalml.pipelines.components.transformers import Transformer +logger = logging.getLogger() + class LSA(Transformer): """Transformer to calculate the Latent Semantic Analysis Values of text input""" @@ -38,7 +40,7 @@ def _verify_col_names(self, col_names): raise RuntimeError("None of the provided text column names match the columns in the given DataFrame") for col in missing_cols: self._text_col_names.remove(col) - warnings.warn("Columns {} were not found in the given DataFrame, ignoring".format(missing_cols), RuntimeWarning) + logger.warn("Columns {} were not found in the given DataFrame, ignoring".format(missing_cols)) def fit(self, X, y=None): if len(self._text_col_names) == 0: @@ -57,7 +59,8 @@ def transform(self, X, y=None): X (pd.DataFrame): Data to transform y (pd.Series, optional): Targets Returns: - pd.DataFrame: Transformed X + pd.DataFrame: Transformed X. The original column is removed and replaced with two columns of the + format `LSA(original_column_name)[feature_number]`, where `feature_number` is 0 or 1. 
""" if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X) diff --git a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py index e4dcccb92a..22b7f7cc9a 100644 --- a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py @@ -1,5 +1,5 @@ +import logging import string -import warnings import pandas as pd @@ -7,6 +7,8 @@ from evalml.pipelines.components.transformers.preprocessing import LSA from evalml.utils import import_or_raise +logger = logging.getLogger() + class TextFeaturizer(Transformer): """Transformer that can automatically featurize text columns.""" @@ -53,7 +55,7 @@ def _verify_col_names(self, col_names): raise RuntimeError("None of the provided text column names match the columns in the given DataFrame") for col in missing_cols: self._text_col_names.remove(col) - warnings.warn("Columns {} were not found in the given DataFrame, ignoring".format(missing_cols), RuntimeWarning) + logger.warn("Columns {} were not found in the given DataFrame, ignoring".format(missing_cols)) def _verify_col_types(self, entity_set): var_types = entity_set.entities[0].variable_types diff --git a/evalml/tests/component_tests/test_lsa.py b/evalml/tests/component_tests/test_lsa.py index 1540bc31e0..927123c376 100644 --- a/evalml/tests/component_tests/test_lsa.py +++ b/evalml/tests/component_tests/test_lsa.py @@ -1,3 +1,5 @@ +import logging + import numpy as np import pandas as pd import pytest @@ -58,12 +60,13 @@ def test_lsa_no_text(): assert len(X_t.columns) == 2 -def test_some_missing_col_names(text_df): +def test_some_missing_col_names(text_df, caplog): X = text_df lsa = LSA(text_columns=['col_1', 'col_2', 'col_3']) - with pytest.warns(RuntimeWarning, match="not found in the given DataFrame"): + with caplog.at_level(logging.WARNING): lsa.fit(X) + assert "Columns ['col_3'] were not found in the given DataFrame, ignoring" in caplog.messages expected_col_names = set(['LSA(col_1)[0]', 'LSA(col_1)[1]', diff --git a/evalml/tests/component_tests/test_text_featurizer.py b/evalml/tests/component_tests/test_text_featurizer.py index 68f8728646..5cf50f00ef 100644 --- a/evalml/tests/component_tests/test_text_featurizer.py +++ b/evalml/tests/component_tests/test_text_featurizer.py @@ -1,3 +1,5 @@ +import logging + import numpy as np import pandas as pd import pytest @@ -79,12 +81,13 @@ def test_featurizer_no_text(): assert len(X_t.columns) == 2 -def test_some_missing_col_names(text_df): +def test_some_missing_col_names(text_df, caplog): X = text_df tf = TextFeaturizer(text_columns=['col_1', 'col_2', 'col_3']) - with pytest.warns(RuntimeWarning, match="not found in the given DataFrame"): + with caplog.at_level(logging.WARNING): tf.fit(X) + assert "Columns ['col_3'] were not found in the given DataFrame, ignoring" in caplog.messages expected_col_names = set(['DIVERSITY_SCORE(col_1)', 'DIVERSITY_SCORE(col_2)', From 0ad6b0ae8894d69f9119d653f907c08ebf73564d Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Mon, 10 Aug 2020 16:07:38 -0400 Subject: [PATCH 10/10] PR comments --- .../transformers/preprocessing/lsa.py | 12 +++---- .../preprocessing/text_featurizer.py | 11 +++--- evalml/tests/component_tests/test_lsa.py | 35 +++++++++++++++++++ .../component_tests/test_text_featurizer.py | 30 ++++++++++++++++ 4 files changed, 74 insertions(+), 14 deletions(-) diff --git 
a/evalml/pipelines/components/transformers/preprocessing/lsa.py b/evalml/pipelines/components/transformers/preprocessing/lsa.py index 028f615671..1c4df0c215 100644 --- a/evalml/pipelines/components/transformers/preprocessing/lsa.py +++ b/evalml/pipelines/components/transformers/preprocessing/lsa.py @@ -26,7 +26,7 @@ def __init__(self, text_columns=None, random_state=0, **kwargs): text_columns = text_columns or [] parameters.update(kwargs) - self._text_col_names = [str(col_name) for col_name in text_columns] + self._text_col_names = text_columns self._lsa_pipeline = make_pipeline(TfidfVectorizer(), TruncatedSVD(random_state=random_state)) super().__init__(parameters=parameters, component_obj=None, @@ -46,7 +46,7 @@ def fit(self, X, y=None): if len(self._text_col_names) == 0: return self if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X).rename(columns=str) + X = pd.DataFrame(X) self._verify_col_names(X.columns) corpus = X[self._text_col_names].values.flatten() @@ -67,13 +67,9 @@ def transform(self, X, y=None): X_t = X for col in self._text_col_names: - try: - transformed = self._lsa_pipeline.transform(X[col]) - X_t = X_t.drop(labels=col, axis=1) - except KeyError: - transformed = self._lsa_pipeline.transform(X[int(col)]) - X_t = X_t.drop(labels=int(col), axis=1) + transformed = self._lsa_pipeline.transform(X[col]) X_t['LSA({})[0]'.format(col)] = pd.Series(transformed[:, 0]) X_t['LSA({})[1]'.format(col)] = pd.Series(transformed[:, 1]) + X_t = X_t.drop(columns=self._text_col_names) return X_t diff --git a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py index 22b7f7cc9a..4fa2c7e572 100644 --- a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py @@ -30,9 +30,9 @@ def __init__(self, text_columns=None, random_state=0, **kwargs): text_columns = text_columns or [] parameters.update(kwargs) - self._text_col_names = [str(col_name) for col_name in text_columns] self._features = None self._lsa = LSA(text_columns=text_columns, random_state=random_state) + self._text_col_names = text_columns super().__init__(parameters=parameters, component_obj=None, random_state=random_state) @@ -60,7 +60,7 @@ def _verify_col_names(self, col_names): def _verify_col_types(self, entity_set): var_types = entity_set.entities[0].variable_types for col in self._text_col_names: - if var_types[col] is not self._ft.variable_types.variable.Text: + if var_types[str(col)] is not self._ft.variable_types.variable.Text: raise ValueError("Column {} is not a text column, cannot apply TextFeaturizer component".format(col)) def fit(self, X, y=None): @@ -68,13 +68,13 @@ def fit(self, X, y=None): self._features = [] return self if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X).rename(columns=str) + X = pd.DataFrame(X) self._verify_col_names(X.columns) X_text = X[self._text_col_names] X_text['index'] = range(len(X_text)) es = self._ft.EntitySet() - es = es.entity_from_dataframe(entity_id='X', dataframe=X_text, index='index') + es = es.entity_from_dataframe(entity_id='X', dataframe=X_text.rename(columns=str), index='index') self._verify_col_types(es) es.df = self._clean_text(X) @@ -104,7 +104,6 @@ def transform(self, X, y=None): X = pd.DataFrame(X) if self._features is None or len(self._features) == 0: return X - X = X.rename(columns=str) self._verify_col_names(X.columns) X_text = X[self._text_col_names] @@ -114,7 +113,7 @@ 
def transform(self, X, y=None): X_t = X.drop(self._text_col_names, axis=1) es = self._ft.EntitySet() - es = es.entity_from_dataframe(entity_id='X', dataframe=X_text, index='index') + es = es.entity_from_dataframe(entity_id='X', dataframe=X_text.rename(columns=str), index='index') self._verify_col_types(es) es.df = self._clean_text(X) diff --git a/evalml/tests/component_tests/test_lsa.py b/evalml/tests/component_tests/test_lsa.py index 927123c376..afde0f0566 100644 --- a/evalml/tests/component_tests/test_lsa.py +++ b/evalml/tests/component_tests/test_lsa.py @@ -111,6 +111,41 @@ def test_index_col_names(): assert X_t.dtypes.all() == np.float64 +def test_int_col_names(): + X = pd.DataFrame( + {4.75: ['I\'m singing in the rain! Just singing in the rain, what a glorious feeling, I\'m happy again!', + 'In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.', + 'I\'m gonna be the main event, like no king was before! I\'m brushing up on looking down, I\'m working on my ROAR!'], + -1: ['do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!', + 'I dreamed a dream in days gone by, when hope was high and life worth living', + 'Red, the blood of angry men - black, the dark of ages past'] + }) + lsa = LSA(text_columns=[4.75, -1]) + lsa.fit(X) + expected_col_names = set(['LSA(4.75)[0]', + 'LSA(4.75)[1]', + 'LSA(-1)[0]', + 'LSA(-1)[1]']) + X_t = lsa.transform(X) + assert set(X_t.columns) == expected_col_names + assert len(X_t.columns) == 4 + assert X_t.dtypes.all() == np.float64 + + +def test_repeat_col_names(): + X = pd.DataFrame(data=np.array([['identical string one', 'identical string one'], + ['second double string', 'second double string'], + ['copy the third', 'copy the third']]), columns=['col_1', 'col_1']) + lsa = LSA(text_columns=['col_1', 'col_1']) + lsa.fit(X) + expected_col_names = ['LSA(col_1)[0]', + 'LSA(col_1)[1]'] + X_t = lsa.transform(X) + np.testing.assert_array_equal(X_t.columns, np.array(expected_col_names)) + assert len(X_t.columns) == 2 + assert X_t.dtypes.all() == np.float64 + + def test_lsa_output(): X = pd.DataFrame( {'lsa': ['do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!', diff --git a/evalml/tests/component_tests/test_text_featurizer.py b/evalml/tests/component_tests/test_text_featurizer.py index 5cf50f00ef..830dd89302 100644 --- a/evalml/tests/component_tests/test_text_featurizer.py +++ b/evalml/tests/component_tests/test_text_featurizer.py @@ -160,6 +160,36 @@ def test_index_col_names(): assert X_t.dtypes.all() == np.float64 +def test_int_col_names(): + X = pd.DataFrame( + {475: ['I\'m singing in the rain! Just singing in the rain, what a glorious feeling, I\'m happy again!', + 'In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.', + 'I\'m gonna be the main event, like no king was before! I\'m brushing up on looking down, I\'m working on my ROAR!'], + -1: ['do you hear the people sing? 
Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!', + 'I dreamed a dream in days gone by, when hope was high and life worth living', + 'Red, the blood of angry men - black, the dark of ages past'] + }) + tf = TextFeaturizer(text_columns=[475, -1]) + tf.fit(X) + expected_col_names = set(['DIVERSITY_SCORE(475)', + 'DIVERSITY_SCORE(-1)', + 'LSA(475)[0]', + 'LSA(475)[1]', + 'LSA(-1)[0]', + 'LSA(-1)[1]', + 'MEAN_CHARACTERS_PER_WORD(475)', + 'MEAN_CHARACTERS_PER_WORD(-1)', + 'POLARITY_SCORE(475)', + 'POLARITY_SCORE(-1)']) + for i in range(15): + expected_col_names.add(f'PART_OF_SPEECH_COUNT(475)[{i}]') + expected_col_names.add(f'PART_OF_SPEECH_COUNT(-1)[{i}]') + X_t = tf.transform(X) + assert set(X_t.columns) == expected_col_names + assert len(X_t.columns) == 40 + assert X_t.dtypes.all() == np.float64 + + def test_diversity_primitive_output(): X = pd.DataFrame( {'diverse': ['This is a very diverse string which does not contain any repeated words at all',
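# A minimal standalone sketch of the transformation the LSA component in this
# patch series wraps: TF-IDF vectorization followed by truncated SVD, reduced
# to two components per text column. The sample documents and the column name
# "text" are hypothetical and used only for illustration; in the component
# itself the same pipeline is fit once on the concatenated corpus of all text
# columns and then applied to each column separately.
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

docs = pd.Series([
    "do you hear the people sing? singing the songs of angry men",
    "I dreamed a dream in days gone by, when hope was high",
    "red, the blood of angry men - black, the dark of ages past",
])

# TruncatedSVD defaults to n_components=2, which is why the component emits
# exactly two features per column, named LSA(column)[0] and LSA(column)[1].
lsa_pipeline = make_pipeline(TfidfVectorizer(), TruncatedSVD(random_state=0))
transformed = lsa_pipeline.fit_transform(docs)

features = pd.DataFrame({
    "LSA(text)[0]": transformed[:, 0],
    "LSA(text)[1]": transformed[:, 1],
})
print(features)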