From 0ba5627109d00d29549b01de430a0783a832efdc Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Tue, 4 Aug 2020 16:04:58 -0400 Subject: [PATCH 01/10] Add LSA component --- evalml/pipelines/components/__init__.py | 3 +- .../components/transformers/__init__.py | 2 +- .../transformers/preprocessing/__init__.py | 1 + .../transformers/preprocessing/lsa.py | 86 ++++++++++++ evalml/tests/component_tests/test_lsa.py | 128 ++++++++++++++++++ 5 files changed, 218 insertions(+), 2 deletions(-) create mode 100644 evalml/pipelines/components/transformers/preprocessing/lsa.py create mode 100644 evalml/tests/component_tests/test_lsa.py diff --git a/evalml/pipelines/components/__init__.py b/evalml/pipelines/components/__init__.py index edf5e14a13..79f627cfa2 100644 --- a/evalml/pipelines/components/__init__.py +++ b/evalml/pipelines/components/__init__.py @@ -32,5 +32,6 @@ DropNullColumns, DateTimeFeaturizer, SelectColumns, - TextFeaturizer + TextFeaturizer, + LSA ) diff --git a/evalml/pipelines/components/transformers/__init__.py b/evalml/pipelines/components/transformers/__init__.py index 81f20cb123..7fa863f738 100644 --- a/evalml/pipelines/components/transformers/__init__.py +++ b/evalml/pipelines/components/transformers/__init__.py @@ -5,4 +5,4 @@ from .imputers import PerColumnImputer, SimpleImputer, Imputer from .scalers import StandardScaler from .column_selectors import DropColumns, SelectColumns -from .preprocessing import DateTimeFeaturizer, DropNullColumns, TextFeaturizer +from .preprocessing import DateTimeFeaturizer, DropNullColumns, LSA, TextFeaturizer diff --git a/evalml/pipelines/components/transformers/preprocessing/__init__.py b/evalml/pipelines/components/transformers/preprocessing/__init__.py index a680656d10..2ad5a2f9dc 100644 --- a/evalml/pipelines/components/transformers/preprocessing/__init__.py +++ b/evalml/pipelines/components/transformers/preprocessing/__init__.py @@ -1,4 +1,5 @@ # flake8:noqa from .datetime_featurizer import DateTimeFeaturizer from .drop_null_columns import DropNullColumns +from .lsa import LSA from .text_featurizer import TextFeaturizer diff --git a/evalml/pipelines/components/transformers/preprocessing/lsa.py b/evalml/pipelines/components/transformers/preprocessing/lsa.py new file mode 100644 index 0000000000..a0d2dd0b5e --- /dev/null +++ b/evalml/pipelines/components/transformers/preprocessing/lsa.py @@ -0,0 +1,86 @@ +import warnings + +import pandas as pd +from sklearn.decomposition import TruncatedSVD +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.pipeline import make_pipeline + +from evalml.pipelines.components.transformers import Transformer + + +class LSA(Transformer): + """Transformer to calculate the Latent Semantic Analysis Values of text input""" + name = "LSA Transformer" + hyperparameter_ranges = {} + + def __init__(self, text_columns=None, random_state=0, **kwargs): + """Initalizes an transformer to perform TF-IDF transformation and Singular Value Decomposition. + + Arguments: + training_corpus(iterable): The collection of documents to fit this component on. Any iterable + that yields str or unicode objects can be passed in, the simplest format being a 1-dimensional + list, numpy array, or pandas Series. If no document is passed in, the component will be trained + on (nltk's brown sentence corpus.)[https://www.nltk.org/book/ch02.html#brown-corpus]. + random_state(int): A seed for the random state. 
+ """ + text_columns = text_columns or [] + parameters = {'text_columns': text_columns} + parameters.update(kwargs) + + if len(text_columns) == 0: + warnings.warn("No text columns were given to LSA, component will have no effect", RuntimeWarning) + for i, col_name in enumerate(text_columns): + if not isinstance(col_name, str): + text_columns[i] = str(col_name) + self.text_col_names = text_columns + self.lsa_pipeline = make_pipeline(TfidfVectorizer(), TruncatedSVD(random_state=random_state)) + super().__init__(parameters=parameters, + component_obj=None, + random_state=random_state) + + def _verify_col_names(self, col_names): + missing_cols = [] + for col in self.text_col_names: + if col not in col_names: + missing_cols.append(col) + + if len(missing_cols) > 0: + if len(missing_cols) == len(self.text_col_names): + raise RuntimeError("None of the provided text column names match the columns in the given DataFrame") + for col in missing_cols: + self.text_col_names.remove(col) + warnings.warn("Columns {} were not found in the given DataFrame, ignoring".format(missing_cols), RuntimeWarning) + + def fit(self, X, y=None): + if len(self.text_col_names) == 0: + return self + if not isinstance(X, pd.DataFrame): + X = pd.DataFrame(X).rename(columns=str) + self._verify_col_names(X.columns) + + corpus = [] + for col in self.text_col_names: + corpus.extend(X[col].values.tolist()) + + self.lsa_pipeline.fit(corpus) + return self + + def transform(self, X, y=None): + """Transforms data X by applying the LSA pipeline. + Arguments: + X (pd.DataFrame): Data to transform + y (pd.Series, optional): Targets + Returns: + pd.DataFrame: Transformed X + """ + if not isinstance(X, pd.DataFrame): + X = pd.DataFrame(X).rename(columns=str) + X_t = X + + for col in self.text_col_names: + transformed = self.lsa_pipeline.transform(X[col]) + X_t = X_t.drop(labels=col, axis=1) + + X_t['LSA({})[0]'.format(col)] = pd.Series(transformed[:, 0]) + X_t['LSA({})[1]'.format(col)] = pd.Series(transformed[:, 1]) + return X_t diff --git a/evalml/tests/component_tests/test_lsa.py b/evalml/tests/component_tests/test_lsa.py new file mode 100644 index 0000000000..cd39d93fb3 --- /dev/null +++ b/evalml/tests/component_tests/test_lsa.py @@ -0,0 +1,128 @@ +import numpy as np +import pandas as pd +import pytest + +from evalml.pipelines.components import LSA + + +@pytest.fixture() +def text_df(): + df = pd.DataFrame( + {'col_1': ['I\'m singing in the rain! Just singing in the rain, what a glorious feeling, I\'m happy again!', + 'In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.', + 'I\'m gonna be the main event, like no king was before! I\'m brushing up on looking down, I\'m working on my ROAR!'], + 'col_2': ['do you hear the people sing? 
Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!', + 'I dreamed a dream in days gone by, when hope was high and life worth living', + 'Red, the blood of angry men - black, the dark of ages past'] + }) + yield df + + +def test_lsa_only_text(text_df): + X = text_df + lsa = LSA(text_columns=['col_1', 'col_2']) + lsa.fit(X) + + expected_col_names = set(['LSA(col_1)[0]', + 'LSA(col_1)[1]', + 'LSA(col_2)[0]', + 'LSA(col_2)[1]']) + X_t = lsa.transform(X) + assert set(X_t.columns) == expected_col_names + assert len(X_t.columns) == 4 + assert X_t.dtypes.all() == np.float64 + + +def test_lsa_with_nontext(text_df): + X = text_df + X['col_3'] = [73.7, 67.213, 92] + lsa = LSA(text_columns=['col_1', 'col_2']) + + lsa.fit(X) + expected_col_names = set(['LSA(col_1)[0]', + 'LSA(col_1)[1]', + 'LSA(col_2)[0]', + 'LSA(col_2)[1]', + 'col_3']) + X_t = lsa.transform(X) + assert set(X_t.columns) == expected_col_names + assert len(X_t.columns) == 5 + assert X_t.dtypes.all() == np.float64 + + +def test_featurizer_no_text(): + X = pd.DataFrame({'col_1': [1, 2, 3], 'col_2': [4, 5, 6]}) + warn_msg = "No text columns were given to LSA, component will have no effect" + with pytest.warns(RuntimeWarning, match=warn_msg): + lsa = LSA() + + lsa.fit(X) + X_t = lsa.transform(X) + assert len(X_t.columns) == 2 + + +def test_some_missing_col_names(text_df): + X = text_df + lsa = LSA(text_columns=['col_1', 'col_2', 'col_3']) + + with pytest.warns(RuntimeWarning, match="not found in the given DataFrame"): + lsa.fit(X) + + expected_col_names = set(['LSA(col_1)[0]', + 'LSA(col_1)[1]', + 'LSA(col_2)[0]', + 'LSA(col_2)[1]']) + X_t = lsa.transform(X) + assert set(X_t.columns) == expected_col_names + assert len(X_t.columns) == 4 + assert X_t.dtypes.all() == np.float64 + + +def test_all_missing_col_names(text_df): + X = text_df + lsa = LSA(text_columns=['col_3', 'col_4']) + + error_msg = "None of the provided text column names match the columns in the given DataFrame" + with pytest.raises(RuntimeError, match=error_msg): + lsa.fit(X) + + +def test_empty_text_column(): + X = pd.DataFrame({'col_1': []}) + lsa = LSA(text_columns=['col_1']) + with pytest.raises(ValueError, match="empty vocabulary"): + lsa.fit(X) + + +def test_index_col_names(): + X = np.array([['I\'m singing in the rain!$%^ do do do do do da do', 'do you hear the people sing?////////////////////////////////////'], + ['just singing in the rain.................. \n', 'singing the songs of angry men\n'], + ['\t\n\n\n\nWhat a glorious feelinggggggggggg, I\'m happy again!!! lalalalalalalalalalala', '\tIt is the music of a people who will NOT be slaves again!!!!!!!!!!!']]) + lsa = LSA(text_columns=[0, 1]) + + lsa.fit(X) + expected_col_names = set(['LSA(0)[0]', + 'LSA(0)[1]', + 'LSA(1)[0]', + 'LSA(1)[1]']) + X_t = lsa.transform(X) + assert set(X_t.columns) == expected_col_names + assert len(X_t.columns) == 4 + assert X_t.dtypes.all() == np.float64 + + +def test_lsa_output(): + X = pd.DataFrame( + {'lsa': ['do you hear the people sing? 
Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!', + 'I dreamed a dream in days gone by, when hope was high and life worth living', + 'Red, the blood of angry men - black, the dark of ages past']}) + lsa = LSA(text_columns=['lsa']) + lsa.fit(X) + + expected_features = [[0.832, 0.], + [0., 1.], + [0.832, 0.]] + X_t = lsa.transform(X) + cols = [col for col in X_t.columns if 'LSA' in col] + features = X_t[cols] + np.testing.assert_almost_equal(features, expected_features, decimal=3) From da832057cd5fdf0b4bd85fcc0420dfb12559f479 Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Wed, 5 Aug 2020 08:36:43 -0400 Subject: [PATCH 02/10] Integrate LSA component into TextFeaturizer --- evalml/pipelines/components/__init__.py | 2 +- .../transformers/preprocessing/text_featurizer.py | 8 ++++++-- evalml/tests/component_tests/test_text_featurizer.py | 6 +++--- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/evalml/pipelines/components/__init__.py b/evalml/pipelines/components/__init__.py index 79f627cfa2..b42de8ebe6 100644 --- a/evalml/pipelines/components/__init__.py +++ b/evalml/pipelines/components/__init__.py @@ -33,5 +33,5 @@ DateTimeFeaturizer, SelectColumns, TextFeaturizer, - LSA + LSA, ) diff --git a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py index ce244062bf..7e1bedc118 100644 --- a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py @@ -4,6 +4,7 @@ import pandas as pd from evalml.pipelines.components.transformers import Transformer +from evalml.pipelines.components.transformers.preprocessing import LSA from evalml.utils import import_or_raise @@ -34,6 +35,7 @@ def __init__(self, text_columns=None, random_state=0, **kwargs): text_columns[i] = str(col_name) self.text_col_names = text_columns self._features = None + self._lsa = LSA(text_columns=text_columns, random_state=random_state) super().__init__(parameters=parameters, component_obj=None, random_state=random_state) @@ -83,11 +85,11 @@ def fit(self, X, y=None): es.df = self._clean_text(X) trans = [self._nlp_primitives.DiversityScore, - self._nlp_primitives.LSA, self._nlp_primitives.MeanCharactersPerWord, self._nlp_primitives.PartOfSpeechCount, self._nlp_primitives.PolarityScore] + self._lsa.fit(X) self._features = self._ft.dfs(entityset=es, target_entity='X', trans_primitives=trans, @@ -112,6 +114,8 @@ def transform(self, X, y=None): self._verify_col_names(X.columns) X_text = X[self.text_col_names] + X_lsa = self._lsa.transform(X_text) + X_text['index'] = range(len(X_text)) X_t = X.drop(self.text_col_names, axis=1) @@ -123,5 +127,5 @@ def transform(self, X, y=None): feature_matrix = self._ft.calculate_feature_matrix(features=self._features, entityset=es, verbose=True) - X_t = pd.concat([X_t, feature_matrix.reindex(X.index)], axis=1) + X_t = pd.concat([X_t, feature_matrix.reindex(X.index), X_lsa], axis=1) return X_t diff --git a/evalml/tests/component_tests/test_text_featurizer.py b/evalml/tests/component_tests/test_text_featurizer.py index ed24a497bd..a6e18b9c9f 100644 --- a/evalml/tests/component_tests/test_text_featurizer.py +++ b/evalml/tests/component_tests/test_text_featurizer.py @@ -182,9 +182,9 @@ def test_lsa_primitive_output(): tf = TextFeaturizer(text_columns=['lsa']) tf.fit(X) - expected_features = [[0.0200961, 0.002976], - [0.0223392, 0.0058817], - [0.0186072, 
-0.0006121]] + expected_features = [[0.832, 0.], + [0., 1.], + [0.832, 0.]] X_t = tf.transform(X) cols = [col for col in X_t.columns if 'LSA' in col] features = X_t[cols] From 19c355edafd84250b920a1c62c4f1afb52b5ca63 Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Wed, 5 Aug 2020 10:30:53 -0400 Subject: [PATCH 03/10] Standardize LSA transform output --- .../components/transformers/preprocessing/lsa.py | 10 +++++++--- evalml/tests/component_tests/test_utils.py | 4 ++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/lsa.py b/evalml/pipelines/components/transformers/preprocessing/lsa.py index a0d2dd0b5e..b8b136fbb9 100644 --- a/evalml/pipelines/components/transformers/preprocessing/lsa.py +++ b/evalml/pipelines/components/transformers/preprocessing/lsa.py @@ -74,12 +74,16 @@ def transform(self, X, y=None): pd.DataFrame: Transformed X """ if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X).rename(columns=str) + X = pd.DataFrame(X) X_t = X for col in self.text_col_names: - transformed = self.lsa_pipeline.transform(X[col]) - X_t = X_t.drop(labels=col, axis=1) + try: + transformed = self.lsa_pipeline.transform(X[col]) + X_t = X_t.drop(labels=col, axis=1) + except KeyError: + transformed = self.lsa_pipeline.transform(X[int(col)]) + X_t = X_t.drop(labels=int(col), axis=1) X_t['LSA({})[0]'.format(col)] = pd.Series(transformed[:, 0]) X_t['LSA({})[1]'.format(col)] = pd.Series(transformed[:, 1]) diff --git a/evalml/tests/component_tests/test_utils.py b/evalml/tests/component_tests/test_utils.py index bf960e5141..5a7294df6b 100644 --- a/evalml/tests/component_tests/test_utils.py +++ b/evalml/tests/component_tests/test_utils.py @@ -12,9 +12,9 @@ def test_all_components(has_minimal_dependencies): if has_minimal_dependencies: - assert len(all_components) == 22 + assert len(all_components) == 23 else: - assert len(all_components) == 26 + assert len(all_components) == 27 def test_handle_component_class_names(): From 6d1b5c80d00ca36d38056193bc5ff163b0e833a9 Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Wed, 5 Aug 2020 10:40:11 -0400 Subject: [PATCH 04/10] Update release notes --- docs/source/release_notes.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index d30dd078f6..ea582b6402 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -3,7 +3,10 @@ Release Notes **Future Releases** * Enhancements + * Added new LSA component for text featurization :pr:`1022` * Fixes + * Updated TextFeaturizer component to no longer require an internet connection to run :pr:`1022` + * Fixed non-deterministic element of TextFeaturizer transformations :pr:`1022` * Changes * Documentation Changes * Testing Changes From c9fe512dd2934d06d452baaa898967597454d8d5 Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Wed, 5 Aug 2020 10:53:34 -0400 Subject: [PATCH 05/10] Fix outdated docstring --- .../components/transformers/preprocessing/lsa.py | 9 +++------ evalml/tests/component_tests/test_lsa.py | 2 +- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/lsa.py b/evalml/pipelines/components/transformers/preprocessing/lsa.py index b8b136fbb9..93fba8972a 100644 --- a/evalml/pipelines/components/transformers/preprocessing/lsa.py +++ b/evalml/pipelines/components/transformers/preprocessing/lsa.py @@ -14,14 +14,11 @@ class LSA(Transformer): hyperparameter_ranges = {} def __init__(self, 
text_columns=None, random_state=0, **kwargs): - """Initalizes an transformer to perform TF-IDF transformation and Singular Value Decomposition. + """Creates a transformer to perform TF-IDF transformation and Singular Value Decomposition for text columns. Arguments: - training_corpus(iterable): The collection of documents to fit this component on. Any iterable - that yields str or unicode objects can be passed in, the simplest format being a 1-dimensional - list, numpy array, or pandas Series. If no document is passed in, the component will be trained - on (nltk's brown sentence corpus.)[https://www.nltk.org/book/ch02.html#brown-corpus]. - random_state(int): A seed for the random state. + text_colums (list): list of `pd.DataFrame` column names that contain text. + random_state (int, np.random.RandomState): Seed for the random number generator. """ text_columns = text_columns or [] parameters = {'text_columns': text_columns} diff --git a/evalml/tests/component_tests/test_lsa.py b/evalml/tests/component_tests/test_lsa.py index cd39d93fb3..c41ceac330 100644 --- a/evalml/tests/component_tests/test_lsa.py +++ b/evalml/tests/component_tests/test_lsa.py @@ -50,7 +50,7 @@ def test_lsa_with_nontext(text_df): assert X_t.dtypes.all() == np.float64 -def test_featurizer_no_text(): +def test_lsa_no_text(): X = pd.DataFrame({'col_1': [1, 2, 3], 'col_2': [4, 5, 6]}) warn_msg = "No text columns were given to LSA, component will have no effect" with pytest.warns(RuntimeWarning, match=warn_msg): From 9bf7427e5082f5527188e4a8d0297a34564953e6 Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Thu, 6 Aug 2020 09:55:57 -0400 Subject: [PATCH 06/10] Remove runtime warnings from init functions --- .../components/transformers/preprocessing/lsa.py | 3 +-- .../transformers/preprocessing/text_featurizer.py | 3 +-- evalml/tests/component_tests/test_lsa.py | 8 ++++---- evalml/tests/component_tests/test_text_featurizer.py | 8 ++++---- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/lsa.py b/evalml/pipelines/components/transformers/preprocessing/lsa.py index 93fba8972a..eefb4a1c26 100644 --- a/evalml/pipelines/components/transformers/preprocessing/lsa.py +++ b/evalml/pipelines/components/transformers/preprocessing/lsa.py @@ -24,8 +24,6 @@ def __init__(self, text_columns=None, random_state=0, **kwargs): parameters = {'text_columns': text_columns} parameters.update(kwargs) - if len(text_columns) == 0: - warnings.warn("No text columns were given to LSA, component will have no effect", RuntimeWarning) for i, col_name in enumerate(text_columns): if not isinstance(col_name, str): text_columns[i] = str(col_name) @@ -50,6 +48,7 @@ def _verify_col_names(self, col_names): def fit(self, X, y=None): if len(self.text_col_names) == 0: + warnings.warn("No text columns were given to LSA, component has no effect", RuntimeWarning) return self if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X).rename(columns=str) diff --git a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py index 7e1bedc118..7663053509 100644 --- a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py @@ -28,8 +28,6 @@ def __init__(self, text_columns=None, random_state=0, **kwargs): parameters = {'text_columns': text_columns} parameters.update(kwargs) - if len(text_columns) == 0: - warnings.warn("No 
text columns were given to TextFeaturizer, component will have no effect", RuntimeWarning) for i, col_name in enumerate(text_columns): if not isinstance(col_name, str): text_columns[i] = str(col_name) @@ -71,6 +69,7 @@ def _verify_col_types(self, entity_set): def fit(self, X, y=None): if len(self.text_col_names) == 0: + warnings.warn("No text columns were given to TextFeaturizer, component has no effect", RuntimeWarning) self._features = [] return self if not isinstance(X, pd.DataFrame): diff --git a/evalml/tests/component_tests/test_lsa.py b/evalml/tests/component_tests/test_lsa.py index c41ceac330..c6aaec9c7d 100644 --- a/evalml/tests/component_tests/test_lsa.py +++ b/evalml/tests/component_tests/test_lsa.py @@ -52,11 +52,11 @@ def test_lsa_with_nontext(text_df): def test_lsa_no_text(): X = pd.DataFrame({'col_1': [1, 2, 3], 'col_2': [4, 5, 6]}) - warn_msg = "No text columns were given to LSA, component will have no effect" - with pytest.warns(RuntimeWarning, match=warn_msg): - lsa = LSA() + warn_msg = "No text columns were given to LSA, component has no effect" + lsa = LSA() - lsa.fit(X) + with pytest.warns(RuntimeWarning, match=warn_msg): + lsa.fit(X) X_t = lsa.transform(X) assert len(X_t.columns) == 2 diff --git a/evalml/tests/component_tests/test_text_featurizer.py b/evalml/tests/component_tests/test_text_featurizer.py index a6e18b9c9f..0aee8a5a8f 100644 --- a/evalml/tests/component_tests/test_text_featurizer.py +++ b/evalml/tests/component_tests/test_text_featurizer.py @@ -73,11 +73,11 @@ def test_featurizer_with_nontext(text_df): def test_featurizer_no_text(): X = pd.DataFrame({'col_1': [1, 2, 3], 'col_2': [4, 5, 6]}) - warn_msg = "No text columns were given to TextFeaturizer, component will have no effect" - with pytest.warns(RuntimeWarning, match=warn_msg): - tf = TextFeaturizer() + warn_msg = "No text columns were given to TextFeaturizer, component has no effect" + tf = TextFeaturizer() - tf.fit(X) + with pytest.warns(RuntimeWarning, match=warn_msg): + tf.fit(X) X_t = tf.transform(X) assert len(X_t.columns) == 2 From cb5617ae97135c1b59a1f81595821fae3297a1d7 Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Thu, 6 Aug 2020 15:34:04 -0400 Subject: [PATCH 07/10] Clean up unnecessary code --- .../transformers/preprocessing/lsa.py | 20 +++++++------------ .../preprocessing/text_featurizer.py | 12 +++-------- 2 files changed, 10 insertions(+), 22 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/lsa.py b/evalml/pipelines/components/transformers/preprocessing/lsa.py index eefb4a1c26..3116c6d211 100644 --- a/evalml/pipelines/components/transformers/preprocessing/lsa.py +++ b/evalml/pipelines/components/transformers/preprocessing/lsa.py @@ -20,24 +20,18 @@ def __init__(self, text_columns=None, random_state=0, **kwargs): text_colums (list): list of `pd.DataFrame` column names that contain text. random_state (int, np.random.RandomState): Seed for the random number generator. 
""" - text_columns = text_columns or [] parameters = {'text_columns': text_columns} + text_columns = text_columns or [] parameters.update(kwargs) - for i, col_name in enumerate(text_columns): - if not isinstance(col_name, str): - text_columns[i] = str(col_name) - self.text_col_names = text_columns - self.lsa_pipeline = make_pipeline(TfidfVectorizer(), TruncatedSVD(random_state=random_state)) + self.text_col_names = [str(col_name) for col_name in text_columns] + self._lsa_pipeline = make_pipeline(TfidfVectorizer(), TruncatedSVD(random_state=random_state)) super().__init__(parameters=parameters, component_obj=None, random_state=random_state) def _verify_col_names(self, col_names): - missing_cols = [] - for col in self.text_col_names: - if col not in col_names: - missing_cols.append(col) + missing_cols = [col for col in self.text_col_names if col not in col_names] if len(missing_cols) > 0: if len(missing_cols) == len(self.text_col_names): @@ -58,7 +52,7 @@ def fit(self, X, y=None): for col in self.text_col_names: corpus.extend(X[col].values.tolist()) - self.lsa_pipeline.fit(corpus) + self._lsa_pipeline.fit(corpus) return self def transform(self, X, y=None): @@ -75,10 +69,10 @@ def transform(self, X, y=None): for col in self.text_col_names: try: - transformed = self.lsa_pipeline.transform(X[col]) + transformed = self._lsa_pipeline.transform(X[col]) X_t = X_t.drop(labels=col, axis=1) except KeyError: - transformed = self.lsa_pipeline.transform(X[int(col)]) + transformed = self._lsa_pipeline.transform(X[int(col)]) X_t = X_t.drop(labels=int(col), axis=1) X_t['LSA({})[0]'.format(col)] = pd.Series(transformed[:, 0]) diff --git a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py index 7663053509..639995fb8f 100644 --- a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py @@ -24,14 +24,11 @@ def __init__(self, text_columns=None, random_state=0, **kwargs): self._ft = import_or_raise("featuretools", error_msg="Package featuretools is not installed. Please install using `pip install featuretools[nlp_primitives].`") self._nlp_primitives = import_or_raise("nlp_primitives", error_msg="Package nlp_primitives is not installed. 
Please install using `pip install featuretools[nlp_primitives].`") - text_columns = text_columns or [] parameters = {'text_columns': text_columns} + text_columns = text_columns or [] parameters.update(kwargs) - for i, col_name in enumerate(text_columns): - if not isinstance(col_name, str): - text_columns[i] = str(col_name) - self.text_col_names = text_columns + self.text_col_names = [str(col_name) for col_name in text_columns] self._features = None self._lsa = LSA(text_columns=text_columns, random_state=random_state) super().__init__(parameters=parameters, @@ -49,10 +46,7 @@ def normalize(text): return X def _verify_col_names(self, col_names): - missing_cols = [] - for col in self.text_col_names: - if col not in col_names: - missing_cols.append(col) + missing_cols = [col for col in self.text_col_names if col not in col_names] if len(missing_cols) > 0: if len(missing_cols) == len(self.text_col_names): From a1d9a98b76158836f4321b26a26332a1622cd9be Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Fri, 7 Aug 2020 13:45:25 -0400 Subject: [PATCH 08/10] Address PR comments --- .../transformers/preprocessing/lsa.py | 20 +++++++--------- .../preprocessing/text_featurizer.py | 23 +++++++++---------- evalml/tests/component_tests/test_lsa.py | 5 +--- .../component_tests/test_text_featurizer.py | 5 +--- 4 files changed, 21 insertions(+), 32 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/lsa.py b/evalml/pipelines/components/transformers/preprocessing/lsa.py index 3116c6d211..8dba3fca86 100644 --- a/evalml/pipelines/components/transformers/preprocessing/lsa.py +++ b/evalml/pipelines/components/transformers/preprocessing/lsa.py @@ -17,41 +17,37 @@ def __init__(self, text_columns=None, random_state=0, **kwargs): """Creates a transformer to perform TF-IDF transformation and Singular Value Decomposition for text columns. Arguments: - text_colums (list): list of `pd.DataFrame` column names that contain text. + text_columns (list): list of feature names which should be treated as text features. random_state (int, np.random.RandomState): Seed for the random number generator. 
""" parameters = {'text_columns': text_columns} text_columns = text_columns or [] parameters.update(kwargs) - self.text_col_names = [str(col_name) for col_name in text_columns] + self._text_col_names = [str(col_name) for col_name in text_columns] self._lsa_pipeline = make_pipeline(TfidfVectorizer(), TruncatedSVD(random_state=random_state)) super().__init__(parameters=parameters, component_obj=None, random_state=random_state) def _verify_col_names(self, col_names): - missing_cols = [col for col in self.text_col_names if col not in col_names] + missing_cols = [col for col in self._text_col_names if col not in col_names] if len(missing_cols) > 0: - if len(missing_cols) == len(self.text_col_names): + if len(missing_cols) == len(self._text_col_names): raise RuntimeError("None of the provided text column names match the columns in the given DataFrame") for col in missing_cols: - self.text_col_names.remove(col) + self._text_col_names.remove(col) warnings.warn("Columns {} were not found in the given DataFrame, ignoring".format(missing_cols), RuntimeWarning) def fit(self, X, y=None): - if len(self.text_col_names) == 0: - warnings.warn("No text columns were given to LSA, component has no effect", RuntimeWarning) + if len(self._text_col_names) == 0: return self if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X).rename(columns=str) self._verify_col_names(X.columns) - corpus = [] - for col in self.text_col_names: - corpus.extend(X[col].values.tolist()) - + corpus = X[self._text_col_names].values.flatten() self._lsa_pipeline.fit(corpus) return self @@ -67,7 +63,7 @@ def transform(self, X, y=None): X = pd.DataFrame(X) X_t = X - for col in self.text_col_names: + for col in self._text_col_names: try: transformed = self._lsa_pipeline.transform(X[col]) X_t = X_t.drop(labels=col, axis=1) diff --git a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py index 639995fb8f..e4dcccb92a 100644 --- a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py @@ -17,7 +17,7 @@ def __init__(self, text_columns=None, random_state=0, **kwargs): """Extracts features from text columns using featuretools' nlp_primitives Arguments: - text_colums (list): list of `pd.DataFrame` column names that contain text. + text_columns (list): list of feature names which should be treated as text features. random_state (int, np.random.RandomState): Seed for the random number generator. 
""" @@ -28,7 +28,7 @@ def __init__(self, text_columns=None, random_state=0, **kwargs): text_columns = text_columns or [] parameters.update(kwargs) - self.text_col_names = [str(col_name) for col_name in text_columns] + self._text_col_names = [str(col_name) for col_name in text_columns] self._features = None self._lsa = LSA(text_columns=text_columns, random_state=random_state) super().__init__(parameters=parameters, @@ -41,35 +41,34 @@ def normalize(text): text = text.translate(str.maketrans('', '', string.punctuation)) return text.lower() - for text_col in self.text_col_names: + for text_col in self._text_col_names: X[text_col] = X[text_col].apply(normalize) return X def _verify_col_names(self, col_names): - missing_cols = [col for col in self.text_col_names if col not in col_names] + missing_cols = [col for col in self._text_col_names if col not in col_names] if len(missing_cols) > 0: - if len(missing_cols) == len(self.text_col_names): + if len(missing_cols) == len(self._text_col_names): raise RuntimeError("None of the provided text column names match the columns in the given DataFrame") for col in missing_cols: - self.text_col_names.remove(col) + self._text_col_names.remove(col) warnings.warn("Columns {} were not found in the given DataFrame, ignoring".format(missing_cols), RuntimeWarning) def _verify_col_types(self, entity_set): var_types = entity_set.entities[0].variable_types - for col in self.text_col_names: + for col in self._text_col_names: if var_types[col] is not self._ft.variable_types.variable.Text: raise ValueError("Column {} is not a text column, cannot apply TextFeaturizer component".format(col)) def fit(self, X, y=None): - if len(self.text_col_names) == 0: - warnings.warn("No text columns were given to TextFeaturizer, component has no effect", RuntimeWarning) + if len(self._text_col_names) == 0: self._features = [] return self if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X).rename(columns=str) self._verify_col_names(X.columns) - X_text = X[self.text_col_names] + X_text = X[self._text_col_names] X_text['index'] = range(len(X_text)) es = self._ft.EntitySet() @@ -106,11 +105,11 @@ def transform(self, X, y=None): X = X.rename(columns=str) self._verify_col_names(X.columns) - X_text = X[self.text_col_names] + X_text = X[self._text_col_names] X_lsa = self._lsa.transform(X_text) X_text['index'] = range(len(X_text)) - X_t = X.drop(self.text_col_names, axis=1) + X_t = X.drop(self._text_col_names, axis=1) es = self._ft.EntitySet() es = es.entity_from_dataframe(entity_id='X', dataframe=X_text, index='index') diff --git a/evalml/tests/component_tests/test_lsa.py b/evalml/tests/component_tests/test_lsa.py index c6aaec9c7d..1540bc31e0 100644 --- a/evalml/tests/component_tests/test_lsa.py +++ b/evalml/tests/component_tests/test_lsa.py @@ -52,11 +52,8 @@ def test_lsa_with_nontext(text_df): def test_lsa_no_text(): X = pd.DataFrame({'col_1': [1, 2, 3], 'col_2': [4, 5, 6]}) - warn_msg = "No text columns were given to LSA, component has no effect" lsa = LSA() - - with pytest.warns(RuntimeWarning, match=warn_msg): - lsa.fit(X) + lsa.fit(X) X_t = lsa.transform(X) assert len(X_t.columns) == 2 diff --git a/evalml/tests/component_tests/test_text_featurizer.py b/evalml/tests/component_tests/test_text_featurizer.py index 0aee8a5a8f..68f8728646 100644 --- a/evalml/tests/component_tests/test_text_featurizer.py +++ b/evalml/tests/component_tests/test_text_featurizer.py @@ -73,11 +73,8 @@ def test_featurizer_with_nontext(text_df): def test_featurizer_no_text(): X = pd.DataFrame({'col_1': [1, 2, 3], 
'col_2': [4, 5, 6]}) - warn_msg = "No text columns were given to TextFeaturizer, component has no effect" tf = TextFeaturizer() - - with pytest.warns(RuntimeWarning, match=warn_msg): - tf.fit(X) + tf.fit(X) X_t = tf.transform(X) assert len(X_t.columns) == 2 From e383731644328d16d24048327bd18c1fdc1d2785 Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Mon, 10 Aug 2020 10:36:19 -0400 Subject: [PATCH 09/10] Raise warnings using logger instead of warnings package --- .../components/transformers/preprocessing/lsa.py | 9 ++++++--- .../transformers/preprocessing/text_featurizer.py | 6 ++++-- evalml/tests/component_tests/test_lsa.py | 7 +++++-- evalml/tests/component_tests/test_text_featurizer.py | 7 +++++-- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/lsa.py b/evalml/pipelines/components/transformers/preprocessing/lsa.py index 8dba3fca86..028f615671 100644 --- a/evalml/pipelines/components/transformers/preprocessing/lsa.py +++ b/evalml/pipelines/components/transformers/preprocessing/lsa.py @@ -1,4 +1,4 @@ -import warnings +import logging import pandas as pd from sklearn.decomposition import TruncatedSVD @@ -7,6 +7,8 @@ from evalml.pipelines.components.transformers import Transformer +logger = logging.getLogger() + class LSA(Transformer): """Transformer to calculate the Latent Semantic Analysis Values of text input""" @@ -38,7 +40,7 @@ def _verify_col_names(self, col_names): raise RuntimeError("None of the provided text column names match the columns in the given DataFrame") for col in missing_cols: self._text_col_names.remove(col) - warnings.warn("Columns {} were not found in the given DataFrame, ignoring".format(missing_cols), RuntimeWarning) + logger.warn("Columns {} were not found in the given DataFrame, ignoring".format(missing_cols)) def fit(self, X, y=None): if len(self._text_col_names) == 0: @@ -57,7 +59,8 @@ def transform(self, X, y=None): X (pd.DataFrame): Data to transform y (pd.Series, optional): Targets Returns: - pd.DataFrame: Transformed X + pd.DataFrame: Transformed X. The original column is removed and replaced with two columns of the + format `LSA(original_column_name)[feature_number]`, where `feature_number` is 0 or 1. 
""" if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X) diff --git a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py index e4dcccb92a..22b7f7cc9a 100644 --- a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py @@ -1,5 +1,5 @@ +import logging import string -import warnings import pandas as pd @@ -7,6 +7,8 @@ from evalml.pipelines.components.transformers.preprocessing import LSA from evalml.utils import import_or_raise +logger = logging.getLogger() + class TextFeaturizer(Transformer): """Transformer that can automatically featurize text columns.""" @@ -53,7 +55,7 @@ def _verify_col_names(self, col_names): raise RuntimeError("None of the provided text column names match the columns in the given DataFrame") for col in missing_cols: self._text_col_names.remove(col) - warnings.warn("Columns {} were not found in the given DataFrame, ignoring".format(missing_cols), RuntimeWarning) + logger.warn("Columns {} were not found in the given DataFrame, ignoring".format(missing_cols)) def _verify_col_types(self, entity_set): var_types = entity_set.entities[0].variable_types diff --git a/evalml/tests/component_tests/test_lsa.py b/evalml/tests/component_tests/test_lsa.py index 1540bc31e0..927123c376 100644 --- a/evalml/tests/component_tests/test_lsa.py +++ b/evalml/tests/component_tests/test_lsa.py @@ -1,3 +1,5 @@ +import logging + import numpy as np import pandas as pd import pytest @@ -58,12 +60,13 @@ def test_lsa_no_text(): assert len(X_t.columns) == 2 -def test_some_missing_col_names(text_df): +def test_some_missing_col_names(text_df, caplog): X = text_df lsa = LSA(text_columns=['col_1', 'col_2', 'col_3']) - with pytest.warns(RuntimeWarning, match="not found in the given DataFrame"): + with caplog.at_level(logging.WARNING): lsa.fit(X) + assert "Columns ['col_3'] were not found in the given DataFrame, ignoring" in caplog.messages expected_col_names = set(['LSA(col_1)[0]', 'LSA(col_1)[1]', diff --git a/evalml/tests/component_tests/test_text_featurizer.py b/evalml/tests/component_tests/test_text_featurizer.py index 68f8728646..5cf50f00ef 100644 --- a/evalml/tests/component_tests/test_text_featurizer.py +++ b/evalml/tests/component_tests/test_text_featurizer.py @@ -1,3 +1,5 @@ +import logging + import numpy as np import pandas as pd import pytest @@ -79,12 +81,13 @@ def test_featurizer_no_text(): assert len(X_t.columns) == 2 -def test_some_missing_col_names(text_df): +def test_some_missing_col_names(text_df, caplog): X = text_df tf = TextFeaturizer(text_columns=['col_1', 'col_2', 'col_3']) - with pytest.warns(RuntimeWarning, match="not found in the given DataFrame"): + with caplog.at_level(logging.WARNING): tf.fit(X) + assert "Columns ['col_3'] were not found in the given DataFrame, ignoring" in caplog.messages expected_col_names = set(['DIVERSITY_SCORE(col_1)', 'DIVERSITY_SCORE(col_2)', From 0ad6b0ae8894d69f9119d653f907c08ebf73564d Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Mon, 10 Aug 2020 16:07:38 -0400 Subject: [PATCH 10/10] PR comments --- .../transformers/preprocessing/lsa.py | 12 +++---- .../preprocessing/text_featurizer.py | 11 +++--- evalml/tests/component_tests/test_lsa.py | 35 +++++++++++++++++++ .../component_tests/test_text_featurizer.py | 30 ++++++++++++++++ 4 files changed, 74 insertions(+), 14 deletions(-) diff --git 
a/evalml/pipelines/components/transformers/preprocessing/lsa.py b/evalml/pipelines/components/transformers/preprocessing/lsa.py index 028f615671..1c4df0c215 100644 --- a/evalml/pipelines/components/transformers/preprocessing/lsa.py +++ b/evalml/pipelines/components/transformers/preprocessing/lsa.py @@ -26,7 +26,7 @@ def __init__(self, text_columns=None, random_state=0, **kwargs): text_columns = text_columns or [] parameters.update(kwargs) - self._text_col_names = [str(col_name) for col_name in text_columns] + self._text_col_names = text_columns self._lsa_pipeline = make_pipeline(TfidfVectorizer(), TruncatedSVD(random_state=random_state)) super().__init__(parameters=parameters, component_obj=None, @@ -46,7 +46,7 @@ def fit(self, X, y=None): if len(self._text_col_names) == 0: return self if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X).rename(columns=str) + X = pd.DataFrame(X) self._verify_col_names(X.columns) corpus = X[self._text_col_names].values.flatten() @@ -67,13 +67,9 @@ def transform(self, X, y=None): X_t = X for col in self._text_col_names: - try: - transformed = self._lsa_pipeline.transform(X[col]) - X_t = X_t.drop(labels=col, axis=1) - except KeyError: - transformed = self._lsa_pipeline.transform(X[int(col)]) - X_t = X_t.drop(labels=int(col), axis=1) + transformed = self._lsa_pipeline.transform(X[col]) X_t['LSA({})[0]'.format(col)] = pd.Series(transformed[:, 0]) X_t['LSA({})[1]'.format(col)] = pd.Series(transformed[:, 1]) + X_t = X_t.drop(columns=self._text_col_names) return X_t diff --git a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py index 22b7f7cc9a..4fa2c7e572 100644 --- a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py @@ -30,9 +30,9 @@ def __init__(self, text_columns=None, random_state=0, **kwargs): text_columns = text_columns or [] parameters.update(kwargs) - self._text_col_names = [str(col_name) for col_name in text_columns] self._features = None self._lsa = LSA(text_columns=text_columns, random_state=random_state) + self._text_col_names = text_columns super().__init__(parameters=parameters, component_obj=None, random_state=random_state) @@ -60,7 +60,7 @@ def _verify_col_names(self, col_names): def _verify_col_types(self, entity_set): var_types = entity_set.entities[0].variable_types for col in self._text_col_names: - if var_types[col] is not self._ft.variable_types.variable.Text: + if var_types[str(col)] is not self._ft.variable_types.variable.Text: raise ValueError("Column {} is not a text column, cannot apply TextFeaturizer component".format(col)) def fit(self, X, y=None): @@ -68,13 +68,13 @@ def fit(self, X, y=None): self._features = [] return self if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X).rename(columns=str) + X = pd.DataFrame(X) self._verify_col_names(X.columns) X_text = X[self._text_col_names] X_text['index'] = range(len(X_text)) es = self._ft.EntitySet() - es = es.entity_from_dataframe(entity_id='X', dataframe=X_text, index='index') + es = es.entity_from_dataframe(entity_id='X', dataframe=X_text.rename(columns=str), index='index') self._verify_col_types(es) es.df = self._clean_text(X) @@ -104,7 +104,6 @@ def transform(self, X, y=None): X = pd.DataFrame(X) if self._features is None or len(self._features) == 0: return X - X = X.rename(columns=str) self._verify_col_names(X.columns) X_text = X[self._text_col_names] @@ -114,7 +113,7 @@ 
def transform(self, X, y=None): X_t = X.drop(self._text_col_names, axis=1) es = self._ft.EntitySet() - es = es.entity_from_dataframe(entity_id='X', dataframe=X_text, index='index') + es = es.entity_from_dataframe(entity_id='X', dataframe=X_text.rename(columns=str), index='index') self._verify_col_types(es) es.df = self._clean_text(X) diff --git a/evalml/tests/component_tests/test_lsa.py b/evalml/tests/component_tests/test_lsa.py index 927123c376..afde0f0566 100644 --- a/evalml/tests/component_tests/test_lsa.py +++ b/evalml/tests/component_tests/test_lsa.py @@ -111,6 +111,41 @@ def test_index_col_names(): assert X_t.dtypes.all() == np.float64 +def test_int_col_names(): + X = pd.DataFrame( + {4.75: ['I\'m singing in the rain! Just singing in the rain, what a glorious feeling, I\'m happy again!', + 'In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.', + 'I\'m gonna be the main event, like no king was before! I\'m brushing up on looking down, I\'m working on my ROAR!'], + -1: ['do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!', + 'I dreamed a dream in days gone by, when hope was high and life worth living', + 'Red, the blood of angry men - black, the dark of ages past'] + }) + lsa = LSA(text_columns=[4.75, -1]) + lsa.fit(X) + expected_col_names = set(['LSA(4.75)[0]', + 'LSA(4.75)[1]', + 'LSA(-1)[0]', + 'LSA(-1)[1]']) + X_t = lsa.transform(X) + assert set(X_t.columns) == expected_col_names + assert len(X_t.columns) == 4 + assert X_t.dtypes.all() == np.float64 + + +def test_repeat_col_names(): + X = pd.DataFrame(data=np.array([['identical string one', 'identical string one'], + ['second double string', 'second double string'], + ['copy the third', 'copy the third']]), columns=['col_1', 'col_1']) + lsa = LSA(text_columns=['col_1', 'col_1']) + lsa.fit(X) + expected_col_names = ['LSA(col_1)[0]', + 'LSA(col_1)[1]'] + X_t = lsa.transform(X) + np.testing.assert_array_equal(X_t.columns, np.array(expected_col_names)) + assert len(X_t.columns) == 2 + assert X_t.dtypes.all() == np.float64 + + def test_lsa_output(): X = pd.DataFrame( {'lsa': ['do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!', diff --git a/evalml/tests/component_tests/test_text_featurizer.py b/evalml/tests/component_tests/test_text_featurizer.py index 5cf50f00ef..830dd89302 100644 --- a/evalml/tests/component_tests/test_text_featurizer.py +++ b/evalml/tests/component_tests/test_text_featurizer.py @@ -160,6 +160,36 @@ def test_index_col_names(): assert X_t.dtypes.all() == np.float64 +def test_int_col_names(): + X = pd.DataFrame( + {475: ['I\'m singing in the rain! Just singing in the rain, what a glorious feeling, I\'m happy again!', + 'In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.', + 'I\'m gonna be the main event, like no king was before! I\'m brushing up on looking down, I\'m working on my ROAR!'], + -1: ['do you hear the people sing? 
Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!', + 'I dreamed a dream in days gone by, when hope was high and life worth living', + 'Red, the blood of angry men - black, the dark of ages past'] + }) + tf = TextFeaturizer(text_columns=[475, -1]) + tf.fit(X) + expected_col_names = set(['DIVERSITY_SCORE(475)', + 'DIVERSITY_SCORE(-1)', + 'LSA(475)[0]', + 'LSA(475)[1]', + 'LSA(-1)[0]', + 'LSA(-1)[1]', + 'MEAN_CHARACTERS_PER_WORD(475)', + 'MEAN_CHARACTERS_PER_WORD(-1)', + 'POLARITY_SCORE(475)', + 'POLARITY_SCORE(-1)']) + for i in range(15): + expected_col_names.add(f'PART_OF_SPEECH_COUNT(475)[{i}]') + expected_col_names.add(f'PART_OF_SPEECH_COUNT(-1)[{i}]') + X_t = tf.transform(X) + assert set(X_t.columns) == expected_col_names + assert len(X_t.columns) == 40 + assert X_t.dtypes.all() == np.float64 + + def test_diversity_primitive_output(): X = pd.DataFrame( {'diverse': ['This is a very diverse string which does not contain any repeated words at all',
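# A minimal standalone sketch of the transformation the LSA component in this
# patch series wraps: TF-IDF vectorization followed by truncated SVD, reduced
# to two components per text column. The sample documents and the column name
# "text" are hypothetical and used only for illustration; in the component
# itself the same pipeline is fit once on the concatenated corpus of all text
# columns and then applied to each column separately.
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

docs = pd.Series([
    "do you hear the people sing? singing the songs of angry men",
    "I dreamed a dream in days gone by, when hope was high",
    "red, the blood of angry men - black, the dark of ages past",
])

# TruncatedSVD defaults to n_components=2, which is why the component emits
# exactly two features per column, named LSA(column)[0] and LSA(column)[1].
lsa_pipeline = make_pipeline(TfidfVectorizer(), TruncatedSVD(random_state=0))
transformed = lsa_pipeline.fit_transform(docs)

features = pd.DataFrame({
    "LSA(text)[0]": transformed[:, 0],
    "LSA(text)[1]": transformed[:, 1],
})
print(features)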