diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
index 6f91eb71e9..6da96e7001 100644
--- a/docs/source/release_notes.rst
+++ b/docs/source/release_notes.rst
@@ -5,7 +5,10 @@ Release Notes
     * Enhancements
         * Split `fill_value` into `categorical_fill_value` and `numeric_fill_value` for Imputer :pr:`1019`
         * Added `explain_predictions` and `explain_predictions_best_worst` for explaining multiple predictions with SHAP :pr:`1016`
+        * Added new LSA component for text featurization :pr:`1022`
     * Fixes
+        * Updated TextFeaturizer component to no longer require an internet connection to run :pr:`1022`
+        * Fixed non-deterministic element of TextFeaturizer transformations :pr:`1022`
     * Changes
     * Documentation Changes
         * Update setup.py URL to point to the github repo :pr:`1037`
diff --git a/evalml/pipelines/components/__init__.py b/evalml/pipelines/components/__init__.py
index edf5e14a13..b42de8ebe6 100644
--- a/evalml/pipelines/components/__init__.py
+++ b/evalml/pipelines/components/__init__.py
@@ -32,5 +32,6 @@
     DropNullColumns,
     DateTimeFeaturizer,
     SelectColumns,
-    TextFeaturizer
+    TextFeaturizer,
+    LSA,
 )
diff --git a/evalml/pipelines/components/transformers/__init__.py b/evalml/pipelines/components/transformers/__init__.py
index 81f20cb123..7fa863f738 100644
--- a/evalml/pipelines/components/transformers/__init__.py
+++ b/evalml/pipelines/components/transformers/__init__.py
@@ -5,4 +5,4 @@
 from .imputers import PerColumnImputer, SimpleImputer, Imputer
 from .scalers import StandardScaler
 from .column_selectors import DropColumns, SelectColumns
-from .preprocessing import DateTimeFeaturizer, DropNullColumns, TextFeaturizer
+from .preprocessing import DateTimeFeaturizer, DropNullColumns, LSA, TextFeaturizer
diff --git a/evalml/pipelines/components/transformers/preprocessing/__init__.py b/evalml/pipelines/components/transformers/preprocessing/__init__.py
index a680656d10..2ad5a2f9dc 100644
--- a/evalml/pipelines/components/transformers/preprocessing/__init__.py
+++ b/evalml/pipelines/components/transformers/preprocessing/__init__.py
@@ -1,4 +1,5 @@
 # flake8:noqa
 from .datetime_featurizer import DateTimeFeaturizer
 from .drop_null_columns import DropNullColumns
+from .lsa import LSA
 from .text_featurizer import TextFeaturizer
diff --git a/evalml/pipelines/components/transformers/preprocessing/lsa.py b/evalml/pipelines/components/transformers/preprocessing/lsa.py
new file mode 100644
index 0000000000..1c4df0c215
--- /dev/null
+++ b/evalml/pipelines/components/transformers/preprocessing/lsa.py
@@ -0,0 +1,75 @@
+import logging
+
+import pandas as pd
+from sklearn.decomposition import TruncatedSVD
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.pipeline import make_pipeline
+
+from evalml.pipelines.components.transformers import Transformer
+
+logger = logging.getLogger()
+
+
+class LSA(Transformer):
+    """Transformer to calculate the Latent Semantic Analysis Values of text input"""
+    name = "LSA Transformer"
+    hyperparameter_ranges = {}
+
+    def __init__(self, text_columns=None, random_state=0, **kwargs):
+        """Creates a transformer to perform TF-IDF transformation and Singular Value Decomposition for text columns.
+
+        Arguments:
+            text_columns (list): list of feature names which should be treated as text features.
+            random_state (int, np.random.RandomState): Seed for the random number generator.
+        """
+        parameters = {'text_columns': text_columns}
+        text_columns = text_columns or []
+        parameters.update(kwargs)
+
+        self._text_col_names = text_columns
+        self._lsa_pipeline = make_pipeline(TfidfVectorizer(), TruncatedSVD(random_state=random_state))
+        super().__init__(parameters=parameters,
+                         component_obj=None,
+                         random_state=random_state)
+
+    def _verify_col_names(self, col_names):
+        missing_cols = [col for col in self._text_col_names if col not in col_names]
+
+        if len(missing_cols) > 0:
+            if len(missing_cols) == len(self._text_col_names):
+                raise RuntimeError("None of the provided text column names match the columns in the given DataFrame")
+            for col in missing_cols:
+                self._text_col_names.remove(col)
+            logger.warning("Columns {} were not found in the given DataFrame, ignoring".format(missing_cols))
+
+    def fit(self, X, y=None):
+        if len(self._text_col_names) == 0:
+            return self
+        if not isinstance(X, pd.DataFrame):
+            X = pd.DataFrame(X)
+        self._verify_col_names(X.columns)
+
+        corpus = X[self._text_col_names].values.flatten()
+        self._lsa_pipeline.fit(corpus)
+        return self
+
+    def transform(self, X, y=None):
+        """Transforms data X by applying the LSA pipeline.
+        Arguments:
+            X (pd.DataFrame): Data to transform
+            y (pd.Series, optional): Targets
+        Returns:
+            pd.DataFrame: Transformed X. The original column is removed and replaced with two columns of the
+                          format `LSA(original_column_name)[feature_number]`, where `feature_number` is 0 or 1.
+        """
+        if not isinstance(X, pd.DataFrame):
+            X = pd.DataFrame(X)
+        X_t = X
+
+        for col in self._text_col_names:
+            transformed = self._lsa_pipeline.transform(X[col])
+
+            X_t['LSA({})[0]'.format(col)] = pd.Series(transformed[:, 0])
+            X_t['LSA({})[1]'.format(col)] = pd.Series(transformed[:, 1])
+        X_t = X_t.drop(columns=self._text_col_names)
+        return X_t
diff --git a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py
index ce244062bf..4fa2c7e572 100644
--- a/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py
+++ b/evalml/pipelines/components/transformers/preprocessing/text_featurizer.py
@@ -1,11 +1,14 @@
+import logging
 import string
-import warnings

 import pandas as pd

 from evalml.pipelines.components.transformers import Transformer
+from evalml.pipelines.components.transformers.preprocessing import LSA
 from evalml.utils import import_or_raise

+logger = logging.getLogger()
+

 class TextFeaturizer(Transformer):
     """Transformer that can automatically featurize text columns."""
@@ -16,24 +19,20 @@ def __init__(self, text_columns=None, random_state=0, **kwargs):
         """Extracts features from text columns using featuretools' nlp_primitives

         Arguments:
-            text_colums (list): list of `pd.DataFrame` column names that contain text.
+            text_columns (list): list of feature names which should be treated as text features.
             random_state (int, np.random.RandomState): Seed for the random number generator.

         """
         self._ft = import_or_raise("featuretools", error_msg="Package featuretools is not installed. Please install using `pip install featuretools[nlp_primitives].`")
         self._nlp_primitives = import_or_raise("nlp_primitives", error_msg="Package nlp_primitives is not installed. Please install using `pip install featuretools[nlp_primitives].`")
-        text_columns = text_columns or []
         parameters = {'text_columns': text_columns}
+        text_columns = text_columns or []
         parameters.update(kwargs)

-        if len(text_columns) == 0:
-            warnings.warn("No text columns were given to TextFeaturizer, component will have no effect", RuntimeWarning)
-        for i, col_name in enumerate(text_columns):
-            if not isinstance(col_name, str):
-                text_columns[i] = str(col_name)
-        self.text_col_names = text_columns
         self._features = None
+        self._lsa = LSA(text_columns=text_columns, random_state=random_state)
+        self._text_col_names = text_columns
         super().__init__(parameters=parameters,
                          component_obj=None,
                          random_state=random_state)
@@ -44,50 +43,47 @@ def normalize(text):
             text = text.translate(str.maketrans('', '', string.punctuation))
             return text.lower()

-        for text_col in self.text_col_names:
+        for text_col in self._text_col_names:
             X[text_col] = X[text_col].apply(normalize)
         return X

     def _verify_col_names(self, col_names):
-        missing_cols = []
-        for col in self.text_col_names:
-            if col not in col_names:
-                missing_cols.append(col)
+        missing_cols = [col for col in self._text_col_names if col not in col_names]

         if len(missing_cols) > 0:
-            if len(missing_cols) == len(self.text_col_names):
+            if len(missing_cols) == len(self._text_col_names):
                 raise RuntimeError("None of the provided text column names match the columns in the given DataFrame")
             for col in missing_cols:
-                self.text_col_names.remove(col)
-            warnings.warn("Columns {} were not found in the given DataFrame, ignoring".format(missing_cols), RuntimeWarning)
+                self._text_col_names.remove(col)
+            logger.warning("Columns {} were not found in the given DataFrame, ignoring".format(missing_cols))

     def _verify_col_types(self, entity_set):
         var_types = entity_set.entities[0].variable_types
-        for col in self.text_col_names:
-            if var_types[col] is not self._ft.variable_types.variable.Text:
+        for col in self._text_col_names:
+            if var_types[str(col)] is not self._ft.variable_types.variable.Text:
                 raise ValueError("Column {} is not a text column, cannot apply TextFeaturizer component".format(col))

     def fit(self, X, y=None):
-        if len(self.text_col_names) == 0:
+        if len(self._text_col_names) == 0:
             self._features = []
             return self
         if not isinstance(X, pd.DataFrame):
-            X = pd.DataFrame(X).rename(columns=str)
+            X = pd.DataFrame(X)
         self._verify_col_names(X.columns)

-        X_text = X[self.text_col_names]
+        X_text = X[self._text_col_names]
         X_text['index'] = range(len(X_text))

         es = self._ft.EntitySet()
-        es = es.entity_from_dataframe(entity_id='X', dataframe=X_text, index='index')
+        es = es.entity_from_dataframe(entity_id='X', dataframe=X_text.rename(columns=str), index='index')
         self._verify_col_types(es)
         es.df = self._clean_text(X)

         trans = [self._nlp_primitives.DiversityScore,
-                 self._nlp_primitives.LSA,
                  self._nlp_primitives.MeanCharactersPerWord,
                  self._nlp_primitives.PartOfSpeechCount,
                  self._nlp_primitives.PolarityScore]
+        self._lsa.fit(X)
         self._features = self._ft.dfs(entityset=es,
                                       target_entity='X',
                                       trans_primitives=trans,
@@ -108,20 +104,21 @@
             X = pd.DataFrame(X)
         if self._features is None or len(self._features) == 0:
             return X
-        X = X.rename(columns=str)
         self._verify_col_names(X.columns)

-        X_text = X[self.text_col_names]
+        X_text = X[self._text_col_names]
+        X_lsa = self._lsa.transform(X_text)
+
         X_text['index'] = range(len(X_text))
-        X_t = X.drop(self.text_col_names, axis=1)
+        X_t = X.drop(self._text_col_names, axis=1)

         es = self._ft.EntitySet()
-        es = es.entity_from_dataframe(entity_id='X', dataframe=X_text, index='index')
+        es = es.entity_from_dataframe(entity_id='X', dataframe=X_text.rename(columns=str), index='index')
         self._verify_col_types(es)
         es.df = self._clean_text(X)

         feature_matrix = self._ft.calculate_feature_matrix(features=self._features,
                                                            entityset=es,
                                                            verbose=True)
-        X_t = pd.concat([X_t, feature_matrix.reindex(X.index)], axis=1)
+        X_t = pd.concat([X_t, feature_matrix.reindex(X.index), X_lsa], axis=1)
         return X_t
diff --git a/evalml/tests/component_tests/test_lsa.py b/evalml/tests/component_tests/test_lsa.py
new file mode 100644
index 0000000000..afde0f0566
--- /dev/null
+++ b/evalml/tests/component_tests/test_lsa.py
@@ -0,0 +1,163 @@
+import logging
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from evalml.pipelines.components import LSA
+
+
+@pytest.fixture()
+def text_df():
+    df = pd.DataFrame(
+        {'col_1': ['I\'m singing in the rain! Just singing in the rain, what a glorious feeling, I\'m happy again!',
+                   'In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.',
+                   'I\'m gonna be the main event, like no king was before! I\'m brushing up on looking down, I\'m working on my ROAR!'],
+         'col_2': ['do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!',
+                   'I dreamed a dream in days gone by, when hope was high and life worth living',
+                   'Red, the blood of angry men - black, the dark of ages past']
+         })
+    yield df
+
+
+def test_lsa_only_text(text_df):
+    X = text_df
+    lsa = LSA(text_columns=['col_1', 'col_2'])
+    lsa.fit(X)
+
+    expected_col_names = set(['LSA(col_1)[0]',
+                              'LSA(col_1)[1]',
+                              'LSA(col_2)[0]',
+                              'LSA(col_2)[1]'])
+    X_t = lsa.transform(X)
+    assert set(X_t.columns) == expected_col_names
+    assert len(X_t.columns) == 4
+    assert X_t.dtypes.all() == np.float64
+
+
+def test_lsa_with_nontext(text_df):
+    X = text_df
+    X['col_3'] = [73.7, 67.213, 92]
+    lsa = LSA(text_columns=['col_1', 'col_2'])
+
+    lsa.fit(X)
+    expected_col_names = set(['LSA(col_1)[0]',
+                              'LSA(col_1)[1]',
+                              'LSA(col_2)[0]',
+                              'LSA(col_2)[1]',
+                              'col_3'])
+    X_t = lsa.transform(X)
+    assert set(X_t.columns) == expected_col_names
+    assert len(X_t.columns) == 5
+    assert X_t.dtypes.all() == np.float64
+
+
+def test_lsa_no_text():
+    X = pd.DataFrame({'col_1': [1, 2, 3], 'col_2': [4, 5, 6]})
+    lsa = LSA()
+    lsa.fit(X)
+    X_t = lsa.transform(X)
+    assert len(X_t.columns) == 2
+
+
+def test_some_missing_col_names(text_df, caplog):
+    X = text_df
+    lsa = LSA(text_columns=['col_1', 'col_2', 'col_3'])
+
+    with caplog.at_level(logging.WARNING):
+        lsa.fit(X)
+    assert "Columns ['col_3'] were not found in the given DataFrame, ignoring" in caplog.messages
+
+    expected_col_names = set(['LSA(col_1)[0]',
+                              'LSA(col_1)[1]',
+                              'LSA(col_2)[0]',
+                              'LSA(col_2)[1]'])
+    X_t = lsa.transform(X)
+    assert set(X_t.columns) == expected_col_names
+    assert len(X_t.columns) == 4
+    assert X_t.dtypes.all() == np.float64
+
+
+def test_all_missing_col_names(text_df):
+    X = text_df
+    lsa = LSA(text_columns=['col_3', 'col_4'])
+
+    error_msg = "None of the provided text column names match the columns in the given DataFrame"
+    with pytest.raises(RuntimeError, match=error_msg):
+        lsa.fit(X)
+
+
+def test_empty_text_column():
+    X = pd.DataFrame({'col_1': []})
+    lsa = LSA(text_columns=['col_1'])
+    with pytest.raises(ValueError, match="empty vocabulary"):
+        lsa.fit(X)
+
+
+def test_index_col_names():
+    X = np.array([['I\'m singing in the rain!$%^ do do do do do da do', 'do you hear the people sing?////////////////////////////////////'],
+                  ['just singing in the rain.................. \n', 'singing the songs of angry men\n'],
+                  ['\t\n\n\n\nWhat a glorious feelinggggggggggg, I\'m happy again!!! lalalalalalalalalalala', '\tIt is the music of a people who will NOT be slaves again!!!!!!!!!!!']])
+    lsa = LSA(text_columns=[0, 1])
+
+    lsa.fit(X)
+    expected_col_names = set(['LSA(0)[0]',
+                              'LSA(0)[1]',
+                              'LSA(1)[0]',
+                              'LSA(1)[1]'])
+    X_t = lsa.transform(X)
+    assert set(X_t.columns) == expected_col_names
+    assert len(X_t.columns) == 4
+    assert X_t.dtypes.all() == np.float64
+
+
+def test_int_col_names():
+    X = pd.DataFrame(
+        {4.75: ['I\'m singing in the rain! Just singing in the rain, what a glorious feeling, I\'m happy again!',
+                'In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.',
+                'I\'m gonna be the main event, like no king was before! I\'m brushing up on looking down, I\'m working on my ROAR!'],
+         -1: ['do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!',
+              'I dreamed a dream in days gone by, when hope was high and life worth living',
+              'Red, the blood of angry men - black, the dark of ages past']
+         })
+    lsa = LSA(text_columns=[4.75, -1])
+    lsa.fit(X)
+    expected_col_names = set(['LSA(4.75)[0]',
+                              'LSA(4.75)[1]',
+                              'LSA(-1)[0]',
+                              'LSA(-1)[1]'])
+    X_t = lsa.transform(X)
+    assert set(X_t.columns) == expected_col_names
+    assert len(X_t.columns) == 4
+    assert X_t.dtypes.all() == np.float64
+
+
+def test_repeat_col_names():
+    X = pd.DataFrame(data=np.array([['identical string one', 'identical string one'],
+                                    ['second double string', 'second double string'],
+                                    ['copy the third', 'copy the third']]), columns=['col_1', 'col_1'])
+    lsa = LSA(text_columns=['col_1', 'col_1'])
+    lsa.fit(X)
+    expected_col_names = ['LSA(col_1)[0]',
+                          'LSA(col_1)[1]']
+    X_t = lsa.transform(X)
+    np.testing.assert_array_equal(X_t.columns, np.array(expected_col_names))
+    assert len(X_t.columns) == 2
+    assert X_t.dtypes.all() == np.float64
+
+
+def test_lsa_output():
+    X = pd.DataFrame(
+        {'lsa': ['do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!',
+                 'I dreamed a dream in days gone by, when hope was high and life worth living',
+                 'Red, the blood of angry men - black, the dark of ages past']})
+    lsa = LSA(text_columns=['lsa'])
+    lsa.fit(X)
+
+    expected_features = [[0.832, 0.],
+                         [0., 1.],
+                         [0.832, 0.]]
+    X_t = lsa.transform(X)
+    cols = [col for col in X_t.columns if 'LSA' in col]
+    features = X_t[cols]
+    np.testing.assert_almost_equal(features, expected_features, decimal=3)
diff --git a/evalml/tests/component_tests/test_text_featurizer.py b/evalml/tests/component_tests/test_text_featurizer.py
index ed24a497bd..830dd89302 100644
--- a/evalml/tests/component_tests/test_text_featurizer.py
+++ b/evalml/tests/component_tests/test_text_featurizer.py
@@ -1,3 +1,5 @@
+import logging
+
 import numpy as np
 import pandas as pd
 import pytest
@@ -73,21 +75,19 @@ def test_featurizer_with_nontext(text_df):

 def test_featurizer_no_text():
     X = pd.DataFrame({'col_1': [1, 2, 3], 'col_2': [4, 5, 6]})
-    warn_msg = "No text columns were given to TextFeaturizer, component will have no effect"
-    with pytest.warns(RuntimeWarning, match=warn_msg):
-        tf = TextFeaturizer()
-
+    tf = TextFeaturizer()
     tf.fit(X)
     X_t = tf.transform(X)
     assert len(X_t.columns) == 2


-def test_some_missing_col_names(text_df):
+def test_some_missing_col_names(text_df, caplog):
     X = text_df
     tf = TextFeaturizer(text_columns=['col_1', 'col_2', 'col_3'])

-    with pytest.warns(RuntimeWarning, match="not found in the given DataFrame"):
+    with caplog.at_level(logging.WARNING):
         tf.fit(X)
+    assert "Columns ['col_3'] were not found in the given DataFrame, ignoring" in caplog.messages

     expected_col_names = set(['DIVERSITY_SCORE(col_1)',
                               'DIVERSITY_SCORE(col_2)',
@@ -160,6 +160,36 @@ def test_index_col_names():
     assert X_t.dtypes.all() == np.float64


+def test_int_col_names():
+    X = pd.DataFrame(
+        {475: ['I\'m singing in the rain! Just singing in the rain, what a glorious feeling, I\'m happy again!',
+               'In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.',
+               'I\'m gonna be the main event, like no king was before! I\'m brushing up on looking down, I\'m working on my ROAR!'],
+         -1: ['do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!',
+              'I dreamed a dream in days gone by, when hope was high and life worth living',
+              'Red, the blood of angry men - black, the dark of ages past']
+         })
+    tf = TextFeaturizer(text_columns=[475, -1])
+    tf.fit(X)
+    expected_col_names = set(['DIVERSITY_SCORE(475)',
+                              'DIVERSITY_SCORE(-1)',
+                              'LSA(475)[0]',
+                              'LSA(475)[1]',
+                              'LSA(-1)[0]',
+                              'LSA(-1)[1]',
+                              'MEAN_CHARACTERS_PER_WORD(475)',
+                              'MEAN_CHARACTERS_PER_WORD(-1)',
+                              'POLARITY_SCORE(475)',
+                              'POLARITY_SCORE(-1)'])
+    for i in range(15):
+        expected_col_names.add(f'PART_OF_SPEECH_COUNT(475)[{i}]')
+        expected_col_names.add(f'PART_OF_SPEECH_COUNT(-1)[{i}]')
+    X_t = tf.transform(X)
+    assert set(X_t.columns) == expected_col_names
+    assert len(X_t.columns) == 40
+    assert X_t.dtypes.all() == np.float64
+
+
 def test_diversity_primitive_output():
     X = pd.DataFrame(
         {'diverse': ['This is a very diverse string which does not contain any repeated words at all',
@@ -182,9 +212,9 @@ def test_lsa_primitive_output():
     tf = TextFeaturizer(text_columns=['lsa'])
     tf.fit(X)

-    expected_features = [[0.0200961, 0.002976],
-                         [0.0223392, 0.0058817],
-                         [0.0186072, -0.0006121]]
+    expected_features = [[0.832, 0.],
+                         [0., 1.],
+                         [0.832, 0.]]
     X_t = tf.transform(X)
     cols = [col for col in X_t.columns if 'LSA' in col]
     features = X_t[cols]
diff --git a/evalml/tests/component_tests/test_utils.py b/evalml/tests/component_tests/test_utils.py
index bf960e5141..5a7294df6b 100644
--- a/evalml/tests/component_tests/test_utils.py
+++ b/evalml/tests/component_tests/test_utils.py
@@ -12,9 +12,9 @@

 def test_all_components(has_minimal_dependencies):
     if has_minimal_dependencies:
-        assert len(all_components) == 22
+        assert len(all_components) == 23
     else:
-        assert len(all_components) == 26
+        assert len(all_components) == 27


 def test_handle_component_class_names():
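For reference, once this patch is applied the new component can be exercised on its own, along the lines of the unit tests in test_lsa.py above. The snippet below is a minimal sketch only; the DataFrame contents and the 'notes' column name are illustrative and not part of the patch.

import pandas as pd

from evalml.pipelines.components import LSA

# Three short documents in a single, hypothetical text column named 'notes'.
X = pd.DataFrame({'notes': ['do you hear the people sing? singing the songs of angry men',
                            'I dreamed a dream in days gone by, when hope was high',
                            'red, the blood of angry men - black, the dark of ages past']})

# Fit TF-IDF followed by TruncatedSVD on the text column.
lsa = LSA(text_columns=['notes'], random_state=0)
lsa.fit(X)

# transform drops the original 'notes' column and adds two float features,
# 'LSA(notes)[0]' and 'LSA(notes)[1]', as described in the transform docstring.
X_t = lsa.transform(X)
print(X_t.columns.tolist())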