Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add LSA Component #1022

Merged
merged 13 commits into from
Aug 11, 2020
3 changes: 3 additions & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@ Release Notes

**Future Releases**
* Enhancements
* Added new LSA component for text featurization :pr:`1022`
* Fixes
* Updated TextFeaturizer component to no longer require an internet connection to run :pr:`1022`
* Fixed non-deterministic element of TextFeaturizer transformations :pr:`1022`
eccabay marked this conversation as resolved.
Show resolved Hide resolved
* Changes
* Removed DeprecationWarning for SimpleImputer :pr:`1018`
* Documentation Changes
Expand Down
3 changes: 2 additions & 1 deletion evalml/pipelines/components/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,5 +32,6 @@
DropNullColumns,
DateTimeFeaturizer,
SelectColumns,
TextFeaturizer
TextFeaturizer,
LSA,
)
2 changes: 1 addition & 1 deletion evalml/pipelines/components/transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@
from .imputers import PerColumnImputer, SimpleImputer, Imputer
from .scalers import StandardScaler
from .column_selectors import DropColumns, SelectColumns
from .preprocessing import DateTimeFeaturizer, DropNullColumns, TextFeaturizer
from .preprocessing import DateTimeFeaturizer, DropNullColumns, LSA, TextFeaturizer
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# flake8:noqa
from .datetime_featurizer import DateTimeFeaturizer
from .drop_null_columns import DropNullColumns
from .lsa import LSA
from .text_featurizer import TextFeaturizer
86 changes: 86 additions & 0 deletions evalml/pipelines/components/transformers/preprocessing/lsa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import warnings

import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

from evalml.pipelines.components.transformers import Transformer


class LSA(Transformer):
    """Transformer to calculate the Latent Semantic Analysis Values of text input"""
    name = "LSA Transformer"
    hyperparameter_ranges = {}

    def __init__(self, text_columns=None, random_state=0, **kwargs):
        """Creates a transformer to perform TF-IDF transformation and Singular Value Decomposition for text columns.

        Arguments:
            text_columns (list): list of `pd.DataFrame` column names that contain text.
            random_state (int, np.random.RandomState): Seed for the random number generator.
        """
        text_columns = text_columns or []
        parameters = {'text_columns': text_columns}
        parameters.update(kwargs)

        # Column names are normalised to `str` in place. When a non-empty list
        # is passed in, that list, `parameters['text_columns']` and
        # `self.text_col_names` are all the same object, so later pruning in
        # `_verify_col_names` is visible through each of them.
        for i, col_name in enumerate(text_columns):
            if not isinstance(col_name, str):
                text_columns[i] = str(col_name)
        self.text_col_names = text_columns
        self.lsa_pipeline = make_pipeline(TfidfVectorizer(), TruncatedSVD(random_state=random_state))
        super().__init__(parameters=parameters,
                         component_obj=None,
                         random_state=random_state)

    def _verify_col_names(self, col_names):
        """Drops configured text columns that are absent from `col_names`.

        Arguments:
            col_names (iterable): Column labels of the data being fitted.

        Raises:
            RuntimeError: If none of the configured text columns are present.

        Warns:
            RuntimeWarning: If only some of the configured text columns are missing.
        """
        missing_cols = [col for col in self.text_col_names if col not in col_names]
        if not missing_cols:
            return

        if len(missing_cols) == len(self.text_col_names):
            raise RuntimeError("None of the provided text column names match the columns in the given DataFrame")
        # Removal happens in place on the shared list (see __init__).
        for col in missing_cols:
            self.text_col_names.remove(col)
        warnings.warn("Columns {} were not found in the given DataFrame, ignoring".format(missing_cols), RuntimeWarning)

    def fit(self, X, y=None):
        """Fits the TF-IDF + SVD pipeline on the text of every configured column.

        Arguments:
            X (pd.DataFrame or array-like): Data containing the text columns.
            y (pd.Series, optional): Targets, unused.

        Returns:
            self
        """
        if len(self.text_col_names) == 0:
            warnings.warn("No text columns were given to LSA, component has no effect", RuntimeWarning)
            return self
        if not isinstance(X, pd.DataFrame):
            # Non-DataFrame input gets positional labels; stringify them so they
            # can match the stringified names stored in `text_col_names`.
            X = pd.DataFrame(X).rename(columns=str)
        self._verify_col_names(X.columns)

        # A single pipeline is fitted on the concatenation of all text columns.
        corpus = []
        for col in self.text_col_names:
            corpus.extend(X[col].values.tolist())

        self.lsa_pipeline.fit(corpus)
        return self

    def transform(self, X, y=None):
        """Transforms data X by applying the LSA pipeline.

        Each configured text column is replaced by one column per SVD component,
        named `LSA(<column>)[<component index>]`.

        Arguments:
            X (pd.DataFrame): Data to transform
            y (pd.Series, optional): Targets

        Returns:
            pd.DataFrame: Transformed X
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        X_t = X

        for col in self.text_col_names:
            # Only the column lookup is guarded: names are stored as str, but a
            # frame built from array-like input is labelled with ints.
            try:
                label = col
                text = X[label]
            except KeyError:
                label = int(col)
                text = X[label]

            transformed = self.lsa_pipeline.transform(text)
            X_t = X_t.drop(labels=label, axis=1)

            # Assign the raw ndarray columns positionally. Wrapping them in a
            # pd.Series would give them a fresh RangeIndex, and pandas aligns on
            # index during column assignment, producing NaN for any input whose
            # index is not the default 0..n-1.
            # NOTE(review): a reviewer suggested naming these `LSA(col, 0)`; the
            # current format mirrors the primitives' generated column names.
            for component in range(transformed.shape[1]):
                X_t['LSA({})[{}]'.format(col, component)] = transformed[:, component]

        return X_t
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pandas as pd

from evalml.pipelines.components.transformers import Transformer
from evalml.pipelines.components.transformers.preprocessing import LSA
from evalml.utils import import_or_raise


Expand All @@ -27,13 +28,12 @@ def __init__(self, text_columns=None, random_state=0, **kwargs):
parameters = {'text_columns': text_columns}
parameters.update(kwargs)

if len(text_columns) == 0:
warnings.warn("No text columns were given to TextFeaturizer, component will have no effect", RuntimeWarning)
Comment on lines -30 to -31
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moved this warning from __init__ to fit to temporarily resolve #1017

for i, col_name in enumerate(text_columns):
if not isinstance(col_name, str):
text_columns[i] = str(col_name)
self.text_col_names = text_columns
self._features = None
self._lsa = LSA(text_columns=text_columns, random_state=random_state)
eccabay marked this conversation as resolved.
Show resolved Hide resolved
super().__init__(parameters=parameters,
component_obj=None,
random_state=random_state)
Expand Down Expand Up @@ -69,6 +69,7 @@ def _verify_col_types(self, entity_set):

def fit(self, X, y=None):
if len(self.text_col_names) == 0:
warnings.warn("No text columns were given to TextFeaturizer, component has no effect", RuntimeWarning)
eccabay marked this conversation as resolved.
Show resolved Hide resolved
self._features = []
return self
if not isinstance(X, pd.DataFrame):
Expand All @@ -83,11 +84,11 @@ def fit(self, X, y=None):
es.df = self._clean_text(X)

trans = [self._nlp_primitives.DiversityScore,
self._nlp_primitives.LSA,
self._nlp_primitives.MeanCharactersPerWord,
self._nlp_primitives.PartOfSpeechCount,
self._nlp_primitives.PolarityScore]

self._lsa.fit(X)
self._features = self._ft.dfs(entityset=es,
target_entity='X',
trans_primitives=trans,
Expand All @@ -112,6 +113,8 @@ def transform(self, X, y=None):
self._verify_col_names(X.columns)

X_text = X[self.text_col_names]
X_lsa = self._lsa.transform(X_text)

X_text['index'] = range(len(X_text))
X_t = X.drop(self.text_col_names, axis=1)

Expand All @@ -123,5 +126,5 @@ def transform(self, X, y=None):
feature_matrix = self._ft.calculate_feature_matrix(features=self._features,
entityset=es,
verbose=True)
X_t = pd.concat([X_t, feature_matrix.reindex(X.index)], axis=1)
X_t = pd.concat([X_t, feature_matrix.reindex(X.index), X_lsa], axis=1)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@eccabay why is the feature_matrix.reindex(X.index) necessary? I know that's not part of this PR, I'm just poking around finding ways we can simplify our row/column indexing across the board.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I recall correctly, it was because the outputted feature_matrix sets its own indices, so it helped to reset to what was originally given.

return X_t
128 changes: 128 additions & 0 deletions evalml/tests/component_tests/test_lsa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
import numpy as np
import pandas as pd
import pytest

from evalml.pipelines.components import LSA


@pytest.fixture()
def text_df():
    """Yields a three-row DataFrame with two free-text columns, `col_1` and `col_2`."""
    first_column = [
        'I\'m singing in the rain! Just singing in the rain, what a glorious feeling, I\'m happy again!',
        'In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.',
        'I\'m gonna be the main event, like no king was before! I\'m brushing up on looking down, I\'m working on my ROAR!',
    ]
    second_column = [
        'do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!',
        'I dreamed a dream in days gone by, when hope was high and life worth living',
        'Red, the blood of angry men - black, the dark of ages past',
    ]
    yield pd.DataFrame({'col_1': first_column, 'col_2': second_column})


def test_lsa_only_text(text_df):
    """LSA on a frame of only text columns yields two float features per column."""
    X = text_df
    lsa = LSA(text_columns=['col_1', 'col_2'])
    lsa.fit(X)

    expected_col_names = set(['LSA(col_1)[0]',
                              'LSA(col_1)[1]',
                              'LSA(col_2)[0]',
                              'LSA(col_2)[1]'])
    X_t = lsa.transform(X)
    # Column order is deliberately not asserted; only the set of names is compared.
    assert set(X_t.columns) == expected_col_names
    # The explicit count catches duplicate column names, which the set
    # comparison above would silently collapse.
    assert len(X_t.columns) == 4
    # Check every column's dtype; `X_t.dtypes.all() == np.float64` reduces the
    # dtypes to a single value first and so does not test each column.
    assert all(dtype == np.float64 for dtype in X_t.dtypes)


def test_lsa_with_nontext(text_df):
    """LSA replaces the text columns and leaves a numeric column in place."""
    X = text_df
    X['col_3'] = [73.7, 67.213, 92]
    lsa = LSA(text_columns=['col_1', 'col_2'])

    lsa.fit(X)
    expected_col_names = set(['LSA(col_1)[0]',
                              'LSA(col_1)[1]',
                              'LSA(col_2)[0]',
                              'LSA(col_2)[1]',
                              'col_3'])
    X_t = lsa.transform(X)
    assert set(X_t.columns) == expected_col_names
    # Guards against duplicate column names, which the set comparison hides.
    assert len(X_t.columns) == 5
    # Check every column's dtype; `X_t.dtypes.all() == np.float64` reduces the
    # dtypes to a single value first and so does not test each column.
    assert all(dtype == np.float64 for dtype in X_t.dtypes)
eccabay marked this conversation as resolved.
Show resolved Hide resolved


def test_lsa_no_text():
    """With no text columns configured, fit warns and transform keeps both input columns."""
    frame = pd.DataFrame({'col_1': [1, 2, 3], 'col_2': [4, 5, 6]})
    component = LSA()
    expected_warning = "No text columns were given to LSA, component has no effect"

    with pytest.warns(RuntimeWarning, match=expected_warning):
        component.fit(frame)

    transformed = component.transform(frame)
    assert len(transformed.columns) == 2


def test_some_missing_col_names(text_df):
    """A configured column absent from the data triggers a warning and is ignored."""
    X = text_df
    lsa = LSA(text_columns=['col_1', 'col_2', 'col_3'])

    with pytest.warns(RuntimeWarning, match="not found in the given DataFrame"):
        lsa.fit(X)

    expected_col_names = set(['LSA(col_1)[0]',
                              'LSA(col_1)[1]',
                              'LSA(col_2)[0]',
                              'LSA(col_2)[1]'])
    X_t = lsa.transform(X)
    assert set(X_t.columns) == expected_col_names
    # Guards against duplicate column names, which the set comparison hides.
    assert len(X_t.columns) == 4
    # Check every column's dtype; `X_t.dtypes.all() == np.float64` reduces the
    # dtypes to a single value first and so does not test each column.
    assert all(dtype == np.float64 for dtype in X_t.dtypes)


def test_all_missing_col_names(text_df):
    """Fitting raises RuntimeError when none of the configured columns exist in the data."""
    component = LSA(text_columns=['col_3', 'col_4'])
    expected_message = "None of the provided text column names match the columns in the given DataFrame"

    with pytest.raises(RuntimeError, match=expected_message):
        component.fit(text_df)


def test_empty_text_column():
    """Fitting on a text column with no rows raises ValueError mentioning an empty vocabulary."""
    empty_frame = pd.DataFrame({'col_1': []})
    component = LSA(text_columns=['col_1'])

    with pytest.raises(ValueError, match="empty vocabulary"):
        component.fit(empty_frame)


def test_index_col_names():
    """Integer column identifiers work when the input is a plain numpy array."""
    X = np.array([['I\'m singing in the rain!$%^ do do do do do da do', 'do you hear the people sing?////////////////////////////////////'],
                  ['just singing in the rain.................. \n', 'singing the songs of angry men\n'],
                  ['\t\n\n\n\nWhat a glorious feelinggggggggggg, I\'m happy again!!! lalalalalalalalalalala', '\tIt is the music of a people who will NOT be slaves again!!!!!!!!!!!']])
    lsa = LSA(text_columns=[0, 1])

    lsa.fit(X)
    expected_col_names = set(['LSA(0)[0]',
                              'LSA(0)[1]',
                              'LSA(1)[0]',
                              'LSA(1)[1]'])
    X_t = lsa.transform(X)
    assert set(X_t.columns) == expected_col_names
    # Guards against duplicate column names, which the set comparison hides.
    assert len(X_t.columns) == 4
    # Check every column's dtype; `X_t.dtypes.all() == np.float64` reduces the
    # dtypes to a single value first and so does not test each column.
    assert all(dtype == np.float64 for dtype in X_t.dtypes)


def test_lsa_output():
    """Checks the numeric LSA features produced for a small fixed corpus, to three decimals."""
    documents = ['do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!',
                 'I dreamed a dream in days gone by, when hope was high and life worth living',
                 'Red, the blood of angry men - black, the dark of ages past']
    frame = pd.DataFrame({'lsa': documents})

    component = LSA(text_columns=['lsa'])
    component.fit(frame)
    result = component.transform(frame)

    # Select only the generated LSA feature columns.
    lsa_columns = [name for name in result.columns if 'LSA' in name]
    actual = result[lsa_columns]
    expected = [[0.832, 0.],
                [0., 1.],
                [0.832, 0.]]
    np.testing.assert_almost_equal(actual, expected, decimal=3)
14 changes: 7 additions & 7 deletions evalml/tests/component_tests/test_text_featurizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,11 @@ def test_featurizer_with_nontext(text_df):

def test_featurizer_no_text():
X = pd.DataFrame({'col_1': [1, 2, 3], 'col_2': [4, 5, 6]})
warn_msg = "No text columns were given to TextFeaturizer, component will have no effect"
with pytest.warns(RuntimeWarning, match=warn_msg):
tf = TextFeaturizer()
warn_msg = "No text columns were given to TextFeaturizer, component has no effect"
eccabay marked this conversation as resolved.
Show resolved Hide resolved
tf = TextFeaturizer()

tf.fit(X)
with pytest.warns(RuntimeWarning, match=warn_msg):
tf.fit(X)
X_t = tf.transform(X)
assert len(X_t.columns) == 2

Expand Down Expand Up @@ -182,9 +182,9 @@ def test_lsa_primitive_output():
tf = TextFeaturizer(text_columns=['lsa'])
tf.fit(X)

expected_features = [[0.0200961, 0.002976],
[0.0223392, 0.0058817],
[0.0186072, -0.0006121]]
expected_features = [[0.832, 0.],
[0., 1.],
[0.832, 0.]]
X_t = tf.transform(X)
cols = [col for col in X_t.columns if 'LSA' in col]
features = X_t[cols]
Expand Down
4 changes: 2 additions & 2 deletions evalml/tests/component_tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@

def test_all_components(has_minimal_dependencies):
if has_minimal_dependencies:
assert len(all_components) == 22
assert len(all_components) == 23
else:
assert len(all_components) == 26
assert len(all_components) == 27


def test_handle_component_class_names():
Expand Down