Add LSA Component #1022

Merged: 13 commits merged on Aug 11, 2020
3 changes: 3 additions & 0 deletions docs/source/release_notes.rst
@@ -5,7 +5,10 @@ Release Notes
* Enhancements
* Split `fill_value` into `categorical_fill_value` and `numeric_fill_value` for Imputer :pr:`1019`
* Added `explain_predictions` and `explain_predictions_best_worst` for explaining multiple predictions with SHAP :pr:`1016`
* Added new LSA component for text featurization :pr:`1022`
* Fixes
* Updated TextFeaturizer component to no longer require an internet connection to run :pr:`1022`
* Fixed non-deterministic element of TextFeaturizer transformations :pr:`1022`
* Changes
* Documentation Changes
* Update setup.py URL to point to the github repo :pr:`1037`
3 changes: 2 additions & 1 deletion evalml/pipelines/components/__init__.py
@@ -32,5 +32,6 @@
DropNullColumns,
DateTimeFeaturizer,
SelectColumns,
TextFeaturizer
TextFeaturizer,
LSA,
)
2 changes: 1 addition & 1 deletion evalml/pipelines/components/transformers/__init__.py
@@ -5,4 +5,4 @@
from .imputers import PerColumnImputer, SimpleImputer, Imputer
from .scalers import StandardScaler
from .column_selectors import DropColumns, SelectColumns
from .preprocessing import DateTimeFeaturizer, DropNullColumns, TextFeaturizer
from .preprocessing import DateTimeFeaturizer, DropNullColumns, LSA, TextFeaturizer
1 change: 1 addition & 0 deletions evalml/pipelines/components/transformers/preprocessing/__init__.py
@@ -1,4 +1,5 @@
# flake8:noqa
from .datetime_featurizer import DateTimeFeaturizer
from .drop_null_columns import DropNullColumns
from .lsa import LSA
from .text_featurizer import TextFeaturizer
75 changes: 75 additions & 0 deletions evalml/pipelines/components/transformers/preprocessing/lsa.py
@@ -0,0 +1,75 @@
import logging

import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

from evalml.pipelines.components.transformers import Transformer

logger = logging.getLogger()


class LSA(Transformer):
"""Transformer to calculate the Latent Semantic Analysis Values of text input"""
name = "LSA Transformer"
hyperparameter_ranges = {}

def __init__(self, text_columns=None, random_state=0, **kwargs):
"""Creates a transformer to perform TF-IDF transformation and Singular Value Decomposition for text columns.

Arguments:
text_columns (list): list of feature names which should be treated as text features.
random_state (int, np.random.RandomState): Seed for the random number generator.
"""
parameters = {'text_columns': text_columns}
text_columns = text_columns or []
parameters.update(kwargs)

self._text_col_names = text_columns
self._lsa_pipeline = make_pipeline(TfidfVectorizer(), TruncatedSVD(random_state=random_state))
super().__init__(parameters=parameters,
component_obj=None,
random_state=random_state)

def _verify_col_names(self, col_names):
missing_cols = [col for col in self._text_col_names if col not in col_names]

if len(missing_cols) > 0:
if len(missing_cols) == len(self._text_col_names):
raise RuntimeError("None of the provided text column names match the columns in the given DataFrame")
for col in missing_cols:
self._text_col_names.remove(col)
logger.warn("Columns {} were not found in the given DataFrame, ignoring".format(missing_cols))

def fit(self, X, y=None):
if len(self._text_col_names) == 0:
return self
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
self._verify_col_names(X.columns)

corpus = X[self._text_col_names].values.flatten()
self._lsa_pipeline.fit(corpus)
return self

def transform(self, X, y=None):
"""Transforms data X by applying the LSA pipeline.
Arguments:
X (pd.DataFrame): Data to transform
y (pd.Series, optional): Targets
Returns:
pd.DataFrame: Transformed X. The original column is removed and replaced with two columns of the
format `LSA(original_column_name)[feature_number]`, where `feature_number` is 0 or 1.
"""
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
X_t = X

for col in self._text_col_names:
transformed = self._lsa_pipeline.transform(X[col])

X_t['LSA({})[0]'.format(col)] = pd.Series(transformed[:, 0])
X_t['LSA({})[1]'.format(col)] = pd.Series(transformed[:, 1])
Contributor: Not blocking: what do you think of doing this for the naming: LSA(my_feature, 0) and LSA(my_feature, 1)?

Contributor Author: I like it! I only kept this formatting to mirror what the primitives' generated column names look like, but I can change this if you'd prefer.

X_t = X_t.drop(columns=self._text_col_names)
return X_t
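
For reviewers who want to try the new component locally, here is a minimal usage sketch. It is a hedged example, not part of the PR: the DataFrame, column name, and strings are hypothetical, and it assumes the import path exposed in the `__init__.py` changes above.

```python
import pandas as pd

from evalml.pipelines.components import LSA

# Hypothetical toy data: a single text column named "review"
X = pd.DataFrame({"review": [
    "I loved this product and would buy it again",
    "Terrible quality, arrived broken",
    "It was okay, nothing special",
]})

# Fit the TF-IDF + TruncatedSVD pipeline on the text column
lsa = LSA(text_columns=["review"], random_state=0)
lsa.fit(X)

# The original column is dropped and replaced by two LSA features
X_t = lsa.transform(X)
print(X_t.columns.tolist())  # ['LSA(review)[0]', 'LSA(review)[1]']
```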
evalml/pipelines/components/transformers/preprocessing/text_featurizer.py
@@ -1,11 +1,14 @@
import logging
import string
import warnings

import pandas as pd

from evalml.pipelines.components.transformers import Transformer
from evalml.pipelines.components.transformers.preprocessing import LSA
from evalml.utils import import_or_raise

logger = logging.getLogger()


class TextFeaturizer(Transformer):
"""Transformer that can automatically featurize text columns."""
@@ -16,24 +19,20 @@ def __init__(self, text_columns=None, random_state=0, **kwargs):
"""Extracts features from text columns using featuretools' nlp_primitives

Arguments:
text_colums (list): list of `pd.DataFrame` column names that contain text.
text_columns (list): list of feature names which should be treated as text features.
random_state (int, np.random.RandomState): Seed for the random number generator.

"""
self._ft = import_or_raise("featuretools", error_msg="Package featuretools is not installed. Please install using `pip install featuretools[nlp_primitives].`")
self._nlp_primitives = import_or_raise("nlp_primitives", error_msg="Package nlp_primitives is not installed. Please install using `pip install featuretools[nlp_primitives].`")

text_columns = text_columns or []
parameters = {'text_columns': text_columns}
text_columns = text_columns or []
parameters.update(kwargs)

if len(text_columns) == 0:
warnings.warn("No text columns were given to TextFeaturizer, component will have no effect", RuntimeWarning)
Comment on lines -30 to -31 (Contributor Author): Moved this warning from __init__ to fit to temporarily resolve #1017

for i, col_name in enumerate(text_columns):
if not isinstance(col_name, str):
text_columns[i] = str(col_name)
self.text_col_names = text_columns
self._features = None
self._lsa = LSA(text_columns=text_columns, random_state=random_state)
self._text_col_names = text_columns
super().__init__(parameters=parameters,
component_obj=None,
random_state=random_state)
@@ -44,50 +43,47 @@ def normalize(text):
text = text.translate(str.maketrans('', '', string.punctuation))
return text.lower()

for text_col in self.text_col_names:
for text_col in self._text_col_names:
X[text_col] = X[text_col].apply(normalize)
return X

def _verify_col_names(self, col_names):
missing_cols = []
for col in self.text_col_names:
if col not in col_names:
missing_cols.append(col)
missing_cols = [col for col in self._text_col_names if col not in col_names]

if len(missing_cols) > 0:
if len(missing_cols) == len(self.text_col_names):
if len(missing_cols) == len(self._text_col_names):
raise RuntimeError("None of the provided text column names match the columns in the given DataFrame")
for col in missing_cols:
self.text_col_names.remove(col)
warnings.warn("Columns {} were not found in the given DataFrame, ignoring".format(missing_cols), RuntimeWarning)
self._text_col_names.remove(col)
logger.warn("Columns {} were not found in the given DataFrame, ignoring".format(missing_cols))

def _verify_col_types(self, entity_set):
var_types = entity_set.entities[0].variable_types
for col in self.text_col_names:
if var_types[col] is not self._ft.variable_types.variable.Text:
for col in self._text_col_names:
if var_types[str(col)] is not self._ft.variable_types.variable.Text:
raise ValueError("Column {} is not a text column, cannot apply TextFeaturizer component".format(col))

def fit(self, X, y=None):
if len(self.text_col_names) == 0:
if len(self._text_col_names) == 0:
self._features = []
return self
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X).rename(columns=str)
X = pd.DataFrame(X)
self._verify_col_names(X.columns)
X_text = X[self.text_col_names]
X_text = X[self._text_col_names]
X_text['index'] = range(len(X_text))

es = self._ft.EntitySet()
es = es.entity_from_dataframe(entity_id='X', dataframe=X_text, index='index')
es = es.entity_from_dataframe(entity_id='X', dataframe=X_text.rename(columns=str), index='index')
self._verify_col_types(es)
es.df = self._clean_text(X)

trans = [self._nlp_primitives.DiversityScore,
self._nlp_primitives.LSA,
self._nlp_primitives.MeanCharactersPerWord,
self._nlp_primitives.PartOfSpeechCount,
self._nlp_primitives.PolarityScore]

self._lsa.fit(X)
self._features = self._ft.dfs(entityset=es,
target_entity='X',
trans_primitives=trans,
@@ -108,20 +104,21 @@ def transform(self, X, y=None):
X = pd.DataFrame(X)
if self._features is None or len(self._features) == 0:
return X
X = X.rename(columns=str)
self._verify_col_names(X.columns)

X_text = X[self.text_col_names]
X_text = X[self._text_col_names]
X_lsa = self._lsa.transform(X_text)

X_text['index'] = range(len(X_text))
X_t = X.drop(self.text_col_names, axis=1)
X_t = X.drop(self._text_col_names, axis=1)

es = self._ft.EntitySet()
es = es.entity_from_dataframe(entity_id='X', dataframe=X_text, index='index')
es = es.entity_from_dataframe(entity_id='X', dataframe=X_text.rename(columns=str), index='index')
self._verify_col_types(es)
es.df = self._clean_text(X)

feature_matrix = self._ft.calculate_feature_matrix(features=self._features,
entityset=es,
verbose=True)
X_t = pd.concat([X_t, feature_matrix.reindex(X.index)], axis=1)
X_t = pd.concat([X_t, feature_matrix.reindex(X.index), X_lsa], axis=1)
Contributor: @eccabay why is the feature_matrix.reindex(X.index) necessary? I know that's not part of this PR, I'm just poking around finding ways we can simplify our row/column indexing across the board.

Contributor Author: If I recall correctly, it was because the outputted feature_matrix sets its own indices, so it helped to reset to what was originally given.

return X_t
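
As a generic aside on the reindex question above, the sketch below shows why index alignment matters when a computed feature matrix is concatenated back onto the original frame. The frames and column names are hypothetical pandas examples, not evalml internals.

```python
import pandas as pd

# Hypothetical original frame whose index is not the default RangeIndex
X = pd.DataFrame({"age": [25, 31, 47]}, index=[10, 11, 12])

# Hypothetical computed features that came back with their own 0-based index
feature_matrix = pd.DataFrame({"word_count(text)": [4, 7, 3]})

# pd.concat(axis=1) aligns rows by index label, not by position,
# so mismatched indices NaN-pad both sides instead of pairing rows up
print(pd.concat([X, feature_matrix], axis=1))

# Giving the computed features the original index restores the row pairing
print(pd.concat([X, feature_matrix.set_index(X.index)], axis=1))
```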