
Add NaturalLanguage Variable Type and deprecate Text #1159

Merged
merged 16 commits on Sep 25, 2020
6 changes: 3 additions & 3 deletions docs/source/api_reference.rst
@@ -166,8 +166,8 @@ Cumulative Transform Primitives
CumMin
CumMax

Text Transform Primitives
*************************
NaturalLanguage Transform Primitives
************************************
.. autosummary::
:toctree: generated/

@@ -345,7 +345,7 @@ Variable types
Categorical
Ordinal
Boolean
Text
NaturalLanguage
LatLong
ZIPCode
IPAddress
4 changes: 2 additions & 2 deletions docs/source/automated_feature_engineering/primitives.rst
@@ -112,7 +112,7 @@ Simple Custom Primitives
.. ipython :: python

from featuretools.primitives import make_agg_primitive, make_trans_primitive
from featuretools.variable_types import Text, Numeric
from featuretools.variable_types import NaturalLanguage, Numeric

def absolute(column):
return abs(column)
@@ -171,7 +171,7 @@ Next, we need to create a custom primitive from the ``word_count`` function.
.. ipython :: python

WordCount = make_trans_primitive(function=word_count,
input_types=[Text],
input_types=[NaturalLanguage],
return_type=Numeric)

.. ipython :: python
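For orientation, the collapsed context above holds the ``word_count`` helper. A minimal sketch of the full pattern, assuming a plausible body for the collapsed function (only the ``make_trans_primitive`` call is taken verbatim from the diff):

import pandas as pd

from featuretools.primitives import make_trans_primitive
from featuretools.variable_types import NaturalLanguage, Numeric

# Plausible reconstruction of the collapsed example: count the
# whitespace-separated words in each row of a text column.
def word_count(column):
    return pd.Series([len(str(value).split()) for value in column])

# After this PR, custom text primitives declare NaturalLanguage instead of Text.
WordCount = make_trans_primitive(function=word_count,
                                 input_types=[NaturalLanguage],
                                 return_type=Numeric)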
16 changes: 10 additions & 6 deletions docs/source/changelog.rst
@@ -3,18 +3,22 @@
Changelog
---------
**Future Release**
.. warning::
    The Text variable type has been deprecated and replaced with the NaturalLanguage variable type. Text will be removed in a future release. (A short migration sketch appears below, after the list of contributors.)

* Enhancements
* Fixes
* Allow FeatureOutputSlice features to be serialized (:pr:`1150`)
* Fix duplicate label column generation when labels are passed in cutoff times and approximate is being used (:pr:`1160`)
* Changes
* Text variable type has been replaced with NaturalLanguage (:pr:`1159`)
* Documentation Changes
* Update release doc for clarity and to add Future Release template (:pr:`1151`)
* Testing Changes
* Stop requiring single-threaded dask scheduler in tests (:pr:`1163`)

Thanks to the following people for contributing to this release:
:user:`rwedge`, :user:`tamargrey`, :user: `tuethan1999`
    :user:`rwedge`, :user:`tamargrey`, :user:`tuethan1999`, :user:`gsheni`
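As exercised by the new ``test_text_deprecation_warning`` test later in this diff, passing ``Text`` (or the ``'text'`` string) still resolves but emits a ``FutureWarning``, while ``NaturalLanguage`` stays silent. A minimal migration sketch (entity and column names are illustrative):

import pandas as pd
import featuretools as ft

df = pd.DataFrame({"id": [0, 1], "notes": ["first order", "second order"]})

# Deprecated: still works, but warns
# "Text has been deprecated. Please use NaturalLanguage instead."
es = ft.EntitySet()
es = es.entity_from_dataframe(entity_id="orders", dataframe=df, index="id",
                              variable_types={"notes": ft.variable_types.Text})

# Preferred from this release on:
es = ft.EntitySet()
es = es.entity_from_dataframe(entity_id="orders", dataframe=df, index="id",
                              variable_types={"notes": ft.variable_types.NaturalLanguage})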

**v0.19.0 Sept 8, 2020**
* Enhancements
@@ -30,8 +34,8 @@ Changelog
* Added return values to dfs and calculate_feature_matrix (:pr:`1125`)
* Testing Changes
* Better test case for normalizing from no time index to time index (:pr:`1113`)
\* When passing multiple instances of a primitive built with ``make_trans_primitive``

\* When passing multiple instances of a primitive built with ``make_trans_primitive``
or ``make_agg_primitive``, those instances must have the same relative order when passed
to ``dfs`` to ensure a consistent ordering of features.

@@ -41,9 +45,9 @@

**Breaking Changes**

* ``ft.dfs`` will no longer build features from Transform primitives where one
of the inputs is a Transform feature, a GroupByTransform feature,
or a Direct Feature of a Transform / GroupByTransform feature. This will make some
features that would previously be generated by ``ft.dfs`` only possible if
explicitly specified in ``seed_features``.

14 changes: 7 additions & 7 deletions docs/source/guides/advanced_custom_primitives.rst
@@ -8,9 +8,9 @@ Functions With Additional Arguments

import featuretools as ft
from featuretools.primitives import make_trans_primitive
from featuretools.variable_types import Text, Numeric, Categorical
from featuretools.variable_types import NaturalLanguage, Numeric, Categorical

One caveat with the make\_primitive functions is that the required arguments of ``function`` must be input features. Here we create a function for ``StringCount``, a primitive which counts the number of occurrences of a string in a ``Text`` input. Since ``string`` is not a feature, it needs to be a keyword argument to ``string_count``.
One caveat with the make\_primitive functions is that the required arguments of ``function`` must be input features. Here we create a function for ``StringCount``, a primitive which counts the number of occurrences of a string in a ``NaturalLanguage`` input. Since ``string`` is not a feature, it needs to be a keyword argument to ``string_count``.

.. ipython:: python

@@ -34,7 +34,7 @@ Now that we have the function, we create the primitive using the ``make_trans_primitive``
.. ipython:: python

StringCount = make_trans_primitive(function=string_count,
input_types=[Text],
input_types=[NaturalLanguage],
return_type=Numeric,
cls_attributes={"generate_name": string_count_generate_name})
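The collapsed ipython block above holds the definitions of ``string_count`` and ``string_count_generate_name``. A hypothetical reconstruction that illustrates the caveat: the feature input is positional, while ``string`` must be a keyword argument because it is a parameter rather than a feature.

import pandas as pd

def string_count(column, string=None):
    # `string` is a primitive parameter, not an input feature, so it is
    # passed as a keyword argument with a default.
    assert string is not None, "string to count needs to be defined"
    return pd.Series([str(value).lower().count(string) for value in column])

def string_count_generate_name(self, base_feature_names):
    # Embeds the configured string in the generated feature name,
    # e.g. STRING_COUNT(comments, "the")
    return u'STRING_COUNT(%s, "%s")' % (base_feature_names[0], self.kwargs['string'])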

@@ -63,7 +63,7 @@ Features with Multiple Outputs
import numpy as np
import re
from featuretools.primitives import make_trans_primitive
from featuretools.variable_types import Text, Numeric
from featuretools.variable_types import NaturalLanguage, Numeric

With the ``make_primitive`` functions, it is possible to have multiple columns output from a single feature. In order to do that, the output must be formatted as a list of arrays/series where each item in the list corresponds to an output from the primitive. In each of these list items (either arrays or series), there must be one element for each input element.

@@ -84,7 +84,7 @@ We must use the ``number_output_features`` attribute to specify the number of outputs
.. ipython:: python

CaseCount = make_trans_primitive(function=case_count,
input_types=[Text],
input_types=[NaturalLanguage],
return_type=Numeric,
number_output_features=2)

@@ -110,9 +110,9 @@ When we call ``dfs`` on this entityset, there are 6 instances (one for each of t
.. from featuretools.primitives import TransformPrimitive

.. class Sentiment(TransformPrimitive):
.. '''Reads in a text field and returns "negative", "neutral", or "positive"'''
.. '''Reads in a NaturalLanguage field and returns "negative", "neutral", or "positive"'''
.. name = "sentiment"
.. input_types = [Text]
.. input_types = [NaturalLanguage]
.. return_type = Categorical
.. def get_function(self):
.. filepath = self.get_filepath('sentiment_model.pickle') # returns absolute path to the file
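The multiple-output section above collapses the ``case_count`` definition. A plausible sketch consistent with the visible ``make_trans_primitive`` call: the function returns a list with one array per output feature, and each array holds one element per input row.

import re

import numpy as np

from featuretools.primitives import make_trans_primitive
from featuretools.variable_types import NaturalLanguage, Numeric

def case_count(array):
    # One array per output feature; element i of each array corresponds
    # to input row i, as required by the docs above.
    upper = np.array([len(re.findall('[A-Z]', str(value))) for value in array])
    lower = np.array([len(re.findall('[a-z]', str(value))) for value in array])
    return [upper, lower]

CaseCount = make_trans_primitive(function=case_count,
                                 input_types=[NaturalLanguage],
                                 return_type=Numeric,
                                 number_output_features=2)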
2 changes: 1 addition & 1 deletion featuretools/demo/retail.py
Expand Up @@ -79,7 +79,7 @@ def load_retail(id='demo_retail_data', nrows=None, return_single_table=False):
index="order_product_id",
make_index=True,
time_index="order_date",
variable_types={'description': vtypes.Text})
variable_types={'description': vtypes.NaturalLanguage})

es.normalize_entity(new_entity_id="products",
base_entity_id="order_products",
4 changes: 3 additions & 1 deletion featuretools/entityset/entity.py
@@ -20,7 +20,7 @@
_check_timedelta,
_dataframes_equal
)
from featuretools.variable_types import find_variable_types
from featuretools.variable_types import Text, find_variable_types

ks = import_or_none('databricks.koalas')

@@ -294,6 +294,8 @@ def _create_variables(self, variable_types, index, time_index, secondary_time_index):
variables = []
variable_types = variable_types.copy() or {}
string_to_class_map = find_variable_types()
# TODO: Remove once Text has been removed from variable types
string_to_class_map[Text.type_string] = Text
for vid in variable_types.copy():
vtype = variable_types[vid]
if isinstance(vtype, str):
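For context on why the old type string must stay resolvable: the map above re-registers ``Text`` under ``'text'``. One way the ``FutureWarning`` seen in the new tests could be produced (a hypothetical sketch, not necessarily the PR's verbatim implementation) is for ``Text`` to subclass ``NaturalLanguage`` and warn on construction:

import warnings

from featuretools.variable_types import NaturalLanguage

class Text(NaturalLanguage):
    type_string = "text"

    def __init__(self, id, entity, name=None):
        # Message matches the one asserted in test_text_deprecation_warning
        msg = "Text has been deprecated. Please use NaturalLanguage instead."
        warnings.warn(msg, FutureWarning)
        super(Text, self).__init__(id, entity, name=name)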
2 changes: 1 addition & 1 deletion featuretools/entityset/serialize.py
@@ -13,7 +13,7 @@
ks = import_or_none('databricks.koalas')

FORMATS = ['csv', 'pickle', 'parquet']
SCHEMA_VERSION = "4.0.0"
SCHEMA_VERSION = "5.0.0"


def entity_to_description(entity):
2 changes: 1 addition & 1 deletion featuretools/feature_base/features_serializer.py
@@ -7,7 +7,7 @@
from featuretools.utils.wrangle import _is_s3, _is_url
from featuretools.version import __version__ as ft_version

SCHEMA_VERSION = "5.0.0"
SCHEMA_VERSION = "6.0.0"


def save_features(features, location=None, profile_name=None):
6 changes: 3 additions & 3 deletions featuretools/primitives/standard/transform_primitive.py
@@ -14,9 +14,9 @@
Datetime,
DatetimeTimeIndex,
LatLong,
NaturalLanguage,
Numeric,
Ordinal,
Text,
Variable
)

@@ -326,7 +326,7 @@ class NumCharacters(TransformPrimitive):
[16, 11, 6]
"""
name = 'num_characters'
input_types = [Text]
input_types = [NaturalLanguage]
return_type = Numeric
compatibility = [Library.PANDAS, Library.DASK, Library.KOALAS]

@@ -348,7 +348,7 @@ class NumWords(TransformPrimitive):
[4, 2, 1, 6]
"""
name = 'num_words'
input_types = [Text]
input_types = [NaturalLanguage]
return_type = Numeric
compatibility = [Library.PANDAS, Library.DASK, Library.KOALAS]

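A quick usage sketch for the two updated primitives, assuming their pandas implementations accept a Series (the example strings are illustrative):

import pandas as pd

from featuretools.primitives import NumCharacters, NumWords

strings = pd.Series(['This is a string', 'second item', 'abcdef'])

num_characters = NumCharacters().get_function()
num_words = NumWords().get_function()

print(num_characters(strings).tolist())  # [16, 11, 6]
print(num_words(strings).tolist())       # [4, 2, 1]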
22 changes: 11 additions & 11 deletions featuretools/tests/entityset_tests/test_dask_es.py
@@ -85,7 +85,7 @@ def test_add_last_time_indexes():
"id": ft.variable_types.Id,
"user": ft.variable_types.Id,
"time": ft.variable_types.DatetimeTimeIndex,
"strings": ft.variable_types.Text
"strings": ft.variable_types.NaturalLanguage
}

transactions = pd.DataFrame({"id": [0, 1, 2, 3, 4, 5],
@@ -159,7 +159,7 @@ def test_single_table_dask_entityset():
"id": ft.variable_types.Id,
"values": ft.variable_types.Numeric,
"dates": ft.variable_types.Datetime,
"strings": ft.variable_types.Text
"strings": ft.variable_types.NaturalLanguage
}
dask_es.entity_from_dataframe(entity_id="data",
dataframe=values_dd,
@@ -174,7 +174,7 @@ def test_single_table_dask_entityset():
pd_es.entity_from_dataframe(entity_id="data",
dataframe=df,
index="id",
variable_types={"strings": ft.variable_types.Text})
variable_types={"strings": ft.variable_types.NaturalLanguage})

fm, _ = ft.dfs(entityset=pd_es,
target_entity="data",
@@ -204,7 +204,7 @@ def test_single_table_dask_entityset_ids_not_sorted():
"id": ft.variable_types.Id,
"values": ft.variable_types.Numeric,
"dates": ft.variable_types.Datetime,
"strings": ft.variable_types.Text
"strings": ft.variable_types.NaturalLanguage
}
dask_es.entity_from_dataframe(entity_id="data",
dataframe=values_dd,
@@ -219,7 +219,7 @@ def test_single_table_dask_entityset_ids_not_sorted():
pd_es.entity_from_dataframe(entity_id="data",
dataframe=df,
index="id",
variable_types={"strings": ft.variable_types.Text})
variable_types={"strings": ft.variable_types.NaturalLanguage})

fm, _ = ft.dfs(entityset=pd_es,
target_entity="data",
@@ -250,7 +250,7 @@ def test_single_table_dask_entityset_with_instance_ids():
"id": ft.variable_types.Id,
"values": ft.variable_types.Numeric,
"dates": ft.variable_types.Datetime,
"strings": ft.variable_types.Text
"strings": ft.variable_types.NaturalLanguage
}
dask_es.entity_from_dataframe(entity_id="data",
dataframe=values_dd,
@@ -266,7 +266,7 @@ def test_single_table_dask_entityset_with_instance_ids():
pd_es.entity_from_dataframe(entity_id="data",
dataframe=df,
index="id",
variable_types={"strings": ft.variable_types.Text})
variable_types={"strings": ft.variable_types.NaturalLanguage})

fm, _ = ft.dfs(entityset=pd_es,
target_entity="data",
@@ -296,7 +296,7 @@ def test_single_table_dask_entityset_single_cutoff_time():
"id": ft.variable_types.Id,
"values": ft.variable_types.Numeric,
"dates": ft.variable_types.Datetime,
"strings": ft.variable_types.Text
"strings": ft.variable_types.NaturalLanguage
}
dask_es.entity_from_dataframe(entity_id="data",
dataframe=values_dd,
@@ -312,7 +312,7 @@ def test_single_table_dask_entityset_single_cutoff_time():
pd_es.entity_from_dataframe(entity_id="data",
dataframe=df,
index="id",
variable_types={"strings": ft.variable_types.Text})
variable_types={"strings": ft.variable_types.NaturalLanguage})

fm, _ = ft.dfs(entityset=pd_es,
target_entity="data",
@@ -340,7 +340,7 @@ def test_single_table_dask_entityset_cutoff_time_df():
"id": ft.variable_types.Id,
"values": ft.variable_types.Numeric,
"dates": ft.variable_types.DatetimeTimeIndex,
"strings": ft.variable_types.Text
"strings": ft.variable_types.NaturalLanguage
}
dask_es.entity_from_dataframe(entity_id="data",
dataframe=values_dd,
@@ -365,7 +365,7 @@ def test_single_table_dask_entityset_cutoff_time_df():
dataframe=df,
index="id",
time_index="dates",
variable_types={"strings": ft.variable_types.Text})
variable_types={"strings": ft.variable_types.NaturalLanguage})

fm, _ = ft.dfs(entityset=pd_es,
target_entity="data",
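Condensing the pattern these tests repeat into one self-contained sketch (data values are illustrative): the same ``variable_types`` mapping, now using ``NaturalLanguage``, drives both the dask-backed entity and its pandas twin, whose feature matrices the tests then compare.

import dask.dataframe as dd
import pandas as pd

import featuretools as ft

df = pd.DataFrame({"id": [0, 1, 2],
                   "values": [1.5, 2.5, 3.5],
                   "dates": pd.to_datetime(['2019-01-10', '2019-02-03', '2019-01-01']),
                   "strings": ["I am a string", "23", "abcdef ghijk"]})
values_dd = dd.from_pandas(df, npartitions=2)

vtypes = {
    "id": ft.variable_types.Id,
    "values": ft.variable_types.Numeric,
    "dates": ft.variable_types.Datetime,
    "strings": ft.variable_types.NaturalLanguage
}

dask_es = ft.EntitySet(id="dask_es")
dask_es.entity_from_dataframe(entity_id="data",
                              dataframe=values_dd,
                              index="id",
                              variable_types=vtypes)

pd_es = ft.EntitySet(id="pd_es")
pd_es.entity_from_dataframe(entity_id="data",
                            dataframe=df,
                            index="id",
                            variable_types={"strings": ft.variable_types.NaturalLanguage})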
25 changes: 23 additions & 2 deletions featuretools/tests/entityset_tests/test_entity.py
@@ -259,12 +259,12 @@ def test_passing_strings_to_variable_types_dfs():
'home_team_score': [3, 0, 1, 0, 4],
'away_team_score': [2, 1, 2, 0, 0]
})
entities = {'teams': (teams, 'id', None, {'name': 'text'}), 'games': (games, 'id')}
entities = {'teams': (teams, 'id', None, {'name': 'natural_language'}), 'games': (games, 'id')}
relationships = [('teams', 'id', 'games', 'home_team_id')]

features = ft.dfs(entities, relationships, target_entity="teams", features_only=True)
name_class = features[0].entity['name'].__class__
assert name_class == variable_types['text']
assert name_class == variable_types['natural_language']


def test_replace_latlong_nan_during_entity_creation(pd_es):
@@ -275,3 +275,24 @@ def test_replace_latlong_nan_during_entity_creation(pd_es):
with pytest.warns(UserWarning, match="LatLong columns should contain only tuples. All single 'NaN' values in column 'latlong' have been replaced with '\\(NaN, NaN\\)'."):
entity = ft.Entity(id="nan_latlong_entity", df=df, entityset=nan_es, variable_types=pd_es['log'].variable_types)
assert entity.df['latlong'][0] == (np.nan, np.nan)


def test_text_deprecation_warning():
data = pd.DataFrame({
"id": [1, 2, 3, 4, 5],
"value": ["a", "c", "b", "a", "a"]
})

for text_repr in ['text', ft.variable_types.Text]:
es = ft.EntitySet()
match = "Text has been deprecated. Please use NaturalLanguage instead."
with pytest.warns(FutureWarning, match=match):
es = es.entity_from_dataframe(entity_id="test", dataframe=data, index="id",
variable_types={"value": text_repr})

for nl_repr in ['natural_language', ft.variable_types.NaturalLanguage]:
es = ft.EntitySet()
with pytest.warns(None) as record:
es = es.entity_from_dataframe(entity_id="test", dataframe=data, index="id",
variable_types={"value": nl_repr})
assert len(record) == 0
2 changes: 1 addition & 1 deletion featuretools/tests/entityset_tests/test_es.py
@@ -133,7 +133,7 @@ def test_add_relationship_errors_on_dtype_mismatch(es):
'value_many_nans': variable_types.Numeric,
'priority_level': variable_types.Ordinal,
'purchased': variable_types.Boolean,
'comments': variable_types.Text
'comments': variable_types.NaturalLanguage
}
assert set(log_variable_types) == set(log_2_df.columns)
es.entity_from_dataframe(entity_id='log2',