Handle incoming Object dtype data #1645

Merged: 12 commits on Mar 7, 2023
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -6,6 +6,7 @@ Release Notes
Future Release
==============
* Enhancements
* Improved inference for numeric logical types to handle incoming ``object`` dtype data (:pr:`1645`)
* Updated datetime format inference to handle years represented by 2 digits (:pr:`1632`)
* Fixes
* Changes
1 change: 1 addition & 0 deletions woodwork/config.py
@@ -24,6 +24,7 @@
"postal_code_inference_regex": r"^[0-9]{5}(?:-[0-9]{4})?$",
"nan_values": [
"",
" ",
None,
np.nan,
pd.NaT,
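As a quick check of the effect (a minimal sketch using woodwork's config API; the printed result is the expected behavior after this change):

```python
# Sketch: with " " registered in nan_values, whitespace-only strings are
# treated as missing values during type inference.
import woodwork as ww

print(" " in ww.config.get_option("nan_values"))  # True after this change
```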
6 changes: 5 additions & 1 deletion woodwork/statistics_utils/_get_box_plot_info_for_column.py
@@ -59,7 +59,11 @@ def _determine_coefficients(series, mc):
method (str): Name of the outlier method to use.
mc (float): The medcouple statistic (if the method chosen is medcouple, otherwise None).
"""
coeff = np.abs(skew(series))
try:
coeff = np.abs(skew(series))
except ValueError:
# skew can't handle Int64 dtype
coeff = np.abs(skew(series.astype("float64")))
@ParthivNaresh (Collaborator, Author) commented on Feb 15, 2023:
This was the cause of the current LG ww perf issue.

coeff = min(coeff, 3.5)
if mc >= 0:
return -coeff, coeff
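For context, a runnable reproduction of the failure this guards against (a sketch; the sample values are invented, and the ValueError on nullable Int64 is per the code comment above):

```python
import numpy as np
import pandas as pd
from scipy.stats import skew

series = pd.Series([1, 2, 2, 3, 3, 3, 10, 30], dtype="Int64")
try:
    coeff = np.abs(skew(series))
except ValueError:
    # Nullable Int64 is an extension dtype; a plain float64 array works.
    coeff = np.abs(skew(series.astype("float64")))
print(coeff)
```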
9 changes: 9 additions & 0 deletions woodwork/tests/accessor/test_statistics.py
@@ -1978,6 +1978,15 @@ def test_medcouple_outliers(skewed_outliers_df):
assert right_skewed_dict == expected_right_skewed_dict
assert left_skewed_dict == expected_left_skewed_dict

outliers_series_skewed_right = skewed_outliers_df[
"right_skewed_outliers_nullable_int"
]
outliers_series_skewed_right.ww.init(logical_type="IntegerNullable")

right_skewed_dict = outliers_series_skewed_right.ww.medcouple_dict()

assert right_skewed_dict == expected_right_skewed_dict


def test_medcouple_outliers_with_quantiles(skewed_outliers_df):
outliers_series_skewed_right = skewed_outliers_df["right_skewed_outliers"]
95 changes: 95 additions & 0 deletions woodwork/tests/conftest.py
@@ -109,6 +109,90 @@ def sample_df_spark(sample_df_pandas):
return ps.from_pandas(sample_df_pandas)


@pytest.fixture(
params=[
"comprehensive_df_pandas",
"comprehensive_df_dask",
"comprehensive_df_spark",
],
)
def comprehensive_df(request):
return request.getfixturevalue(request.param)


@pytest.fixture()
def comprehensive_df_pandas():
df = pd.DataFrame()
df["ints"] = np.random.choice([i for i in range(-100, 100)], 1_000)
df["ints_str"] = np.random.choice([f"{i}" for i in range(-100, 100)], 1_000)
df["ints_null"] = np.random.choice(
[i for i in range(-50, 50)] + [pd.NA, np.nan, None],
1_000,
)
df["ints_null_str"] = np.random.choice(
[f"{i}" for i in range(-50, 50)] + [pd.NA, np.nan, None],
1_000,
)
df["floats"] = np.random.choice([i * 1.1 for i in range(-100, 100)], 1_000)
df["floats_str"] = np.random.choice([f"{i * 1.1}" for i in range(-100, 100)], 1_000)
df["floats_null"] = np.random.choice(
[i * 1.1 for i in range(-50, 50)] + [pd.NA, np.nan, None],
1_000,
)
df["floats_null_str"] = np.random.choice(
[f"{i * 1.1}" for i in range(-50, 50)] + [pd.NA, np.nan, None],
1_000,
)
df["int_float_mixed"] = np.random.choice(
[f"{i}" for i in range(-50, 50)] + ["3.14"],
1_000,
)
df["int_float_mixed_null"] = np.random.choice(
[f"{i}" for i in range(-50, 50)] + ["3.14", pd.NA, np.nan, None],
1_000,
)
df["bools"] = np.random.choice([True, False], 1_000)
df["bools_str"] = np.random.choice(["y", "n"], 1_000)
df["bools_null"] = np.random.choice([True, False, pd.NA], 1_000)
df["bools_null_str"] = np.random.choice(["y", "n", pd.NA], 1_000)
df["datetimes"] = pd.date_range("01/01/1995", freq="3D", periods=1_000)
df["datetimes_str"] = [
"01-05-12",
"01-11-04",
"03-21-11",
"11-01-19",
"12-28-01",
] * 200
df["datetimes_null_str"] = [
"01-05-12",
"01-11-04",
"03-21-11",
"11-01-19",
"12-28-01",
"04-21-15",
"06-20-98",
"10-09-99",
"01-03-00",
pd.NA,
] * 100
return df


@pytest.fixture()
def comprehensive_df_dask(comprehensive_df_pandas):
dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping")
return dd.from_pandas(comprehensive_df_pandas, npartitions=1)


@pytest.fixture()
def comprehensive_df_spark(comprehensive_df_pandas):
ps = pytest.importorskip(
"pyspark.pandas",
reason="Pyspark pandas not installed, skipping",
)
return ps.from_pandas(comprehensive_df_pandas)


@pytest.fixture()
def sample_df_phone_numbers():
return pd.DataFrame(
@@ -1008,6 +1092,17 @@ def skewed_outliers_df_pandas():
+ [6] * 5
+ [7] * 3
+ [8, 8, 9, 9, 10, 11, 13, 14, 16, 30],
"right_skewed_outliers_nullable_int": pd.Series(
[1] * 2
+ [2] * 6
+ [3] * 20
+ [4] * 12
+ [5] * 8
+ [6] * 5
+ [7] * 3
+ [8, 8, 9, 9, 10, 11, 13, 14, 16, 30],
dtype="Int64",
),
"no_outliers": [60, 42, 37, 23, 49, 42, 36, 57, 60, 23.0] * 6
+ [35, 54, 43, 47, 41, 39],
"non_numeric": ["a"] * 66,
59 changes: 56 additions & 3 deletions woodwork/tests/logical_types/test_logical_types.py
@@ -6,7 +6,13 @@
import pandas as pd
import pytest

from woodwork.accessor_utils import _is_dask_series, _is_spark_series, init_series
from woodwork.accessor_utils import (
_is_dask_dataframe,
_is_dask_series,
_is_spark_dataframe,
_is_spark_series,
init_series,
)
from woodwork.config import config
from woodwork.exceptions import (
TypeConversionError,
@@ -30,6 +36,7 @@
Ordinal,
PhoneNumber,
PostalCode,
Unknown,
_replace_nans,
)
from woodwork.tests.testing_utils.table_utils import (
@@ -729,7 +736,7 @@ def test_integer_nullable(data_type, null_type):

@pytest.mark.parametrize(
"null_type",
[None, pd.NA, pd.NaT, np.nan, "null", "N/A", "mix", True],
[None, pd.NA, pd.NaT, np.nan, "null", " ", "N/A", "mix", True],
)
def test_boolean_nullable(null_type):
nullable_bools = pd.DataFrame([True, False] * 50, columns=["bool_nulls"])
@@ -955,7 +962,8 @@ def test_datetime_formats_two_digit_years(datetime_different_formats):
final_format = "%Y-%m-%d %H:%M:%S" if "%H:%M:%S" in format_ else "%Y-%m-%d"
expected_values = [
datetime.strptime(
starting_date_.replace("24", str(each)), format_
starting_date_.replace("24", str(each)),
format_,
).strftime(final_format)
for each in range(24, 90, 4)
]
@@ -1164,3 +1172,48 @@ def test_coerce_boolean_not_called_for_bool_dtype(coerce_boolean_patch):
series_init = init_series(series)
assert not coerce_boolean_patch.called
assert series_init.dtype == "bool"


def test_object_dtype_inference(comprehensive_df):
expected = {
"ints": "Integer",
"ints_str": "Integer",
"ints_null": "IntegerNullable",
"ints_null_str": "IntegerNullable",
"floats": "Double",
"floats_str": "Double",
"floats_null": "Double",
"floats_null_str": "Double",
"int_float_mixed": "Double",
"int_float_mixed_null": "Double",
"bools": "Boolean",
"bools_str": "Boolean",
"bools_null": "BooleanNullable",
"bools_null_str": "BooleanNullable",
"datetimes": "Datetime",
"datetimes_str": "Datetime",
"datetimes_null_str": "Datetime",
}
df_copy = comprehensive_df.copy()
df_copy_objects = comprehensive_df.copy()
df_copy.ww.init()
df_copy_objects.ww.init(
logical_types={col: Unknown for col in df_copy_objects.columns},
)
if _is_dask_dataframe(df_copy):
df_copy = df_copy.ww.compute()
df_copy_objects = df_copy_objects.ww.compute()
elif _is_spark_dataframe(df_copy):
df_copy = df_copy.ww.to_pandas()
df_copy_objects = df_copy_objects.ww.to_pandas()
# Confirm proper Woodwork inference for pandas-inferred object columns
assert {
col: str(ltype) for col, ltype in df_copy.ww.logical_types.items()
} == expected
for col in df_copy_objects:
df_copy_objects[col] = df_copy_objects[col].astype("object")
df_copy_objects.ww.init()
# Confirm proper Woodwork inference when every column is converted to string and then cast to object
assert {
col: str(ltype) for col, ltype in df_copy_objects.ww.logical_types.items()
} == expected
5 changes: 4 additions & 1 deletion woodwork/tests/type_system/test_ltype_inference.py
@@ -129,8 +129,11 @@ def test_categorical_inference(categories):
if ind == len(categories) - 1:
dtypes = ["string", "category"]
for dtype in dtypes:
expected_ltype = Categorical
if ind in [1, 3] and dtype == "object":
expected_ltype = Integer
inferred_type = ww.type_system.infer_logical_type(series.astype(dtype))
assert isinstance(inferred_type, Categorical)
assert isinstance(inferred_type, expected_ltype)


def test_postal_inference(postal):
3 changes: 2 additions & 1 deletion woodwork/tests/utils/test_read_file.py
@@ -262,7 +262,8 @@ def test_replace_nan_strings_with_read_file(tmpdir):
filepath=filepath,
replace_nan=False,
)
assert actual.isnull().sum().sum() == 1

assert actual.isnull().sum().sum() == 3

# With replacement
actual = ww.read_file(
2 changes: 1 addition & 1 deletion woodwork/tests/utils/test_utils.py
@@ -191,7 +191,7 @@ def test_is_numeric_datetime_series(time_index_df):
assert not _is_numeric_series(time_index_df["ints"], Categorical)
assert _is_numeric_series(time_index_df["ints"], Datetime)

assert not _is_numeric_series(time_index_df["strs"], None)
assert _is_numeric_series(time_index_df["strs"], Integer)
assert not _is_numeric_series(time_index_df["strs"], "Categorical")
assert not _is_numeric_series(time_index_df["strs"], Categorical)
assert _is_numeric_series(time_index_df["strs"], Double)
19 changes: 19 additions & 0 deletions woodwork/type_sys/inference_functions.py
@@ -46,6 +46,8 @@ def categorical_func(series):

def integer_func(series):
if integer_nullable_func(series) and not series.isnull().any():
if pdtypes.is_object_dtype(series.dtype):
return True
return all(series.mod(1).eq(0))
return False

@@ -66,6 +68,14 @@ def _is_valid_int(value):
return False
series_no_null = series.dropna()
return all([_is_valid_int(v) for v in series_no_null])
elif pdtypes.is_object_dtype(series.dtype):
series_no_null = series.dropna()
try:
return series_no_null.map(
lambda x: (isinstance(x, str) and isinstance(int(x), int)),
).all()
except ValueError:
return False

return False
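To see what the new object-dtype branch accepts, here is a standalone approximation of `integer_nullable_func` (hypothetical helper name; not woodwork's public API):

```python
import pandas as pd

def looks_like_nullable_integer(series):
    # Mirrors the branch above: every non-null value must be a string
    # that parses cleanly as an int; int("3.14") raises ValueError.
    series_no_null = series.dropna()
    try:
        return series_no_null.map(
            lambda x: isinstance(x, str) and isinstance(int(x), int),
        ).all()
    except ValueError:
        return False

print(looks_like_nullable_integer(pd.Series(["1", "-5", None], dtype="object")))  # True
print(looks_like_nullable_integer(pd.Series(["1", "3.14"], dtype="object")))      # False
```

`integer_func` then narrows this to columns with no nulls at all, and for object dtype it skips the `mod(1)` check entirely, since the string values have already been validated as integers.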

@@ -77,6 +87,15 @@ def double_func(series):
return not _is_categorical_series(series, threshold)
else:
return True
elif pdtypes.is_object_dtype(series.dtype):
series_no_null = series.dropna()
try:
# If str and casting to float works, make sure that it isn't just an integer
return series_no_null.map(
lambda x: isinstance(x, str) and not float(x).is_integer(),
).any()
except ValueError:
return False

return False
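The analogous sketch for the `double_func` branch (same hypothetical-helper caveat): strings must cast to float, and at least one needs a fractional part, otherwise integer inference is the better match.

```python
import pandas as pd

def looks_like_double(series):
    # Mirrors the branch above: any non-null string with a true fractional
    # part qualifies the column; float("abc") raises ValueError.
    series_no_null = series.dropna()
    try:
        return series_no_null.map(
            lambda x: isinstance(x, str) and not float(x).is_integer(),
        ).any()
    except ValueError:
        return False

print(looks_like_double(pd.Series(["1", "3.14"], dtype="object")))  # True
print(looks_like_double(pd.Series(["1", "2"], dtype="object")))     # False
```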

4 changes: 3 additions & 1 deletion woodwork/type_sys/type_system.py
@@ -93,7 +93,7 @@

DEFAULT_TYPE = Unknown

INFERENCE_SAMPLE_SIZE = 100000
INFERENCE_SAMPLE_SIZE = 100_000
@ParthivNaresh (Collaborator, Author) commented on Feb 15, 2023:
We might have to keep this at 100,000 for the time being. Reducing it exposes issues with larger datasets like zillow, which has a column that gets inferred as IntegerNullable but actually contains a float in one of its >90,000 observations; attempting to cast that column to Int64 throws an error.
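A hypothetical illustration of that pitfall (values invented; pandas raises on lossy float-to-Int64 casts):

```python
import pandas as pd

# One non-integer value hiding beyond a reduced inference sample.
col = pd.Series(["1"] * 99_999 + ["2.5"], dtype="object")

sample = col.iloc[:50_000]  # a smaller sample never sees the "2.5"
print(sample.map(lambda x: float(x).is_integer()).all())  # True -> looks IntegerNullable

try:
    col.astype("float64").astype("Int64")  # full-column cast trips over 2.5
except (TypeError, ValueError) as err:
    print(f"cast failed: {err}")
```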



class TypeSystem(object):
@@ -383,6 +383,8 @@ def get_inference_matches(types_to_check, series, type_matches=[]):
Categorical in type_matches or Double in type_matches
) and IntegerNullable in type_matches:
best_match = IntegerNullable
elif Categorical in type_matches and Double in type_matches:
best_match = Double
else:
best_match = type_matches[0]
best_depth = self._get_depth(best_match)
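The practical effect of the new branch, as a sketch (expected output per this PR; actual inference also depends on configured thresholds):

```python
import pandas as pd
import woodwork as ww

# Numeric strings with fractional values can match both Categorical (few
# unique values) and Double; the new tie-break prefers Double.
series = pd.Series(["1.5", "2.5"] * 500, dtype="object")
print(ww.type_system.infer_logical_type(series))  # expected: Double
```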
4 changes: 3 additions & 1 deletion woodwork/utils.py
@@ -678,7 +678,9 @@ def _infer_datetime_format(dates, n=100):

def _parse_latlong(latlong):
nan_values_strs = [
x for x in ww.config.get_option("nan_values") if isinstance(x, str) and len(x)
x
for x in ww.config.get_option("nan_values")
if isinstance(x, str) and len(x) and x != " "
]
nan_values = "|".join(nan_values_strs)

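Why `" "` is excluded here, as a small illustration: joining the NaN strings into an alternation regex with a bare space would match the space inside ordinary latlong strings (values invented):

```python
import re

pattern_without_filter = "|".join(["N/A", " "])  # bare-space alternative
print(bool(re.search(pattern_without_filter, "(42.36, -71.06)")))  # True

pattern_with_filter = "|".join(["N/A"])
print(bool(re.search(pattern_with_filter, "(42.36, -71.06)")))     # False
```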