Handle incoming Object dtype data #1645

Merged: 12 commits on Mar 7, 2023
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -6,6 +6,7 @@ Release Notes
Future Release
==============
* Enhancements
* Improved inference for numeric logical types to handle incoming ``object`` dtype data (:pr:`1645`)
* Updated datetime format inference to handle years represented by 2 digits (:pr:`1632`)
* Fixes
* Changes
1 change: 1 addition & 0 deletions woodwork/config.py
@@ -24,6 +24,7 @@
"postal_code_inference_regex": r"^[0-9]{5}(?:-[0-9]{4})?$",
"nan_values": [
"",
" ",
None,
np.nan,
pd.NaT,
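As a quick check of the effect (a minimal sketch using woodwork's config API; the printed result is the expected behavior after this change):

```python
# Sketch: with " " registered in nan_values, whitespace-only strings are
# treated as missing values during type inference.
import woodwork as ww

print(" " in ww.config.get_option("nan_values"))  # True after this change
```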
6 changes: 5 additions & 1 deletion woodwork/statistics_utils/_get_box_plot_info_for_column.py
@@ -59,7 +59,11 @@ def _determine_coefficients(series, mc):
method (str): Name of the outlier method to use.
mc (float): The medcouple statistic (if the method chosen is medcouple, otherwise None).
"""
coeff = np.abs(skew(series))
try:
coeff = np.abs(skew(series))
except ValueError:
# skew can't handle Int64 dtype
coeff = np.abs(skew(series.astype("float64")))
@ParthivNaresh (Collaborator, Author) commented on Feb 15, 2023:
This was the cause of the current LG ww perf issue.

coeff = min(coeff, 3.5)
if mc >= 0:
return -coeff, coeff
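For context, a runnable reproduction of the failure this guards against (a sketch; the sample values are invented, and the ValueError on nullable Int64 is per the code comment above):

```python
import numpy as np
import pandas as pd
from scipy.stats import skew

series = pd.Series([1, 2, 2, 3, 3, 3, 10, 30], dtype="Int64")
try:
    coeff = np.abs(skew(series))
except ValueError:
    # Nullable Int64 is an extension dtype; a plain float64 array works.
    coeff = np.abs(skew(series.astype("float64")))
print(coeff)
```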
9 changes: 9 additions & 0 deletions woodwork/tests/accessor/test_statistics.py
@@ -1978,6 +1978,15 @@ def test_medcouple_outliers(skewed_outliers_df):
assert right_skewed_dict == expected_right_skewed_dict
assert left_skewed_dict == expected_left_skewed_dict

outliers_series_skewed_right = skewed_outliers_df[
"right_skewed_outliers_nullable_int"
]
outliers_series_skewed_right.ww.init(logical_type="IntegerNullable")

right_skewed_dict = outliers_series_skewed_right.ww.medcouple_dict()

assert right_skewed_dict == expected_right_skewed_dict


def test_medcouple_outliers_with_quantiles(skewed_outliers_df):
outliers_series_skewed_right = skewed_outliers_df["right_skewed_outliers"]
95 changes: 95 additions & 0 deletions woodwork/tests/conftest.py
@@ -109,6 +109,90 @@ def sample_df_spark(sample_df_pandas):
return ps.from_pandas(sample_df_pandas)


@pytest.fixture(
params=[
"comprehensive_df_pandas",
"comprehensive_df_dask",
"comprehensive_df_spark",
],
)
def comprehensive_df(request):
return request.getfixturevalue(request.param)


@pytest.fixture()
def comprehensive_df_pandas():
df = pd.DataFrame()
df["ints"] = np.random.choice([i for i in range(-100, 100)], 1_000)
df["ints_str"] = np.random.choice([f"{i}" for i in range(-100, 100)], 1_000)
df["ints_null"] = np.random.choice(
[i for i in range(-50, 50)] + [pd.NA, np.nan, None],
1_000,
)
df["ints_null_str"] = np.random.choice(
[f"{i}" for i in range(-50, 50)] + [pd.NA, np.nan, None],
1_000,
)
df["floats"] = np.random.choice([i * 1.1 for i in range(-100, 100)], 1_000)
df["floats_str"] = np.random.choice([f"{i * 1.1}" for i in range(-100, 100)], 1_000)
df["floats_null"] = np.random.choice(
[i * 1.1 for i in range(-50, 50)] + [pd.NA, np.nan, None],
1_000,
)
df["floats_null_str"] = np.random.choice(
[f"{i * 1.1}" for i in range(-50, 50)] + [pd.NA, np.nan, None],
1_000,
)
df["int_float_mixed"] = np.random.choice(
[f"{i}" for i in range(-50, 50)] + ["3.14"],
1_000,
)
df["int_float_mixed_null"] = np.random.choice(
[f"{i}" for i in range(-50, 50)] + ["3.14", pd.NA, np.nan, None],
1_000,
)
df["bools"] = np.random.choice([True, False], 1_000)
df["bools_str"] = np.random.choice(["y", "n"], 1_000)
df["bools_null"] = np.random.choice([True, False, pd.NA], 1_000)
df["bools_null_str"] = np.random.choice(["y", "n", pd.NA], 1_000)
df["datetimes"] = pd.date_range("01/01/1995", freq="3D", periods=1_000)
df["datetimes_str"] = [
"01-05-12",
"01-11-04",
"03-21-11",
"11-01-19",
"12-28-01",
] * 200
df["datetimes_null_str"] = [
"01-05-12",
"01-11-04",
"03-21-11",
"11-01-19",
"12-28-01",
"04-21-15",
"06-20-98",
"10-09-99",
"01-03-00",
pd.NA,
] * 100
return df


@pytest.fixture()
def comprehensive_df_dask(comprehensive_df_pandas):
dd = pytest.importorskip("dask.dataframe", reason="Dask not installed, skipping")
return dd.from_pandas(comprehensive_df_pandas, npartitions=1)


@pytest.fixture()
def comprehensive_df_spark(comprehensive_df_pandas):
ps = pytest.importorskip(
"pyspark.pandas",
reason="Pyspark pandas not installed, skipping",
)
return ps.from_pandas(comprehensive_df_pandas)


@pytest.fixture()
def sample_df_phone_numbers():
return pd.DataFrame(
@@ -1008,6 +1092,17 @@ def skewed_outliers_df_pandas():
+ [6] * 5
+ [7] * 3
+ [8, 8, 9, 9, 10, 11, 13, 14, 16, 30],
"right_skewed_outliers_nullable_int": pd.Series(
[1] * 2
+ [2] * 6
+ [3] * 20
+ [4] * 12
+ [5] * 8
+ [6] * 5
+ [7] * 3
+ [8, 8, 9, 9, 10, 11, 13, 14, 16, 30],
dtype="Int64",
),
"no_outliers": [60, 42, 37, 23, 49, 42, 36, 57, 60, 23.0] * 6
+ [35, 54, 43, 47, 41, 39],
"non_numeric": ["a"] * 66,
59 changes: 56 additions & 3 deletions woodwork/tests/logical_types/test_logical_types.py
@@ -6,7 +6,13 @@
import pandas as pd
import pytest

from woodwork.accessor_utils import _is_dask_series, _is_spark_series, init_series
from woodwork.accessor_utils import (
_is_dask_dataframe,
_is_dask_series,
_is_spark_dataframe,
_is_spark_series,
init_series,
)
from woodwork.config import config
from woodwork.exceptions import (
TypeConversionError,
@@ -30,6 +36,7 @@
Ordinal,
PhoneNumber,
PostalCode,
Unknown,
_replace_nans,
)
from woodwork.tests.testing_utils.table_utils import (
@@ -729,7 +736,7 @@ def test_integer_nullable(data_type, null_type):

@pytest.mark.parametrize(
"null_type",
[None, pd.NA, pd.NaT, np.nan, "null", "N/A", "mix", True],
[None, pd.NA, pd.NaT, np.nan, "null", " ", "N/A", "mix", True],
)
def test_boolean_nullable(null_type):
nullable_bools = pd.DataFrame([True, False] * 50, columns=["bool_nulls"])
@@ -955,7 +962,8 @@ def test_datetime_formats_two_digit_years(datetime_different_formats):
final_format = "%Y-%m-%d %H:%M:%S" if "%H:%M:%S" in format_ else "%Y-%m-%d"
expected_values = [
datetime.strptime(
starting_date_.replace("24", str(each)), format_
starting_date_.replace("24", str(each)),
format_,
).strftime(final_format)
for each in range(24, 90, 4)
]
@@ -1164,3 +1172,48 @@ def test_coerce_boolean_not_called_for_bool_dtype(coerce_boolean_patch):
series_init = init_series(series)
assert not coerce_boolean_patch.called
assert series_init.dtype == "bool"


def test_object_dtype_inference(comprehensive_df):
expected = {
"ints": "Integer",
"ints_str": "Integer",
"ints_null": "IntegerNullable",
"ints_null_str": "IntegerNullable",
"floats": "Double",
"floats_str": "Double",
"floats_null": "Double",
"floats_null_str": "Double",
"int_float_mixed": "Double",
"int_float_mixed_null": "Double",
"bools": "Boolean",
"bools_str": "Boolean",
"bools_null": "BooleanNullable",
"bools_null_str": "BooleanNullable",
"datetimes": "Datetime",
"datetimes_str": "Datetime",
"datetimes_null_str": "Datetime",
}
df_copy = comprehensive_df.copy()
df_copy_objects = comprehensive_df.copy()
df_copy.ww.init()
df_copy_objects.ww.init(
logical_types={col: Unknown for col in df_copy_objects.columns},
)
if _is_dask_dataframe(df_copy):
df_copy = df_copy.ww.compute()
df_copy_objects = df_copy_objects.ww.compute()
elif _is_spark_dataframe(df_copy):
df_copy = df_copy.ww.to_pandas()
df_copy_objects = df_copy_objects.ww.to_pandas()
# Confirm proper Woodwork inference for pandas-inferred object columns
assert {
col: str(ltype) for col, ltype in df_copy.ww.logical_types.items()
} == expected
for col in df_copy_objects:
df_copy_objects[col] = df_copy_objects[col].astype("object")
df_copy_objects.ww.init()
# Confirm proper Woodwork inference when every column is converted to string and then cast to object
assert {
col: str(ltype) for col, ltype in df_copy_objects.ww.logical_types.items()
} == expected
5 changes: 4 additions & 1 deletion woodwork/tests/type_system/test_ltype_inference.py
@@ -129,8 +129,11 @@ def test_categorical_inference(categories):
if ind == len(categories) - 1:
dtypes = ["string", "category"]
for dtype in dtypes:
expected_ltype = Categorical
if ind in [1, 3] and dtype == "object":
expected_ltype = Integer
inferred_type = ww.type_system.infer_logical_type(series.astype(dtype))
assert isinstance(inferred_type, Categorical)
assert isinstance(inferred_type, expected_ltype)


def test_postal_inference(postal):
3 changes: 2 additions & 1 deletion woodwork/tests/utils/test_read_file.py
@@ -262,7 +262,8 @@ def test_replace_nan_strings_with_read_file(tmpdir):
filepath=filepath,
replace_nan=False,
)
assert actual.isnull().sum().sum() == 1

assert actual.isnull().sum().sum() == 3

# With replacement
actual = ww.read_file(
2 changes: 1 addition & 1 deletion woodwork/tests/utils/test_utils.py
@@ -191,7 +191,7 @@ def test_is_numeric_datetime_series(time_index_df):
assert not _is_numeric_series(time_index_df["ints"], Categorical)
assert _is_numeric_series(time_index_df["ints"], Datetime)

assert not _is_numeric_series(time_index_df["strs"], None)
assert _is_numeric_series(time_index_df["strs"], Integer)
assert not _is_numeric_series(time_index_df["strs"], "Categorical")
assert not _is_numeric_series(time_index_df["strs"], Categorical)
assert _is_numeric_series(time_index_df["strs"], Double)
19 changes: 19 additions & 0 deletions woodwork/type_sys/inference_functions.py
@@ -46,6 +46,8 @@ def categorical_func(series):

def integer_func(series):
if integer_nullable_func(series) and not series.isnull().any():
if pdtypes.is_object_dtype(series.dtype):
return True
return all(series.mod(1).eq(0))
return False

@@ -66,6 +68,14 @@ def _is_valid_int(value):
return False
series_no_null = series.dropna()
return all([_is_valid_int(v) for v in series_no_null])
elif pdtypes.is_object_dtype(series.dtype):
series_no_null = series.dropna()
try:
return series_no_null.map(
lambda x: (isinstance(x, str) and isinstance(int(x), int)),
).all()
except ValueError:
return False

return False
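To see what the new object-dtype branch accepts, here is a standalone approximation of `integer_nullable_func` (hypothetical helper name; not woodwork's public API):

```python
import pandas as pd

def looks_like_nullable_integer(series):
    # Mirrors the branch above: every non-null value must be a string
    # that parses cleanly as an int; int("3.14") raises ValueError.
    series_no_null = series.dropna()
    try:
        return series_no_null.map(
            lambda x: isinstance(x, str) and isinstance(int(x), int),
        ).all()
    except ValueError:
        return False

print(looks_like_nullable_integer(pd.Series(["1", "-5", None], dtype="object")))  # True
print(looks_like_nullable_integer(pd.Series(["1", "3.14"], dtype="object")))      # False
```

`integer_func` then narrows this to columns with no nulls at all, and for object dtype it skips the `mod(1)` check entirely, since the string values have already been validated as integers.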

@@ -77,6 +87,15 @@ def double_func(series):
return not _is_categorical_series(series, threshold)
else:
return True
elif pdtypes.is_object_dtype(series.dtype):
series_no_null = series.dropna()
try:
# If str and casting to float works, make sure that it isn't just an integer
return series_no_null.map(
lambda x: isinstance(x, str) and not float(x).is_integer(),
).any()
except ValueError:
return False

return False
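The analogous sketch for the `double_func` branch (same hypothetical-helper caveat): strings must cast to float, and at least one needs a fractional part, otherwise integer inference is the better match.

```python
import pandas as pd

def looks_like_double(series):
    # Mirrors the branch above: any non-null string with a true fractional
    # part qualifies the column; float("abc") raises ValueError.
    series_no_null = series.dropna()
    try:
        return series_no_null.map(
            lambda x: isinstance(x, str) and not float(x).is_integer(),
        ).any()
    except ValueError:
        return False

print(looks_like_double(pd.Series(["1", "3.14"], dtype="object")))  # True
print(looks_like_double(pd.Series(["1", "2"], dtype="object")))     # False
```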

4 changes: 3 additions & 1 deletion woodwork/type_sys/type_system.py
@@ -93,7 +93,7 @@

DEFAULT_TYPE = Unknown

INFERENCE_SAMPLE_SIZE = 100000
INFERENCE_SAMPLE_SIZE = 100_000
@ParthivNaresh (Collaborator, Author) commented on Feb 15, 2023:
We might have to keep this at 100,000 for the time being. Reducing it exposes issues with larger datasets like zillow, which has a column that gets inferred as IntegerNullable but actually contains a float in one of its >90,000 observations; attempting to cast that column to Int64 throws an error.
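A hypothetical illustration of that pitfall (values invented; pandas raises on lossy float-to-Int64 casts):

```python
import pandas as pd

# One non-integer value hiding beyond a reduced inference sample.
col = pd.Series(["1"] * 99_999 + ["2.5"], dtype="object")

sample = col.iloc[:50_000]  # a smaller sample never sees the "2.5"
print(sample.map(lambda x: float(x).is_integer()).all())  # True -> looks IntegerNullable

try:
    col.astype("float64").astype("Int64")  # full-column cast trips over 2.5
except (TypeError, ValueError) as err:
    print(f"cast failed: {err}")
```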



class TypeSystem(object):
@@ -383,6 +383,8 @@ def get_inference_matches(types_to_check, series, type_matches=[]):
Categorical in type_matches or Double in type_matches
) and IntegerNullable in type_matches:
best_match = IntegerNullable
elif Categorical in type_matches and Double in type_matches:
best_match = Double
else:
best_match = type_matches[0]
best_depth = self._get_depth(best_match)
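The practical effect of the new branch, as a sketch (expected output per this PR; actual inference also depends on configured thresholds):

```python
import pandas as pd
import woodwork as ww

# Numeric strings with fractional values can match both Categorical (few
# unique values) and Double; the new tie-break prefers Double.
series = pd.Series(["1.5", "2.5"] * 500, dtype="object")
print(ww.type_system.infer_logical_type(series))  # expected: Double
```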
4 changes: 3 additions & 1 deletion woodwork/utils.py
@@ -678,7 +678,9 @@ def _infer_datetime_format(dates, n=100):

def _parse_latlong(latlong):
nan_values_strs = [
x for x in ww.config.get_option("nan_values") if isinstance(x, str) and len(x)
x
for x in ww.config.get_option("nan_values")
if isinstance(x, str) and len(x) and x != " "
]
nan_values = "|".join(nan_values_strs)

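Why `" "` is excluded here, as a small illustration: joining the NaN strings into an alternation regex with a bare space would match the space inside ordinary latlong strings (values invented):

```python
import re

pattern_without_filter = "|".join(["N/A", " "])  # bare-space alternative
print(bool(re.search(pattern_without_filter, "(42.36, -71.06)")))  # True

pattern_with_filter = "|".join(["N/A"])
print(bool(re.search(pattern_with_filter, "(42.36, -71.06)")))     # False
```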