diff --git a/composeml/conftest.py b/composeml/conftest.py index 7339d36c..d4d72014 100644 --- a/composeml/conftest.py +++ b/composeml/conftest.py @@ -1,7 +1,30 @@ import pandas as pd import pytest -from .label_times import LabelTimes +from composeml import LabelTimes +from composeml.tests.utils import read_csv + + +@pytest.fixture(scope="module") +def total_spent(): + data = [ + 'id,customer_id,cutoff_time,total_spent', + '0,0,2019-01-01 08:00:00,9', + '1,0,2019-01-01 08:30:00,8', + '2,1,2019-01-01 09:00:00,7', + '3,1,2019-01-01 09:30:00,6', + '4,1,2019-01-01 10:00:00,5', + '5,2,2019-01-01 10:30:00,4', + '6,2,2019-01-01 11:00:00,3', + '7,2,2019-01-01 11:30:00,2', + '8,2,2019-01-01 12:00:00,1', + '9,3,2019-01-01 12:30:00,0', + ] + + data = read_csv(data, index_col='id', parse_dates=['cutoff_time']) + lt = LabelTimes(data=data, name='total_spent') + lt.settings.update({'num_examples_per_instance': -1}) + return lt @pytest.fixture(scope="module") diff --git a/composeml/label_maker.py b/composeml/label_maker.py index 29746029..92495ca2 100644 --- a/composeml/label_maker.py +++ b/composeml/label_maker.py @@ -138,7 +138,7 @@ def __str__(self): class LabelMaker: """Automatically makes labels for prediction problems.""" - def __init__(self, target_entity, time_index, labeling_function, window_size=None): + def __init__(self, target_entity, time_index, labeling_function, window_size=None, label_type=None): """Creates an instance of label maker. Args: @@ -285,6 +285,7 @@ def search(self, minimum_data=None, gap=None, drop_empty=True, + label_type=None, verbose=True, *args, **kwargs): @@ -297,6 +298,7 @@ def search(self, gap (str or int) : Time between examples. Default value is window size. If an integer, search will start on the first event after the minimum data. drop_empty (bool) : Whether to drop empty slices. Default value is True. + label_type (str) : The label type can be "continuous" or "discrete". Default value is the inferred label type. 
verbose (bool) : Whether to render progress bar. Default value is True. *args : Positional arguments for labeling function. **kwargs : Keyword arguments for labeling function. @@ -353,16 +355,19 @@ def search(self, progress_bar.update(n=total) progress_bar.close() - labels = LabelTimes(data=labels, name=name, target_entity=self.target_entity) + labels = LabelTimes(data=labels, name=name, target_entity=self.target_entity, label_type=label_type) labels = labels.rename_axis('id', axis=0) - labels = labels._with_plots() if labels.empty: return labels + if labels.is_discrete: + labels[labels.name] = labels[labels.name].astype('category') + labels.settings.update({ + 'labeling_function': name, 'num_examples_per_instance': num_examples_per_instance, - 'minimum_data': minimum_data, + 'minimum_data': str(minimum_data), 'window_size': self.window_size, 'gap': gap, }) diff --git a/composeml/label_plots.py b/composeml/label_plots.py new file mode 100644 index 00000000..efd523b9 --- /dev/null +++ b/composeml/label_plots.py @@ -0,0 +1,88 @@ +import matplotlib as mpl +import pandas as pd +import seaborn as sns + +pd.plotting.register_matplotlib_converters() +sns.set_context('notebook') +sns.set_style('darkgrid') +COLOR = sns.color_palette("Set1", n_colors=100, desat=.75) + + +class LabelPlots: + """Creates plots for Label Times.""" + + def __init__(self, label_times): + """Initializes Label Plots. 
+ + Args: + label_times (LabelTimes) : instance of Label Times + """ + self._label_times = label_times + + def count_by_time(self, ax=None, **kwargs): + """Plots the label distribution across cutoff times.""" + count_by_time = self._label_times.count_by_time + count_by_time.sort_index(inplace=True) + + ax = ax or mpl.pyplot.axes() + vmin = count_by_time.index.min() + vmax = count_by_time.index.max() + ax.set_xlim(vmin, vmax) + + locator = mpl.dates.AutoDateLocator() + formatter = mpl.dates.AutoDateFormatter(locator) + ax.xaxis.set_major_locator(locator) + ax.xaxis.set_major_formatter(formatter) + ax.figure.autofmt_xdate() + + if len(count_by_time.shape) > 1: + ax.stackplot( + count_by_time.index, + count_by_time.values.T, + labels=count_by_time.columns, + colors=COLOR, + alpha=.9, + **kwargs, + ) + + ax.legend( + loc='upper left', + title=self._label_times.name, + facecolor='w', + framealpha=.9, + ) + + ax.set_title('Label Count vs. Cutoff Times') + ax.set_ylabel('Count') + ax.set_xlabel('Time') + + else: + ax.fill_between( + count_by_time.index, + count_by_time.values.T, + color=COLOR[1], + ) + + ax.set_title('Label vs. 
Cutoff Times') + ax.set_ylabel(self._label_times.name) + ax.set_xlabel('Time') + + return ax + + @property + def dist(self): + """Alias for distribution.""" + return self.distribution + + def distribution(self, **kwargs): + """Plots the label distribution.""" + dist = self._label_times[self._label_times.name] + + if self._label_times.is_discrete: + ax = sns.countplot(dist, palette=COLOR, **kwargs) + else: + ax = sns.distplot(dist, kde=True, color=COLOR[1], **kwargs) + + ax.set_title('Label Distribution') + ax.set_ylabel('Count') + return ax diff --git a/composeml/label_times.py b/composeml/label_times.py index cdbf4267..96243fca 100644 --- a/composeml/label_times.py +++ b/composeml/label_times.py @@ -1,73 +1,87 @@ import pandas as pd +from composeml.label_plots import LabelPlots + class LabelTimes(pd.DataFrame): - """ - A data frame containing labels made by a label maker. + """A data frame containing labels made by a label maker. Attributes: name target_entity transforms """ - _metadata = ['name', 'target_entity', 'settings', 'transforms'] - - def __init__(self, data=None, name=None, target_entity=None, settings=None, transforms=None, *args, **kwargs): + _metadata = ['name', 'target_entity', 'settings', 'transforms', 'label_type'] + + def __init__(self, + data=None, + name=None, + target_entity=None, + settings=None, + transforms=None, + label_type=None, + *args, + **kwargs): super().__init__(data=data, *args, **kwargs) self.name = name self.target_entity = target_entity - self.settings = settings or {} self.transforms = transforms or [] + self.plot = LabelPlots(self) + + if label_type is not None: + error = 'label type must be "continuous" or "discrete"' + assert label_type in ['continuous', 'discrete'], error + + self.label_type = label_type + self.settings = settings or {} + self.settings['label_type'] = self.label_type @property def _constructor(self): return LabelTimes @property - def distribution(self): - labels = self.assign(count=1) - labels = 
labels.groupby(self.name) - distribution = labels['count'].count() - return distribution + def is_discrete(self): + """Whether labels are discrete.""" + if self.label_type is None: + self.label_type = self.infer_type() + self.settings['label_type'] = self.label_type - def _plot_distribution(self, **kwargs): - plot = self.distribution.plot(kind='bar', **kwargs) - plot.set_title('Label Distribution') - plot.set_ylabel('count') - return plot + return self.label_type == 'discrete' + + @property + def distribution(self): + """Returns label distribution if labels are discrete.""" + if self.is_discrete: + labels = self.assign(count=1) + labels = labels.groupby(self.name) + distribution = labels['count'].count() + return distribution @property def count_by_time(self): - count = self.assign(count=1) - count = count.sort_values('cutoff_time') - count = count.set_index([self.name, 'cutoff_time']) - count = count.groupby(self.name) - count = count['count'].cumsum() - return count - - def _plot_count_by_time(self, **kwargs): - count = self.count_by_time - count = count.unstack(self.name) - count = count.ffill() - - plot = count.plot(kind='area', **kwargs) - plot.set_title('Label Count vs. 
Time') - plot.set_ylabel('count') - return plot - - def _with_plots(self): - self.plot.count_by_time = self._plot_count_by_time - self.plot.distribution = self._plot_distribution - return self + """Returns label count across cutoff times.""" + if self.is_discrete: + keys = ['cutoff_time', self.name] + value = self.groupby(keys).cutoff_time.count() + value = value.unstack(self.name).fillna(0) + value = value.cumsum() + return value + else: + value = self.groupby('cutoff_time') + value = value[self.name].count() + value = value.cumsum() + return value def describe(self): """Prints out label info with transform settings that reproduce labels.""" - print('Label Distribution\n' + '-' * 18, end='\n') - distribution = self[self.name].value_counts() - distribution.index = distribution.index.astype('str') - distribution['Total:'] = distribution.sum() - print(distribution.to_string(), end='\n\n\n') + if self.is_discrete: + print('Label Distribution\n' + '-' * 18, end='\n') + distribution = self[self.name].value_counts() + distribution.index = distribution.index.astype('str') + distribution['Total:'] = distribution.sum() + print(distribution.to_string(), end='\n\n\n') print('Settings\n' + '-' * 8, end='\n') settings = pd.Series(self.settings) @@ -99,7 +113,7 @@ def copy(self): """ labels = super().copy() labels.transforms = labels.transforms.copy() - return labels._with_plots() + return labels def threshold(self, value, inplace=False): """ @@ -115,6 +129,9 @@ def threshold(self, value, inplace=False): labels = self if inplace else self.copy() labels[self.name] = labels[self.name].gt(value) + labels.label_type = 'discrete' + labels.settings['label_type'] = 'discrete' + transform = {'__name__': 'threshold', 'value': value} labels.transforms.append(transform) @@ -225,6 +242,8 @@ def bin(self, bins, quantiles=False, labels=None, right=True): } label_times.transforms.append(transform) + label_times.label_type = 'discrete' + label_times.settings['label_type'] = 'discrete' return 
label_times def sample(self, n=None, frac=None, random_state=None): @@ -318,3 +337,19 @@ def sample(self, n=None, frac=None, random_state=None): labels = pd.concat(sample_per_label, axis=0, sort=False) return labels + + def infer_type(self): + """Infer label type. + + Returns: + str : Inferred label type. Either "continuous" or "discrete". + """ + dtype = self[self.name].dtype + is_discrete = pd.api.types.is_bool_dtype(dtype) + is_discrete = is_discrete or pd.api.types.is_categorical_dtype(dtype) + is_discrete = is_discrete or pd.api.types.is_object_dtype(dtype) + + if is_discrete: + return 'discrete' + else: + return 'continuous' diff --git a/composeml/tests/test_label_maker.py b/composeml/tests/test_label_maker.py index cc347332..33ac798c 100644 --- a/composeml/tests/test_label_maker.py +++ b/composeml/tests/test_label_maker.py @@ -421,3 +421,9 @@ def test_slice_overlap(transactions): start, end = df.context.window is_overlap = df.index == end assert not is_overlap.any() + + +def test_label_type(transactions): + lm = LabelMaker(target_entity='customer_id', time_index='time', labeling_function=total_spent) + lt = lm.search(transactions, num_examples_per_instance=1, label_type='discrete', verbose=False) + assert lt.label_type == 'discrete' diff --git a/composeml/tests/test_label_plots.py b/composeml/tests/test_label_plots.py index ad4d8f87..b77629fc 100644 --- a/composeml/tests/test_label_plots.py +++ b/composeml/tests/test_label_plots.py @@ -1,10 +1,21 @@ -def test_distribution_plot(labels): - labels = labels.threshold(200) - plot = labels.plot.distribution() - assert plot.get_title() == 'Label Distribution' +def test_count_by_time_categorical(total_spent): + labels = range(2) + total_spent = total_spent.bin(2, labels=labels) + ax = total_spent.plot.count_by_time() + assert ax.get_title() == 'Label Count vs. 
Cutoff Times' -def test_count_by_time_plot(labels): - labels = labels.threshold(200) - plot = labels.plot.count_by_time() - assert plot.get_title() == 'Label Count vs. Time' +def test_count_by_time_continuous(total_spent): + ax = total_spent.plot.count_by_time() + assert ax.get_title() == 'Label vs. Cutoff Times' + + +def test_distribution_categorical(total_spent): + ax = total_spent.bin(2, labels=range(2)) + ax = ax.plot.dist() + assert ax.get_title() == 'Label Distribution' + + +def test_distribution_continuous(total_spent): + ax = total_spent.plot.dist() + assert ax.get_title() == 'Label Distribution' diff --git a/composeml/tests/test_label_times.py b/composeml/tests/test_label_times.py index b1305a5c..97b10fba 100644 --- a/composeml/tests/test_label_times.py +++ b/composeml/tests/test_label_times.py @@ -1,9 +1,77 @@ -def test_describe(labels): - labels = labels.bin(2) - labels.settings.update(num_examples_per_instance=2) - assert labels.describe() is None +def test_count_by_time_categorical(total_spent): + labels = range(2) + given_answer = total_spent.bin(2, labels=labels).count_by_time + given_answer = given_answer.to_csv(header=True).splitlines() + answer = [ + 'cutoff_time,0,1', + '2019-01-01 08:00:00,0.0,1.0', + '2019-01-01 08:30:00,0.0,2.0', + '2019-01-01 09:00:00,0.0,3.0', + '2019-01-01 09:30:00,0.0,4.0', + '2019-01-01 10:00:00,0.0,5.0', + '2019-01-01 10:30:00,1.0,5.0', + '2019-01-01 11:00:00,2.0,5.0', + '2019-01-01 11:30:00,3.0,5.0', + '2019-01-01 12:00:00,4.0,5.0', + '2019-01-01 12:30:00,5.0,5.0', + ] -def test_describe_empty(labels): - labels.settings.clear() - assert labels.describe() is None + assert given_answer == answer + + +def test_count_by_time_continuous(total_spent): + given_answer = total_spent.count_by_time + given_answer = given_answer.to_csv(header=True).splitlines() + + answer = [ + 'cutoff_time,total_spent', + '2019-01-01 08:00:00,1', + '2019-01-01 08:30:00,2', + '2019-01-01 09:00:00,3', + '2019-01-01 09:30:00,4', + '2019-01-01 
10:00:00,5', + '2019-01-01 10:30:00,6', + '2019-01-01 11:00:00,7', + '2019-01-01 11:30:00,8', + '2019-01-01 12:00:00,9', + '2019-01-01 12:30:00,10', + ] + + assert given_answer == answer + + +def test_describe(total_spent): + assert total_spent.bin(2).describe() is None + + +def test_describe_no_settings(total_spent): + total_spent = total_spent.copy() + total_spent.settings.clear() + assert total_spent.describe() is None + + +def test_distribution_categorical(total_spent): + labels = range(2) + given_answer = total_spent.bin(2, labels=labels).distribution + given_answer = given_answer.to_csv(header=True).splitlines() + + answer = [ + 'total_spent,count', + '0,5', + '1,5', + ] + + assert given_answer == answer + + +def test_distribution_continous(total_spent): + assert total_spent.distribution is None + + +def test_infer_type(total_spent): + assert total_spent.infer_type() == 'continuous' + + total_spent = total_spent.threshold(5) + total_spent.label_type = None + assert total_spent.infer_type() == 'discrete' diff --git a/composeml/tests/test_label_transforms/test_threshold.py b/composeml/tests/test_label_transforms/test_threshold.py index 36a6c937..cffbe49e 100644 --- a/composeml/tests/test_label_transforms/test_threshold.py +++ b/composeml/tests/test_label_transforms/test_threshold.py @@ -1,13 +1,10 @@ -import pandas as pd - - def test_threshold(labels): - given_labels = labels.threshold(200) - transform = given_labels.transforms[0] + labels = labels.threshold(200) + transform = labels.transforms[0] assert transform['__name__'] == 'threshold' assert transform['value'] == 200 answer = [True, False, True, False] - labels = labels.assign(my_labeling_function=answer) - pd.testing.assert_frame_equal(given_labels, labels) + given_answer = labels[labels.name].values.tolist() + assert given_answer == answer diff --git a/composeml/tests/utils.py b/composeml/tests/utils.py index ddc5bb2f..da8cdea1 100644 --- a/composeml/tests/utils.py +++ b/composeml/tests/utils.py @@ 
-3,11 +3,11 @@ import pandas as pd -def read_csv(csv): +def read_csv(csv, **kwargs): if isinstance(csv, list): csv = '\n'.join(csv) with StringIO(csv) as file: - df = pd.read_csv(file) + df = pd.read_csv(file, **kwargs) return df diff --git a/docs/source/api_reference.rst b/docs/source/api_reference.rst index 460c4bcd..69b1c966 100644 --- a/docs/source/api_reference.rst +++ b/docs/source/api_reference.rst @@ -35,14 +35,23 @@ Transform Methods LabelTimes.sample LabelTimes.threshold +.. currentmodule:: composeml.label_plots + +Label Plots +=========== + +.. autosummary:: + :toctree: generated + :template: class.rst + :nosignatures: + + LabelPlots + Plotting Methods ---------------- -.. list-table:: - :widths: 25 75 - :header-rows: 0 +.. autosummary:: + :nosignatures: - * - :mod:`LabelTimes.plot.distribution` - - Plot the label distribution. - * - :mod:`LabelTimes.plot.count_by_time` - - Plot the label count vs. time. + LabelPlots.count_by_time + LabelPlots.distribution diff --git a/docs/source/examples/predict-next-purchase/example.ipynb b/docs/source/examples/predict-next-purchase/example.ipynb index 1157c15c..678e6bfe 100644 --- a/docs/source/examples/predict-next-purchase/example.ipynb +++ b/docs/source/examples/predict-next-purchase/example.ipynb @@ -234,7 +234,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Elapsed: 01:30 | Remaining: 00:00 | Progress: 100%|██████████| user_id: 19477/19477 \n" + "Elapsed: 01:37 | Remaining: 00:00 | Progress: 100%|██████████| user_id: 19477/19477 \n" ] }, { @@ -374,6 +374,73 @@ "source": [ "lt.describe()" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Plot Labels\n", + "\n", + "Additionally, there are plots available for insight into the labels.\n", + "\n", + "\n", + "#### Distribution\n", + "\n", + "This plot shows the label distribution." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "lt.plot.distribution();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Count by Time\n", + "\n", + "This plot shows the label distribution across cutoff times." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "lt.plot.count_by_time();" + ] } ], "metadata": { diff --git a/docs/source/getting_started.ipynb b/docs/source/getting_started.ipynb index 2048871d..43b4c9a5 100644 --- a/docs/source/getting_started.ipynb +++ b/docs/source/getting_started.ipynb @@ -10,8 +10,24 @@ "Getting Started\n", "===============\n", "\n", - "In this example, we will generate labels on a mock dataset of transactions. For each customer, we want to label whether the total purchase amount over the next hour of transactions will exceed $300. Additionally, we want to predict one hour in advance.\n", - "\n", + "In this example, we will generate labels on a mock dataset of transactions. For each customer, we want to label whether the total purchase amount over the next hour of transactions will exceed $300. Additionally, we want to predict one hour in advance." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import composeml as cp" + ] + }, + { + "cell_type": "raw", + "metadata": { + "raw_mimetype": "text/restructuredtext" + }, + "source": [ "Load Data\n", "=========\n", "\n", @@ -24,8 +40,6 @@ "metadata": {}, "outputs": [], "source": [ - "import composeml as cp\n", - "\n", "df = cp.demos.load_transactions()\n", "\n", "df[df.columns[:7]].head()" @@ -192,18 +206,41 @@ "\n", "Also, there are plots available for insight to the labels.\n", "\n", - ".. code-block:: python\n", "\n", - " import matplotlib.pyplot as plt\n", + "Distribution\n", + "------------\n", "\n", - " fig, axs = plt.subplots(1,2)\n", - " fig.subplots_adjust(wspace=.34)\n", + "This plot shows the label distribution." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "labels.plot.distribution();" + ] + }, + { + "cell_type": "raw", + "metadata": { + "raw_mimetype": "text/restructuredtext" + }, + "source": [ + "Count by Time\n", + "-------------\n", "\n", - " color = ['#4285F4', '#DB4437']\n", - " labels.plot.distribution(color=color, ax=axs[0])\n", - " labels.plot.count_by_time(figsize=(12, 4), color=color, ax=axs[1]);\n", - " \n", - ".. image:: images/getting_started_0.0.png\n" + "This plot shows the label distribution across cutoff times." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "labels.plot.count_by_time();" ] } ], diff --git a/docs/source/guides/using_label_transforms.ipynb b/docs/source/guides/using_label_transforms.ipynb index 7711ff48..0fe2106e 100644 --- a/docs/source/guides/using_label_transforms.ipynb +++ b/docs/source/guides/using_label_transforms.ipynb @@ -49,6 +49,7 @@ "labels = label_maker.search(\n", " cp.demos.load_transactions(),\n", " num_examples_per_instance=10,\n", + " label_type='continuous',\n", " minimum_data='2h',\n", " gap='2min',\n", " verbose=True,\n", diff --git a/requirements.txt b/requirements.txt index c1547c10..6ebc7499 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ pandas>=0.23.0 numpy>=1.13.3 tqdm>=4.19.2 -matplotlib>=3.0.2 \ No newline at end of file +matplotlib>=3.0.2 +seaborn>=0.9.0