Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Seaborn Plots #62

Merged
merged 31 commits into from
Sep 10, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 24 additions & 1 deletion composeml/conftest.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,30 @@
import pandas as pd
import pytest

from .label_times import LabelTimes
from composeml import LabelTimes
from composeml.tests.utils import read_csv


@pytest.fixture(scope="module")
def total_spent():
    """Label times fixture holding ten `total_spent` labels.

    The rows cover four customers, with cutoff times spaced thirty minutes
    apart and label values counting down from 9 to 0.
    """
    rows = [
        'id,customer_id,cutoff_time,total_spent',
        '0,0,2019-01-01 08:00:00,9',
        '1,0,2019-01-01 08:30:00,8',
        '2,1,2019-01-01 09:00:00,7',
        '3,1,2019-01-01 09:30:00,6',
        '4,1,2019-01-01 10:00:00,5',
        '5,2,2019-01-01 10:30:00,4',
        '6,2,2019-01-01 11:00:00,3',
        '7,2,2019-01-01 11:30:00,2',
        '8,2,2019-01-01 12:00:00,1',
        '9,3,2019-01-01 12:30:00,0',
    ]

    frame = read_csv(rows, index_col='id', parse_dates=['cutoff_time'])
    label_times = LabelTimes(data=frame, name='total_spent')

    # Record the setting directly on the settings dictionary.
    label_times.settings['num_examples_per_instance'] = -1
    return label_times


@pytest.fixture(scope="module")
Expand Down
13 changes: 9 additions & 4 deletions composeml/label_maker.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def to_offset(value):
class LabelMaker:
"""Automatically makes labels for prediction problems."""

def __init__(self, target_entity, time_index, labeling_function, window_size=None):
def __init__(self, target_entity, time_index, labeling_function, window_size=None, label_type=None):
"""Creates an instance of label maker.

Args:
Expand Down Expand Up @@ -255,6 +255,7 @@ def search(self,
minimum_data=None,
gap=None,
drop_empty=True,
label_type=None,
verbose=True,
*args,
**kwargs):
Expand All @@ -267,6 +268,7 @@ def search(self,
gap (str or int) : Time between examples. Default value is window size.
If an integer, search will start on the first event after the minimum data.
drop_empty (bool) : Whether to drop empty slices. Default value is True.
label_type (str) : The label type can be "continuous" or "discrete". Default value is the inferred label type.
kmax12 marked this conversation as resolved.
Show resolved Hide resolved
verbose (bool) : Whether to render progress bar. Default value is True.
*args : Positional arguments for labeling function.
**kwargs : Keyword arguments for labeling function.
Expand Down Expand Up @@ -325,16 +327,19 @@ def search(self,
progress_bar.update(n=total)
progress_bar.close()

labels = LabelTimes(data=labels, name=name, target_entity=self.target_entity)
labels = LabelTimes(data=labels, name=name, target_entity=self.target_entity, label_type=label_type)
labels = labels.rename_axis('id', axis=0)
labels = labels._with_plots()

if labels.empty:
return labels

if labels.is_discrete:
labels[labels.name] = labels[labels.name].astype('category')

labels.settings.update({
'labeling_function': name,
'num_examples_per_instance': num_examples_per_instance,
'minimum_data': minimum_data,
'minimum_data': str(minimum_data),
'window_size': self.window_size,
'gap': gap,
})
Expand Down
88 changes: 88 additions & 0 deletions composeml/label_plots.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import matplotlib as mpl
import pandas as pd
import seaborn as sns

pd.plotting.register_matplotlib_converters()
sns.set_context('notebook')
sns.set_style('darkgrid')
COLOR = sns.color_palette("Set1", n_colors=100, desat=.75)


class LabelPlots:
    """Creates plots for Label Times."""

    def __init__(self, label_times):
        """Initializes Label Plots.

        Args:
            label_times (LabelTimes) : instance of Label Times
        """
        self._label_times = label_times

    def count_by_time(self, ax=None, **kwargs):
        """Plots the label count across cutoff times.

        Args:
            ax (matplotlib.axes.Axes) : Axes to draw on. A new axes is created when omitted.
            **kwargs : Keyword arguments for the plotting call. They are passed to the stack plot
                when the count has more than one dimension, otherwise to the filled area plot.

        Returns:
            matplotlib.axes.Axes : The axes containing the plot.
        """
        # Sort into a new object so the value returned by the property is not mutated.
        count_by_time = self._label_times.count_by_time.sort_index()

        # Compare against None explicitly instead of relying on the truthiness of an axes.
        if ax is None:
            ax = mpl.pyplot.axes()

        vmin = count_by_time.index.min()
        vmax = count_by_time.index.max()
        ax.set_xlim(vmin, vmax)

        locator = mpl.dates.AutoDateLocator()
        formatter = mpl.dates.AutoDateFormatter(locator)
        ax.xaxis.set_major_locator(locator)
        ax.xaxis.set_major_formatter(formatter)
        ax.figure.autofmt_xdate()

        if len(count_by_time.shape) > 1:
            # Two-dimensional count: one stacked area per column.
            ax.stackplot(
                count_by_time.index,
                count_by_time.values.T,
                labels=count_by_time.columns,
                colors=COLOR,
                alpha=.9,
                **kwargs,
            )

            ax.legend(
                loc='upper left',
                title=self._label_times.name,
                facecolor='w',
                framealpha=.9,
            )

            ax.set_title('Label Count vs. Cutoff Times')
            ax.set_ylabel('Count')
            ax.set_xlabel('Time')

        else:
            # One-dimensional count: a single filled area. The keyword arguments
            # are forwarded here as well, and may override the default color.
            fill_kwargs = {'color': COLOR[1]}
            fill_kwargs.update(kwargs)

            ax.fill_between(
                count_by_time.index,
                count_by_time.values.T,
                **fill_kwargs,
            )

            ax.set_title('Label vs. Cutoff Times')
            ax.set_ylabel(self._label_times.name)
            ax.set_xlabel('Time')

        return ax

    @property
    def dist(self):
        """Alias for distribution."""
        # Returns the bound method, so ``plot.dist()`` behaves like ``plot.distribution()``.
        return self.distribution

    def distribution(self, **kwargs):
        """Plots the label distribution.

        Discrete labels are drawn as a count plot. Other labels are drawn
        as a distribution plot with a kernel density estimate.

        Args:
            **kwargs : Keyword arguments for the seaborn plotting function.

        Returns:
            matplotlib.axes.Axes : The axes containing the plot.
        """
        dist = self._label_times[self._label_times.name]

        if self._label_times.is_discrete:
            # Pass the labels by keyword: the first positional parameter of
            # countplot is not ``x`` in every seaborn release.
            ax = sns.countplot(x=dist, palette=COLOR, **kwargs)
        else:
            ax = sns.distplot(dist, kde=True, color=COLOR[1], **kwargs)

        ax.set_title('Label Distribution')
        ax.set_ylabel('Count')
        return ax
121 changes: 78 additions & 43 deletions composeml/label_times.py
Original file line number Diff line number Diff line change
@@ -1,73 +1,87 @@
import pandas as pd

from composeml.label_plots import LabelPlots


class LabelTimes(pd.DataFrame):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the label times class should store the type of label it is

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added label_type attribute to label times

"""
A data frame containing labels made by a label maker.
"""A data frame containing labels made by a label maker.

Attributes:
name
target_entity
settings
transforms
label_type
"""
_metadata = ['name', 'target_entity', 'settings', 'transforms']

def __init__(self, data=None, name=None, target_entity=None, settings=None, transforms=None, *args, **kwargs):
_metadata = ['name', 'target_entity', 'settings', 'transforms', 'label_type']

def __init__(self,
data=None,
name=None,
target_entity=None,
settings=None,
transforms=None,
label_type=None,
*args,
**kwargs):
super().__init__(data=data, *args, **kwargs)

self.name = name
self.target_entity = target_entity
self.settings = settings or {}
self.transforms = transforms or []
self.plot = LabelPlots(self)

if label_type is not None:
error = 'label type must be "continuous" or "discrete"'
assert label_type in ['continuous', 'discrete'], error

self.label_type = label_type
self.settings = settings or {}
self.settings['label_type'] = self.label_type

@property
def _constructor(self):
return LabelTimes

@property
def distribution(self):
labels = self.assign(count=1)
labels = labels.groupby(self.name)
distribution = labels['count'].count()
return distribution
def is_discrete(self):
"""Whether labels are discrete."""
if self.label_type is None:
self.label_type = self.infer_type()
self.settings['label_type'] = self.label_type

def _plot_distribution(self, **kwargs):
plot = self.distribution.plot(kind='bar', **kwargs)
plot.set_title('Label Distribution')
plot.set_ylabel('count')
return plot
return self.label_type == 'discrete'

@property
def distribution(self):
"""Returns label distribution if labels are discrete."""
if self.is_discrete:
labels = self.assign(count=1)
labels = labels.groupby(self.name)
distribution = labels['count'].count()
return distribution

@property
def count_by_time(self):
count = self.assign(count=1)
count = count.sort_values('cutoff_time')
count = count.set_index([self.name, 'cutoff_time'])
count = count.groupby(self.name)
count = count['count'].cumsum()
return count

def _plot_count_by_time(self, **kwargs):
count = self.count_by_time
count = count.unstack(self.name)
count = count.ffill()

plot = count.plot(kind='area', **kwargs)
plot.set_title('Label Count vs. Time')
plot.set_ylabel('count')
return plot

def _with_plots(self):
self.plot.count_by_time = self._plot_count_by_time
self.plot.distribution = self._plot_distribution
return self
"""Returns label count across cutoff times."""
if self.is_discrete:
keys = ['cutoff_time', self.name]
value = self.groupby(keys).cutoff_time.count()
value = value.unstack(self.name).fillna(0)
value = value.cumsum()
return value
else:
value = self.groupby('cutoff_time')
value = value[self.name].count()
value = value.cumsum()
return value

def describe(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the describe method should say the label type

Copy link
Collaborator Author

@jeff-hernandez jeff-hernandez Sep 9, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should label type be under settings? I guess it is a parameter of search.

"""Prints out label info with transform settings that reproduce labels."""
print('Label Distribution\n' + '-' * 18, end='\n')
distribution = self[self.name].value_counts()
distribution.index = distribution.index.astype('str')
distribution['Total:'] = distribution.sum()
print(distribution.to_string(), end='\n\n\n')
if self.is_discrete:
print('Label Distribution\n' + '-' * 18, end='\n')
distribution = self[self.name].value_counts()
distribution.index = distribution.index.astype('str')
distribution['Total:'] = distribution.sum()
print(distribution.to_string(), end='\n\n\n')

print('Settings\n' + '-' * 8, end='\n')
settings = pd.Series(self.settings)
Expand Down Expand Up @@ -99,7 +113,7 @@ def copy(self):
"""
labels = super().copy()
labels.transforms = labels.transforms.copy()
return labels._with_plots()
return labels

def threshold(self, value, inplace=False):
"""
Expand All @@ -115,6 +129,9 @@ def threshold(self, value, inplace=False):
labels = self if inplace else self.copy()
labels[self.name] = labels[self.name].gt(value)

labels.label_type = 'discrete'
labels.settings['label_type'] = 'discrete'

transform = {'__name__': 'threshold', 'value': value}
labels.transforms.append(transform)

Expand Down Expand Up @@ -225,6 +242,8 @@ def bin(self, bins, quantiles=False, labels=None, right=True):
}

label_times.transforms.append(transform)
label_times.label_type = 'discrete'
label_times.settings['label_type'] = 'discrete'
return label_times

def sample(self, n=None, frac=None, random_state=None):
Expand Down Expand Up @@ -318,3 +337,19 @@ def sample(self, n=None, frac=None, random_state=None):

labels = pd.concat(sample_per_label, axis=0, sort=False)
return labels

def infer_type(self):
    """Infer label type.

    Labels with a boolean, categorical, object, or string data type are
    inferred as discrete. Labels with any other data type are inferred as
    continuous.

    Returns:
        str : Inferred label type. Either "continuous" or "discrete".
    """
    dtype = self[self.name].dtype
    types = pd.api.types

    is_discrete = (
        types.is_bool_dtype(dtype)
        # The isinstance check replaces the deprecated ``is_categorical_dtype``.
        or isinstance(dtype, types.CategoricalDtype)
        or types.is_object_dtype(dtype)
        # Also covers the dedicated string data type, which is not an object data type.
        or types.is_string_dtype(dtype)
    )

    return 'discrete' if is_discrete else 'continuous'
6 changes: 6 additions & 0 deletions composeml/tests/test_label_maker.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,3 +426,9 @@ def test_slice_overlap(transactions):
start, end = metadata['window']
is_overlap = df.index == end
assert not is_overlap.any()


def test_label_type(transactions):
    """The label type passed to search is kept on the resulting label times."""
    label_maker = LabelMaker(
        target_entity='customer_id',
        time_index='time',
        labeling_function=total_spent,
    )

    label_times = label_maker.search(
        transactions,
        num_examples_per_instance=1,
        label_type='discrete',
        verbose=False,
    )

    assert label_times.label_type == 'discrete'
27 changes: 19 additions & 8 deletions composeml/tests/test_label_plots.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,21 @@
def test_distribution_plot(labels):
labels = labels.threshold(200)
plot = labels.plot.distribution()
assert plot.get_title() == 'Label Distribution'
def test_count_by_time_categorical(total_spent):
    """Binned labels produce the label count plot across cutoff times."""
    binned = total_spent.bin(2, labels=range(2))
    axes = binned.plot.count_by_time()
    title = axes.get_title()
    assert title == 'Label Count vs. Cutoff Times'


def test_count_by_time_plot(labels):
labels = labels.threshold(200)
plot = labels.plot.count_by_time()
assert plot.get_title() == 'Label Count vs. Time'
def test_count_by_time_continuous(total_spent):
    """Labels that are not binned produce the plot titled by label, not by count."""
    axes = total_spent.plot.count_by_time()
    title = axes.get_title()
    assert title == 'Label vs. Cutoff Times'


def test_distribution_categorical(total_spent):
    """The distribution alias plots binned labels."""
    binned = total_spent.bin(2, labels=range(2))
    axes = binned.plot.dist()
    title = axes.get_title()
    assert title == 'Label Distribution'


def test_distribution_continuous(total_spent):
    """The distribution alias plots labels that are not binned."""
    axes = total_spent.plot.dist()
    title = axes.get_title()
    assert title == 'Label Distribution'
Loading