In [1]:
import os

import pandas as pd

## Import Woodwork (to hit decorator)

In [2]:
import woodwork as ww

## Create DataFrame

In [3]:
# The two commented-out lines below show how retail_data.csv can be (re)generated from
# Woodwork's demo retail data; uncomment them to recreate the file.
# df = ww.demo.load_retail(nrows=100, return_dataframe=True)
# df.to_csv('retail_data.csv', index=False)

df = pd.read_csv('retail_data.csv', nrows=100)
df.head(5)

Unnamed: 0,order_product_id,order_id,product_id,description,quantity,order_date,unit_price,customer_name,country,total,cancelled
0,0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,4.2075,Andrea Brown,United Kingdom,25.245,False
1,1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,5.5935,Andrea Brown,United Kingdom,33.561,False
2,2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,4.5375,Andrea Brown,United Kingdom,36.3,False
3,3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,5.5935,Andrea Brown,United Kingdom,33.561,False
4,4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,5.5935,Andrea Brown,United Kingdom,33.561,False


### DataFrame Dtypes

In [4]:
df.dtypes

order_product_id      int64
order_id              int64
product_id           object
description          object
quantity              int64
order_date           object
unit_price          float64
customer_name        object
country              object
total               float64
cancelled              bool
dtype: object

## Use the Woodwork Accessor

The point of the accessor is to have a DataFrame that we can work with as normal while also having the information from a DataTable available. The way this works is that each DataFrame object has its own DataTableAccessor. This means that if you reassign the DataFrame (i.e. bind the name to a new DataFrame object), you lose the typing information from the DataTable.

We're going to walk through some use cases, but first we have to know how to set up the DataTable accessor according to our preferences.

We can look at the DataTableAccessor as follows:

In [5]:
df.ww

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Data Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
order_product_id,Int64,Integer,['numeric']
order_id,Int64,Integer,['numeric']
product_id,category,Categorical,['category']
description,string,NaturalLanguage,[]
quantity,Int64,Integer,['numeric']
order_date,datetime64[ns],Datetime,[]
unit_price,float64,Double,['numeric']
customer_name,string,NaturalLanguage,[]
country,string,NaturalLanguage,[]
total,float64,Double,['numeric']


Notice how this is the repr of a DataTable! The above result is the same as if you did `DataTable(df)`. There are no extra parameters provided to the DataTable, so we just have the automatically inferred Logical Types and semantic tags. You can access all methods and properties on a DataTable just like this.

Notice that the dtypes of the dataframe have changed to match those of the DataTable Accessor.

In [6]:
df.dtypes

order_product_id             Int64
order_id                     Int64
product_id                category
description                 string
quantity                     Int64
order_date          datetime64[ns]
unit_price                 float64
customer_name               string
country                     string
total                      float64
cancelled                  boolean
dtype: object

Likely, we'll want to set our own Logical Types or semantic tags. We can do that using any of DataTable's methods!

## Set DataTable attributes via DataTable methods

Notice how we need to set `df.ww.dt` any time a method returns a new DataTable. We're currently pulling methods directly from DataTable, which means that any DataTable attribute is available to us, but we have to follow its rules and update the `dt` on the accessor.

### Set Semantic Tags

In [7]:
# add_semantic_tags returns a new DataTable, so assign it back to the accessor's `dt`.
df.ww.dt = df.ww.add_semantic_tags({'order_date': 'tag_1'})
df.ww.types

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Data Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
order_product_id,Int64,Integer,['numeric']
order_id,Int64,Integer,['numeric']
product_id,category,Categorical,['category']
description,string,NaturalLanguage,[]
quantity,Int64,Integer,['numeric']
order_date,datetime64[ns],Datetime,['tag_1']
unit_price,float64,Double,['numeric']
customer_name,string,NaturalLanguage,[]
country,string,NaturalLanguage,[]
total,float64,Double,['numeric']


### Set the time index

In [8]:
# set_time_index's result is likewise stored on the accessor's `dt` to keep the change.
df.ww.dt = df.ww.set_time_index('order_date')
df.ww.types

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Data Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
order_product_id,Int64,Integer,['numeric']
order_id,Int64,Integer,['numeric']
product_id,category,Categorical,['category']
description,string,NaturalLanguage,[]
quantity,Int64,Integer,['numeric']
order_date,datetime64[ns],Datetime,"['time_index', 'tag_1']"
unit_price,float64,Double,['numeric']
customer_name,string,NaturalLanguage,[]
country,string,NaturalLanguage,[]
total,float64,Double,['numeric']


Of course, it can be tedious to set each element of the DataTable individually, so the DataTable accessor provides an `init` method, which lets us pass the parameters we'd pass to DataTable.

We'll reread the csv to start from ground zero with the DataTable.

### Set Initial DataTable Attributes (index, time index, name, logical types)

In [9]:
# Re-read the CSV so nothing set on the accessor in earlier cells carries over.
df = pd.read_csv('retail_data.csv', nrows=100)

# Configure index, time index, table name and logical type overrides in a single call.
df.ww.init(
    index='order_product_id',
    time_index='order_date',
    logical_types={'customer_name': 'FullName', 'country': 'CountryCode'},
    name='retail_data',
)
df.ww

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Data Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
order_product_id,Int64,Integer,['index']
order_id,Int64,Integer,['numeric']
product_id,category,Categorical,['category']
description,string,NaturalLanguage,[]
quantity,Int64,Integer,['numeric']
order_date,datetime64[ns],Datetime,['time_index']
unit_price,float64,Double,['numeric']
customer_name,string,FullName,[]
country,category,CountryCode,['category']
total,float64,Double,['numeric']


If we look at the dtypes on the dataframe, we'll see the dtypes of `quantity` and `country` have changed as well!

In [10]:
df.dtypes

order_product_id             Int64
order_id                     Int64
product_id                category
description                 string
quantity                     Int64
order_date          datetime64[ns]
unit_price                 float64
customer_name               string
country                   category
total                      float64
cancelled                  boolean
dtype: object

## Working with Your DataFrame + DataTable Accessor

### Select Based on Tags / Logical Types

Now that we have our types set, let's use them! Say we want a DataFrame that only has `FullName` and `CountryCode` columns. We'll use `select` for that, giving us an updated DataTable with just the relevant columns.

But, our actual goal here is to get the DataFrame. This is how we can do that:

In [11]:
# Select by logical type name; the result is a DataTable with only the matching columns.
selected_dt = df.ww.select(['FullName', 'CountryCode'])
selected_dt

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Data Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
customer_name,string,FullName,[]
country,category,CountryCode,['category']


Now we can easily get that DataFrame, but we've lost some of our DataTable typing info.

In [12]:
# `df` now names the DataFrame pulled out of the DataTable; its accessor shows inferred
# types (see the output), not the FullName/CountryCode overrides.
df = selected_dt.to_dataframe()
df.ww

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Data Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
customer_name,string,NaturalLanguage,[]
country,category,Categorical,['category']


The good news is that our DataTable will match our dataframe perfectly, so we can easily fix that:

In [13]:
# Attach the DataTable produced by `select` so the accessor reports its logical types again.
df.ww.dt = selected_dt
df.ww

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Data Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
customer_name,string,FullName,[]
country,category,CountryCode,['category']


### Update your DataFrame

In the example above, we performed an action on a DataTable and had to get out the DataFrame and update it to keep the typing info up to date. The converse is also possible.

If we want to change a DataFrame--say via `sample`--we will get a new DataFrame devoid of any typing info, so we'll need to update it. Let's see how we can do that:

In [14]:
# Fresh DataFrame with a fully configured accessor.
df = pd.read_csv('retail_data.csv', nrows=100)
df.ww.init(
    index='order_product_id',
    time_index='order_date',
    logical_types={'customer_name': 'FullName', 'country': 'CountryCode'},
    name='retail_data',
)

# Keep a handle on the configured DataTable so it can be re-attached once `df` is replaced.
original_dt = df.ww.dt
df.ww

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Data Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
order_product_id,Int64,Integer,['index']
order_id,Int64,Integer,['numeric']
product_id,category,Categorical,['category']
description,string,NaturalLanguage,[]
quantity,Int64,Integer,['numeric']
order_date,datetime64[ns],Datetime,['time_index']
unit_price,float64,Double,['numeric']
customer_name,string,FullName,[]
country,category,CountryCode,['category']
total,float64,Double,['numeric']


In [15]:
# `sample` returns a new DataFrame, so the accessor on `df` reports freshly inferred types.
# random_state pins the sampled rows so the notebook reproduces on Restart & Run All.
df = df.sample(50, random_state=0)
df.ww

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Data Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
order_product_id,Int64,Integer,['numeric']
order_id,Int64,Integer,['numeric']
product_id,category,Categorical,['category']
description,string,NaturalLanguage,[]
quantity,Int64,Integer,['numeric']
order_date,datetime64[ns],Datetime,[]
unit_price,float64,Double,['numeric']
customer_name,string,NaturalLanguage,[]
country,category,Categorical,['category']
total,float64,Double,['numeric']


In [16]:
# Hand the sampled DataFrame to the saved DataTable (presumably replacing its underlying
# data -- confirm against the Woodwork API), then attach that DataTable to the new
# DataFrame's accessor so the index, time index and logical types are reported again.
original_dt.update_dataframe(df)
df.ww.dt = original_dt
df.ww

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Data Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
order_product_id,Int64,Integer,['index']
order_id,Int64,Integer,['numeric']
product_id,category,Categorical,['category']
description,string,NaturalLanguage,[]
quantity,Int64,Integer,['numeric']
order_date,datetime64[ns],Datetime,['time_index']
unit_price,float64,Double,['numeric']
customer_name,string,FullName,[]
country,category,CountryCode,['category']
total,float64,Double,['numeric']


These two examples above highlight the benefits of keeping your DataFrame and DataTable in sync. This is all currently being done with very minimal code written outside of what currently exists, but it's likely that there'll be good ways to simplify this process.

Let's work through some more use cases that might come up.

### Using iloc

iloc exists both on DataFrames and DataTables. Let's look at two different ways of using iloc to get a new Dataframe with the correct typing info.

In [17]:
# DataFrame.iloc: slice the DataFrame itself, then carry the typing info over by hand.
df = pd.read_csv('retail_data.csv', nrows=100)
df.ww.init(
    index='order_product_id',
    time_index='order_date',
    logical_types={'customer_name': 'FullName', 'country': 'CountryCode'},
    name='retail_data',
)

# Capture the DataTable before `df` is rebound to the sliced DataFrame.
original_dt = df.ww.dt
df = df.iloc[:, -4:]

# Subset the saved DataTable to the surviving columns and attach it to the new DataFrame.
df.ww.dt = original_dt[df.columns.tolist()]
df.ww


Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Data Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
customer_name,string,FullName,[]
country,category,CountryCode,['category']
total,float64,Double,['numeric']
cancelled,boolean,Boolean,[]


In [18]:
# DataTable.iloc: slice through the accessor, which yields a DataTable for the slice.
df = pd.read_csv('retail_data.csv', nrows=100)
df.ww.init(
    index='order_product_id',
    time_index='order_date',
    logical_types={'customer_name': 'FullName', 'country': 'CountryCode'},
    name='retail_data',
)

updated_dt = df.ww.iloc[:, -4:]

# Pull the DataFrame out of the sliced DataTable and attach that DataTable to it.
df = updated_dt.to_dataframe()
df.ww.dt = updated_dt
df.ww

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Data Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
customer_name,string,FullName,[]
country,category,CountryCode,['category']
total,float64,Double,['numeric']
cancelled,boolean,Boolean,[]


## In Place Changes

We've looked at various DataTable methods that create a new DataTable, but let's look at one that mutates the DataTable itself. 

Note: Any of this behavior can be changed based on input!

### Pop a column

In [19]:
# DataFrame.pop
df = pd.read_csv('retail_data.csv', nrows=100)
df.ww.init(name='retail_data',
           time_index='order_date', 
           logical_types={'customer_name':'FullName', 'country':'CountryCode'}, 
           index='order_product_id')

# Popping from the DataFrame removes the column there, but won't remove it from the DataTable
series = df.pop('cancelled')
assert 'cancelled' not in df.columns
assert 'cancelled' in df.ww.columns

# Re-sync: subset the DataTable to the columns still present in the DataFrame
df.ww.dt = df.ww.dt[list(df.columns)]
assert 'cancelled' not in df.columns
assert 'cancelled' not in df.ww.columns
df.ww # still has relevant types because `df` was changed in place, never reassigned

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Data Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
order_product_id,Int64,Integer,['index']
order_id,Int64,Integer,['numeric']
product_id,category,Categorical,['category']
description,string,NaturalLanguage,[]
quantity,Int64,Integer,['numeric']
order_date,datetime64[ns],Datetime,['time_index']
unit_price,float64,Double,['numeric']
customer_name,string,FullName,[]
country,category,CountryCode,['category']
total,float64,Double,['numeric']


In [20]:
# DataTable.pop
df = pd.read_csv('retail_data.csv', nrows=100)
df.ww.init(name='retail_data',
           time_index='order_date', 
           logical_types={'customer_name':'FullName', 'country':'CountryCode'}, 
           index='order_product_id')

# Popping through the accessor removes the column from the DataTable,
# but won't remove it from the DataFrame
datacolumn = df.ww.pop('cancelled')
assert 'cancelled' in df.columns
assert 'cancelled' not in df.ww.columns

# Drop in place on purpose: rebinding `df` to a new DataFrame would lose the typing info
# held by this DataFrame's accessor
df.drop('cancelled',axis=1, inplace=True)
assert 'cancelled' not in df.columns
assert 'cancelled' not in df.ww.columns

df.ww

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Data Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
order_product_id,Int64,Integer,['index']
order_id,Int64,Integer,['numeric']
product_id,category,Categorical,['category']
description,string,NaturalLanguage,[]
quantity,Int64,Integer,['numeric']
order_date,datetime64[ns],Datetime,['time_index']
unit_price,float64,Double,['numeric']
customer_name,string,FullName,[]
country,category,CountryCode,['category']
total,float64,Double,['numeric']


### Change column dtype (or logical type) with astype

The use case here is that you want to update the dtype of your DataFrame.

#### Via `astype`

In [21]:
df = pd.read_csv('retail_data.csv', nrows=100)
df.ww.init(name='retail_data',
           time_index='order_date', 
           logical_types={'customer_name':'FullName', 'country':'CountryCode'}, 
           index='order_product_id')

# Cast the DataFrame column directly; the asserts below show the DataTable still reports
# the old dtype afterwards
df['cancelled'] = df['cancelled'].astype('int')

assert df['cancelled'].dtype == 'int64'
assert df.ww.columns['cancelled'].dtype == 'boolean'
# It's not actually possible to have 'int64' as a dtype in a DataTable, so setting the
# Integer logical type gives the DataTable column the 'Int64' dtype while the DataFrame
# column stays 'int64'
df.ww.dt = df.ww.set_types(logical_types={'cancelled':'Integer'})
assert df['cancelled'].dtype == 'int64'
assert df.ww.columns['cancelled'].dtype == 'Int64'
assert df.ww.columns['cancelled'].dtype != df['cancelled'].dtype 

# Bring the DataFrame column in line with the DataTable's dtype
df['cancelled'] = df['cancelled'].astype('Int64')
assert df['cancelled'].dtype == 'Int64'
assert df.ww.columns['cancelled'].dtype == 'Int64'
assert df.ww.columns['cancelled'].dtype == df['cancelled'].dtype 

We can see above that just using `df.ww.set_types` would have achieved the same result without going through `df.astype`. 

### Setitem 



In [22]:
df = pd.read_csv('retail_data.csv', nrows=100)
df.ww.init(name='retail_data',
           time_index='order_date', 
           logical_types={'customer_name':'FullName', 'country':'CountryCode'}, 
           index='order_product_id')

# Build a DataColumn of 100 values (the DataFrame above was read with nrows=100)
dc = ww.DataColumn(pd.Series([1]*100))
# NOTE(review): __setitem__ is called explicitly; the "To Do" section at the end of this
# notebook suggests `df.ww['new_col'] = ...` is not yet supported by the accessor -- confirm
df.ww.__setitem__('new_col', dc)

# At this point the column exists on the DataTable only
assert 'new_col' in df.ww.columns
assert 'new_col' not in df.columns

# Copy the DataColumn's Series onto the DataFrame so both sides have the column
df['new_col'] = df.ww.columns['new_col'].to_series()
assert 'new_col' in df.ww.columns
assert 'new_col' in df.columns

## Other DataTable Functionality

### Serialization
We can serialize!

In [23]:
df.ww.to_csv('test')

In [24]:
# Show what `df.ww.to_csv('test')` wrote: the output directory, then its `data` subdirectory.
# (`os` is imported with the other imports at the top of the notebook.)
print(os.listdir('test'))
print(os.listdir('test/data'))

['table_description.json', 'data']
['retail_data.csv']


### Statistical Insights

In [25]:
# Start from a freshly read DataFrame with a fully configured accessor.
df = pd.read_csv('retail_data.csv', nrows=100)
df.ww.init(
    index='order_product_id',
    time_index='order_date',
    logical_types={'customer_name': 'FullName', 'country': 'CountryCode'},
    name='retail_data',
)


In [26]:
df.ww.value_counts()

{'product_id': [{'value': '84029E', 'count': 3},
  {'value': '22752', 'count': 3},
  {'value': '71053', 'count': 3},
  {'value': '22632', 'count': 3},
  {'value': '84029G', 'count': 3},
  {'value': '84406B', 'count': 3},
  {'value': '85123A', 'count': 3},
  {'value': '21730', 'count': 3},
  {'value': '22633', 'count': 3},
  {'value': '21068', 'count': 2}],
 'country': [{'value': 'United Kingdom', 'count': 80},
  {'value': 'France', 'count': 20}]}

## Woodwork DataColumn Accessor

Just like the DataTable Accessor, we have a DataColumn Accessor on pandas Series that gives you DataColumn functionality via `series.ww`.

In [27]:
# Update Logical Type
df = pd.read_csv('retail_data.csv', nrows=100)
customer_name = df['customer_name']

# Logical type reported by the Series accessor before any override
print(customer_name.ww.logical_type)
# Assign set_logical_type's result back to the Series accessor's `dc` so the change is kept
customer_name.ww.dc = customer_name.ww.set_logical_type('FullName')
customer_name.ww.logical_type

NaturalLanguage


FullName

In [28]:
# Change dtype
quantity = df['quantity']
# The Series reports its pandas dtype; the accessor reports the DataColumn's dtype.
print(quantity.dtype, quantity.ww.dtype)

# Changing the logical type changes the DataColumn's dtype; the Series itself is unchanged.
quantity.ww.dc = quantity.ww.set_logical_type('NaturalLanguage')
print(quantity.dtype, quantity.ww.dtype)

# Replace the Series with the DataColumn's own Series via the public to_series() method
# (as the setitem example does) rather than the private `_series` attribute, then
# re-attach the DataColumn to the new Series so both report the same dtype.
dc = quantity.ww.dc
quantity = quantity.ww.dc.to_series()
quantity.ww.dc = dc
print(quantity.dtype, quantity.ww.dtype)



int64 Int64
int64 string
string string


# To Do 

### Add Util Functions

```python
def describe(dataframe):
    """
        dataframe (pd.DataFrame): DataFrame with woodwork init
    """
```

The end user usage would look like this:

```python
from woodwork import describe 

dataframe_described = describe(df)
```

### Handle DataTable setters and setitem/getitem

Currently the index and time_index setters will need to be directly added to the Accessor in order to achieve this functionality:
```python
# Setting with index
df.ww.index = 'id'
# __setitem__
df.ww['new_col'] = DataColumn(..)
```
