![Woodwork](docs/source/images/woodwork.svg)

### Woodwork is a library that helps with data typing of 2-dimensional tabular data structures.

## Load Data

In [1]:
import pandas as pd

# Read only the first 100 rows of the hosted online-retail log.
data = pd.read_csv(
    "https://featuretools-static.s3.amazonaws.com/online-retail-logs-2018-08-28.csv",
    nrows=100,
)

# Prepend a sequential 0..n-1 column so every row has a unique id that can
# later serve as the table index.
data.insert(loc=0, column='order_product_id', value=range(len(data)))

# Preview the first three rows.
data.head(3)

Unnamed: 0,order_product_id,order_id,product_id,description,quantity,order_date,unit_price,customer_name,country,total,cancelled
0,0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,4.2075,Andrea Brown,United Kingdom,25.245,False
1,1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,5.5935,Andrea Brown,United Kingdom,33.561,False
2,2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,4.5375,Andrea Brown,United Kingdom,36.3,False


## Creating a DataTable

In [2]:
import woodwork as ww

# Wrap the DataFrame in a DataTable named "retail", using the synthetic row id
# as the index and the order timestamp as the time index.
dt = ww.DataTable(
    data,
    name="retail",
    index='order_product_id',
    time_index='order_date',
)

# Display the physical type, logical type and semantic tags of each column.
dt.types

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Data Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
order_product_id,Int64,WholeNumber,{index}
order_id,Int64,WholeNumber,{numeric}
product_id,category,Categorical,{category}
description,string,NaturalLanguage,{}
quantity,Int64,WholeNumber,{numeric}
order_date,datetime64[ns],Datetime,{time_index}
unit_price,float64,Double,{numeric}
customer_name,string,NaturalLanguage,{}
country,string,NaturalLanguage,{}
total,float64,Double,{numeric}


## Updating Logical Types

In [3]:
# Override the logical type of three columns by name; the refreshed type table
# shows the resulting physical types and semantic tags.
dt.set_logical_types(dict(
    order_id='Categorical',
    customer_name='FullName',
    country='CountryCode',
))
dt.types

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Data Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
order_product_id,Int64,WholeNumber,{index}
order_id,category,Categorical,{category}
product_id,category,Categorical,{category}
description,string,NaturalLanguage,{}
quantity,Int64,WholeNumber,{numeric}
order_date,datetime64[ns],Datetime,{time_index}
unit_price,float64,Double,{numeric}
customer_name,string,FullName,{}
country,category,CountryCode,{category}
total,float64,Double,{numeric}


## Adding Semantic Tags

In [4]:
# Attach the custom 'date_of_birth' semantic tag to order_date. In the type
# table shown after this cell the column keeps its 'time_index' tag as well.
dt.set_semantic_tags({'order_date':'date_of_birth'})
dt.types

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Data Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
order_product_id,Int64,WholeNumber,{index}
order_id,category,Categorical,{category}
product_id,category,Categorical,{category}
description,string,NaturalLanguage,{}
quantity,Int64,WholeNumber,{numeric}
order_date,datetime64[ns],Datetime,"{date_of_birth, time_index}"
unit_price,float64,Double,{numeric}
customer_name,string,FullName,{}
country,category,CountryCode,{category}
total,float64,Double,{numeric}


## Selecting Columns

### Select with Logical Types

In [5]:
# Build a new DataTable holding only the columns whose logical type is
# WholeNumber or Double (order_id is excluded: it was switched to Categorical).
numeric_dt = dt.select_ltypes(['WholeNumber', 'Double'])
numeric_dt.types

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Data Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
order_product_id,Int64,WholeNumber,{index}
quantity,Int64,WholeNumber,{numeric}
unit_price,float64,Double,{numeric}
total,float64,Double,{numeric}


### Select with Semantic Tags

In [6]:
# Build a new DataTable holding only the columns tagged 'category'.
category_dt = dt.select_semantic_tags('category')
category_dt.types

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Data Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
order_id,category,Categorical,{category}
product_id,category,Categorical,{category}
country,category,CountryCode,{category}


### Select with Both

In [7]:
# select() takes logical type names and semantic tags in the same list:
# here the Boolean logical type plus the custom 'date_of_birth' tag.
mixed_dt = dt.select(['Boolean', 'date_of_birth'])
mixed_dt.types

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Data Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
order_date,datetime64[ns],Datetime,"{date_of_birth, time_index}"
cancelled,boolean,Boolean,{}


## Statistics

In [8]:
# Per-column summary: typing information (physical type, logical type,
# semantic tags) followed by statistics such as count, nunique, mean and mode.
dt.describe()

Unnamed: 0,order_id,product_id,description,quantity,order_date,unit_price,customer_name,country,total,cancelled
physical_type,category,category,string,Int64,datetime64[ns],float64,string,category,float64,boolean
logical_type,Categorical,Categorical,NaturalLanguage,WholeNumber,Datetime,Double,FullName,CountryCode,Double,Boolean
semantic_tags,{category},{category},{},{numeric},"{date_of_birth, time_index}",{numeric},{},{category},{numeric},{}
count,100,100,100,100,100,100,100,100,100,100
nunique,14,73,,16,12,26,,2,41,
nan_count,0,0,0,0,0,0,0,0,0,0
mean,,,,13.1,2010-12-01 09:01:37.199999744,5.40606,,,51.5283,
mode,536370,21730,CREAM CUPID HEARTS COAT HANGER,6,2010-12-01 08:45:00,4.2075,Andrea Brown,United Kingdom,25.245,False
std,,,,16.8879,,4.30838,,,74.3198,
min,,,,2,2010-12-01 08:26:00,0.693,,,10.494,


## Retrieve Data

In [9]:
# Get the data back out of the DataTable as a pandas DataFrame and preview
# the first three rows.
dataframe = dt.to_pandas()
dataframe.head(3)

Unnamed: 0,order_product_id,order_id,product_id,description,quantity,order_date,unit_price,customer_name,country,total,cancelled
0,0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,4.2075,Andrea Brown,United Kingdom,25.245,False
1,1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,5.5935,Andrea Brown,United Kingdom,33.561,False
2,2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,4.5375,Andrea Brown,United Kingdom,36.3,False


## 🎉🎉 Thanks to Tamar and Nate!! 🎉🎉

## 🎊🎊 Dylan, Roy, Max, and Steve as well!! 🎊🎊

# [Documentation](https://feature-labs-inc-datatables.readthedocs-hosted.com/en/latest/)
- URL will be **woodwork.alteryx.com** [WIP]

# Next Steps

### Kickoff with EvalML Today

### Future Work

##### Featuretools Single-Table DFS

```python
# Sketch of planned usage (this snippet sits under "Future Work"):
# hand a single DataTable straight to Featuretools' DFS.
import featuretools as ft

dt = ww.DataTable(data)

feature_matrix, features = ft.dfs(datatable=dt)
```

##### Multiple Datatables and Relationships

```python
# Sketch of planned usage: mark product_id with a 'foreign_key' semantic tag,
# presumably so it can link this table to another one (TODO confirm design).
dt.add_semantic_tags({"product_id": "foreign_key"})
```

##### Serialization and Deserialization
```python
# Sketch of planned usage: write the DataTable to disk in several formats.
dt.to_csv(path='output') 
# creates output/data/datatable.csv AND output/datatable.json

# Other planned output formats.
dt.to_pickle()
dt.to_avro() # used in Predictive Server
dt.to_parquet()

```

##### Visualization

```python
# Sketch of planned usage: plot the relationship between the listed columns.
dt.plot(include=['column_1', 'column_2']) # -> show relationship plot based on Logical Types
```

##### Koalas- and Dask-based DataTables

```python
# Sketch of planned usage: build a DataTable from a Dask DataFrame
# rather than a pandas one.
import dask.dataframe as dd

dask_dataframe = dd.read_csv('retail.csv')

dt = ww.DataTable(dask_dataframe)
```