In [1]:
import pandas as pd

## Import Accessor (the import runs the decorator that registers the `ww` accessor)

In [2]:
import woodwork.accessor

## Create DataFrame

In [3]:
# Load only the first 100 rows of the CSV.
df = pd.read_csv('retail_data.csv', nrows=100)
# Preview the first five rows.
df.head(5)

Unnamed: 0,order_id,product_id,description,quantity,order_date,unit_price,customer_name,country,total,cancelled
0,536588,22666,RECIPE BOX PANTRY YELLOW DESIGN,12,2010-12-01 16:49:00,4.8675,Emily Carrillo,United Kingdom,58.41,False
1,536732,22910,PAPER CHAIN KIT VINTAGE CHRISTMAS,6,2010-12-02 12:47:00,4.8675,Tyler Rose,United Kingdom,29.205,False
2,536733,84375,SET OF 20 KIDS COOKIE CUTTERS,12,2010-12-02 12:50:00,3.465,Jacqueline Garcia,United Kingdom,41.58,False
3,536403,22867,HAND WARMER BIRD DESIGN,96,2010-12-01 11:27:00,3.0525,Shawn Carson,Netherlands,293.04,False
4,536753,22632,HAND WARMER RED POLKA DOT,6,2010-12-02 14:07:00,3.0525,Andrea Brown,United Kingdom,18.315,False


### DataFrame Dtypes

In [4]:
df.dtypes

order_id           int64
product_id        object
description       object
quantity           int64
order_date        object
unit_price       float64
customer_name     object
country           object
total            float64
cancelled           bool
dtype: object

## Init Woodwork Accessor

In [5]:
df.ww.init()

__init__ DataTableAccessor


## DataFrame has inferred Logical Types and Semantic Tags

In [6]:
df.ww.types

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Data Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
order_id,Int64,Integer,['numeric']
product_id,category,Categorical,['category']
description,string,NaturalLanguage,[]
quantity,Int64,Integer,['numeric']
order_date,datetime64[ns],Datetime,[]
unit_price,float64,Double,['numeric']
customer_name,string,NaturalLanguage,[]
country,string,NaturalLanguage,[]
total,float64,Double,['numeric']
cancelled,boolean,Boolean,[]


In [7]:
df.dtypes

order_id                  Int64
product_id             category
description              string
quantity                  Int64
order_date       datetime64[ns]
unit_price              float64
customer_name            string
country                  string
total                   float64
cancelled               boolean
dtype: object

## Set Initial DataTable Attributes (time index, name, logical types, create index)

In [8]:
# Initialize the accessor again, this time passing explicit table attributes,
# then display the resulting typing information.
df.ww.init(
    name='retail_data',
    # 'order_product_id' is not a column of the CSV; make_index=True is passed
    # alongside it (presumably so the index column gets created -- the output
    # below lists it with the 'index' tag).
    index='order_product_id',
    make_index=True,
    time_index='order_date',
    # Override the logical type for quantity.
    logical_types={'quantity': 'Double'},
)
df.ww.types

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Data Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
order_product_id,Int64,Integer,['index']
order_id,Int64,Integer,['numeric']
product_id,category,Categorical,['category']
description,string,NaturalLanguage,[]
quantity,float64,Double,['numeric']
order_date,datetime64[ns],Datetime,['time_index']
unit_price,float64,Double,['numeric']
customer_name,string,NaturalLanguage,[]
country,string,NaturalLanguage,[]
total,float64,Double,['numeric']


## Set Time Index (after reading DataFrame)

In [9]:
# Re-read the CSV so this example works on a newly created DataFrame.
df = pd.read_csv('retail_data.csv', nrows=100)

# The return value of set_time_index is stored back on the accessor's `dt`
# attribute. NOTE(review): presumably it returns a new table object rather
# than updating in place -- confirm against the accessor implementation.
df.ww.dt = df.ww.set_time_index('order_date')
df.ww.types

__init__ DataTableAccessor


Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Data Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
order_id,Int64,Integer,['numeric']
product_id,category,Categorical,['category']
description,string,NaturalLanguage,[]
quantity,Int64,Integer,['numeric']
order_date,datetime64[ns],Datetime,['time_index']
unit_price,float64,Double,['numeric']
customer_name,string,NaturalLanguage,[]
country,string,NaturalLanguage,[]
total,float64,Double,['numeric']
cancelled,boolean,Boolean,[]


## Set Semantic Tags (after reading DataFrame)

In [10]:
# Re-reading the CSV replaces the frame from the previous cell, so the time
# index set there is not present here (order_date shows only 'tag_1' below).
df = pd.read_csv('retail_data.csv', nrows=100)

# Maps column name -> tag to add; the result is stored on the accessor's `dt`
# attribute, which the select() cells that follow read from.
df.ww.dt = df.ww.add_semantic_tags({'order_date': 'tag_1'})
df.ww.types

__init__ DataTableAccessor


Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Data Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
order_id,Int64,Integer,['numeric']
product_id,category,Categorical,['category']
description,string,NaturalLanguage,[]
quantity,Int64,Integer,['numeric']
order_date,datetime64[ns],Datetime,['tag_1']
unit_price,float64,Double,['numeric']
customer_name,string,NaturalLanguage,[]
country,string,NaturalLanguage,[]
total,float64,Double,['numeric']
cancelled,boolean,Boolean,[]


## Select Based on Tags / Logical Types

In [11]:
df.ww.dt.select('tag_1')

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Data Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
order_date,datetime64[ns],Datetime,['tag_1']


In [12]:
df.ww.dt.select(['Categorical', 'NaturalLanguage'])

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Data Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
product_id,category,Categorical,['category']
description,string,NaturalLanguage,[]
customer_name,string,NaturalLanguage,[]
country,string,NaturalLanguage,[]


## Select with normal DataFrame iloc 

In [13]:
df.iloc[[0, 1]]

Unnamed: 0,order_id,product_id,description,quantity,order_date,unit_price,customer_name,country,total,cancelled
0,536588,22666,RECIPE BOX PANTRY YELLOW DESIGN,12,2010-12-01 16:49:00,4.8675,Emily Carrillo,United Kingdom,58.41,False
1,536732,22910,PAPER CHAIN KIT VINTAGE CHRISTMAS,6,2010-12-02 12:47:00,4.8675,Tyler Rose,United Kingdom,29.205,False


## Sample with normal DataFrame sample

In [14]:
# NOTE: no random_state is passed, so a different set of 5 rows is drawn on
# each run and the table shown below will not reproduce exactly.
df_sampled = df.sample(5)
print(df_sampled.shape)
df_sampled

(5, 10)


Unnamed: 0,order_id,product_id,description,quantity,order_date,unit_price,customer_name,country,total,cancelled
1,536732,22910,PAPER CHAIN KIT VINTAGE CHRISTMAS,6,2010-12-02 12:47:00,4.8675,Tyler Rose,United Kingdom,29.205,False
3,536403,22867,HAND WARMER BIRD DESIGN,96,2010-12-01 11:27:00,3.0525,Shawn Carson,Netherlands,293.04,False
89,536562,20685,DOORMAT RED RETROSPOT,2,2010-12-01 15:08:00,13.1175,Tammy Washington,United Kingdom,26.235,False
30,536633,22919,HERB MARKER MINT,2,2010-12-02 11:20:00,1.0725,Amy Barr,United Kingdom,2.145,False
55,536595,22041,"RECORD FRAME 7"" SINGLE SIZE",24,2010-12-01 17:24:00,4.2075,Lee Bass,United Kingdom,100.98,False


In [15]:
df_sampled.ww.dt.select(['Categorical'])

__init__ DataTableAccessor


Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Data Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
product_id,category,Categorical,['category']


In [16]:
df_sampled.dtypes

order_id                  Int64
product_id             category
description              string
quantity                  Int64
order_date       datetime64[ns]
unit_price              float64
customer_name            string
country                  string
total                   float64
cancelled               boolean
dtype: object

In [17]:
# NOTE(review): in the output below order_date has no 'tag_1' tag, so the tag
# added to `df` earlier does not appear to carry over to the sampled copy.
df_sampled.ww.types

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Data Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
order_id,Int64,Integer,['numeric']
product_id,category,Categorical,['category']
description,string,NaturalLanguage,[]
quantity,Int64,Integer,['numeric']
order_date,datetime64[ns],Datetime,[]
unit_price,float64,Double,['numeric']
customer_name,string,NaturalLanguage,[]
country,string,NaturalLanguage,[]
total,float64,Double,['numeric']
cancelled,boolean,Boolean,[]


## Serialization

In [18]:
# Despite the method name, this produces a directory called 'test' (its
# contents are listed in the next cell), not a single CSV file.
df.ww.dt.to_csv('test')

In [19]:
import os

# Show what serialization wrote: the top-level directory, then its data folder.
for directory in ('test', 'test/data'):
    print(os.listdir(directory))

['table_description.json', 'data']
['retail_data.csv', 'data.csv']


# To Do - Add Util Functions

```python
def describe(dataframe):
    """
    Args:
        dataframe (pd.DataFrame): DataFrame with woodwork init
    """
```

### End-User Usage

```python
from woodwork import describe 

dataframe_described = describe(df)
```