In [1]:
from straw_machine.util import generate_estimator, generate_transformer, generate_pipeline
import pandas as pd

### 0. Generate a dataframe and a UDF function

In [2]:
# Sample input: a small frame with one integer column named 'a'
df = pd.DataFrame([1, 2, 3], columns=['a'])
df

Unnamed: 0,a
0,1
1,2
2,3


In [3]:
## function: add num to a single-column df and store the result in a new column
def add(df: pd.DataFrame, num: int, outcol: str) -> pd.DataFrame:
    """Return a copy of ``df`` with an extra column ``outcol`` equal to ``df + num``.

    Args:
        df: input dataframe; expected to hold a single column, because the
            shifted frame is assigned to the one column ``outcol`` (pandas
            rejects assigning a multi-column frame to a single column).
        num: value added to every element of ``df``.
        outcol: name of the column that receives the result.

    Returns:
        A new dataframe with the original column plus ``outcol``. The
        dataframe passed in by the caller is left untouched.
    """
    # Work on a copy so the caller's frame (which may already have been
    # displayed in an earlier cell) is not silently modified; this also keeps
    # the cell idempotent on re-run.
    result = df.copy()
    result[outcol] = df + num
    return result

**Notable:**

    1. The first argument of the UDF must be the dataframe.
    2. The UDF must return the transformed dataframe.

### 1. Generate one or many estimators from the UDF function

In [13]:
# estimator 1:
#   takes column ['a'] of the dataframe as its inputs
#   runs add with num=1 on those inputs and yields the output columns
#   ['a', 'a+1'] ('a' is the original column)
estimator1 = generate_estimator(
    name='add1',
    func=add,
    inputs=['a'],
    outputs=['a', 'a+1'],
    kw_args=dict(num=1, outcol='a+1'),
)

# estimator 2:
#   takes column ['a'] of the dataframe as its inputs
#   runs add with num=2 on those inputs and yields the output column ['a+2']
estimator2 = generate_estimator(
    name='add2',
    func=add,
    inputs=['a'],
    outputs=['a+2'],
    kw_args=dict(num=2, outcol='a+2'),
)

**Notable**

    1. You can generate different estimators from the same UDF by passing different inputs and kw_args.
    

### 2. Generate transformer from one or many estimators

In [15]:
# Build a transformer from a name and a list of estimators.
##      name: the transformer's name
##      estimators: the list of estimators (their order is not important)
##      remain_other: whether to keep the other columns that are not passed into any estimator
t1 = generate_transformer(
    estimators=[estimator1, estimator2],
    name='trans1',
    remain_other=True,
)
t1

In [16]:
# Pass a one-element list holding t1 to generate_pipeline; the result is only displayed, not stored.
generate_pipeline([t1])

**Notable**

    1. Different estimators in the same transformer are applied in parallel.
    2. You can call fit_transform to get the output dataframe.

In [12]:
# Apply transformer t1 (estimators add1 and add2) to df and display the result.
# NOTE(review): the recorded output below shows only 'a+1' and 'a+2', although
# estimator1 lists 'a' in its outputs and the execution counts are out of
# order — the output may be stale; re-run on a fresh kernel to confirm.
t1.fit_transform(df)

Unnamed: 0,a+1,a+2
0,2,3
1,3,4
2,4,5


### 3. Generate pipeline by stacking many transformers

In [8]:
# Second UDF:
#      scales every input column by num; the output columns keep their names
def mul(df, num: int):
    """Return ``df`` multiplied element-wise by ``num``."""
    return df * num

# estimator 3: run mul with num=2 on columns 'a', 'a+1' and 'a+2',
# writing the results back under the same column names
all_mul_estimator3 = generate_estimator(
    name='all_mul',
    func=mul,
    inputs=['a', 'a+1', 'a+2'],
    outputs=['a', 'a+1', 'a+2'],
    kw_args=dict(num=2),
)

# transformer 2: contains only the multiply estimator
t2 = generate_transformer(
    estimators=[all_mul_estimator3],
    name='trans2',
)
t2

#### 3.1 stacking transformers with simply + (add) function

In [9]:
# Stack the two transformers with the + operator (__add__);
# the last line displays the resulting object.
pl = t1+t2
pl

#### 3.2 stacking transformers with function generate_pipeline


In [10]:
# Stack the same two transformers with generate_pipeline instead of +;
# this rebinds pl, and the last line displays it.
pl = generate_pipeline([t1,t2])
pl

In [11]:
# Apply the pipeline to df: t1 first (add 1 and add 2), then t2 (multiply by 2)
pl.fit_transform(df)

Unnamed: 0,a,a+1,a+2
0,2,4,6
1,4,6,8
2,6,8,10


**Notable**

    1. The transformers in a pipeline are not applied in parallel; they are applied in sequence.
    2. If a transformer changes the names of its output columns, make sure the inputs & outputs of the estimators are set accordingly.

In [12]:
# To apply t2 (*2) to the pipeline a second time, you only need to:
#     1. give it another name
#     2. add it to the pipeline
# NOTE(review): the return value of rename is discarded, so it presumably
# renames t2 in place (the same object is already part of pl) — confirm.
t2.rename('trans3')
pl2 = (pl+t2)
pl2

In [13]:
# Apply the extended pipeline (pl followed by the renamed t2) to df and display the result.
pl2.fit_transform(df)

Unnamed: 0,a,a+1,a+2
0,4,8,12
1,8,12,16
2,12,16,20


### Save pipeline / transformer

If you want to reuse a pipeline after putting in the effort to build a complicated one, all you need to do is save it to disk.

In [18]:
# Persist the whole pipeline pl2 to ./demo.pkl (path is relative to the working directory).
pl2.save('./demo.pkl')

In [19]:
# Next time, load the saved pipeline back with joblib.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load files that come from a trusted source.
import joblib
loadpl = joblib.load('./demo.pkl')
loadpl

In [20]:
# Apply the reloaded pipeline to df and display the result.
loadpl.fit_transform(df)

Unnamed: 0,a,a+1,a+2
0,4,8,12
1,8,12,16
2,12,16,20


**Notable**

     1. Saving the pipeline makes the deployment code for it cleaner and simpler.