In [1]:
import polars as pl
import sys
sys.path.append('../src')
from dsds.builder import PipeBuilder, ImputationStartegy, ScalingStrategy
from dsds.transform import EncoderRecord, EncodingStrategy
from dsds.fs import mutual_info_selector

In [2]:
# Name of the label column.
target = "Clicked on Ad"

# Predicate shared by the two columns derived from "Male".
male_is_zero = pl.col("Male") == 0

# Columns appended to the raw advertising data: three coarse "band" features,
# followed by test columns (two constants, a 0-or-null column and a two-valued
# string column) for the pipeline steps to act on.
extra_columns = [
    ((pl.col("Age") // 10) * 10).alias("Age Band"),
    (pl.col("Daily Internet Usage") // 20).alias("Daily Internet Usage Band"),
    (pl.col("Area Income") // 5000).alias("Area Income Band"),
    pl.lit(1).alias("Test_Constant"),
    pl.lit("SSS").alias("Test_Str_Constant"),
    pl.when(male_is_zero).then(0).otherwise(None).alias("Test_BadColumn"),
    pl.when(male_is_zero).then("A").otherwise("B").alias("Test_Binary"),
]

df = pl.read_csv("../data/advertising.csv").with_columns(extra_columns)
df.head()

Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad,One_Hot_Test,Age Band,Daily Internet Usage Band,Area Income Band,Test_Constant,Test_Str_Constant,Test_BadColumn,Test_Binary
f64,i64,f64,f64,str,str,i64,str,str,i64,str,i64,f64,f64,i32,str,i32,str
68.95,35,61833.9,256.09,"""Cloned 5thgene…","""Wrightburgh""",0,"""Tunisia""","""3/27/2016 0:53…",0,"""A""",30,12.0,12.0,1,"""SSS""",0.0,"""A"""
80.23,31,68441.85,193.77,"""Monitored nati…","""West Jodi""",1,"""Nauru""","""4/4/2016 1:39""",0,"""B""",30,9.0,13.0,1,"""SSS""",,"""B"""
69.47,26,59785.94,236.5,"""Organic bottom…","""Davidton""",0,"""San Marino""","""3/13/2016 20:3…",0,"""A""",20,11.0,11.0,1,"""SSS""",0.0,"""A"""
74.15,29,54806.18,245.89,"""Triple-buffere…","""West Terrifurt…",1,"""Italy""","""1/10/2016 2:31…",0,"""B""",20,12.0,10.0,1,"""SSS""",,"""B"""
68.37,35,73889.99,225.58,"""Robust logisti…","""South Manuel""",0,"""Iceland""","""6/3/2016 3:36""",0,"""A""",30,11.0,14.0,1,"""SSS""",0.0,"""A"""


In [3]:
# This is an example of an automated feature selection pipeline.
# You may add more selectors to create a feature sieve.
#
# The final selector keeps the top 5 numerical features by mutual information,
# together with any other features that mutual information cannot process.
# There is no need to pass the target to the selector because it is already
# provided to the builder in set_data_and_target.

builder = (
    PipeBuilder()
    .set_data_and_target(df=df, target=target)  # reuse the label name defined with df
    .set_var_removal(threshold=0.5)
    .set_const_removal()
    .set_binary_encoding()
    .set_impute(cols=["Daily Internet Usage", "Daily Internet Usage Band", "Area Income Band"])
    .set_impute(cols=["Area Income"], strategy=ImputationStartegy.MEAN)
    .set_scaling(cols=["Area Income", "Daily Internet Usage"])
    .set_one_hot_encoding(cols=["One_Hot_Test"])
    .set_ordinal_auto_encoding(cols=["City", "Country"])
    .set_col_removal(cols=["Ad Topic Line", "Timestamp"])
    .set_percentile_encoding(cols=["Daily Internet Usage"])
    .add_selector(
        mutual_info_selector,
        desc="Use Mutual Info to select top 5 features",
        args={"top_k": 5},
    )
)

In [4]:
# Look up the steps matching "encode"; the output lists the four *_encode steps
# (binary, one-hot, ordinal-auto, percentile) together with their step index.
builder.find("encode")

Step at index 2:
Function: binary_encode | Module: dsds.transform | Arguments:
{'cols': None}
Brief description: Automatically detect binary columns and turn them into [0,1] values by their order.
This step is will fit and transform the data.
Step at index 6:
Function: one_hot_encode | Module: dsds.transform | Arguments:
{'cols': ['One_Hot_Test'], 'separator': '_'}
Brief description: Encode string values of given columns into numbers with inferred ordering.
This step is will fit and transform the data.
Step at index 7:
Function: ordinal_auto_encode | Module: dsds.transform | Arguments:
{'cols': ['City', 'Country'], 'default': None}
Brief description: Encode string values of given columns into numbers with inferred ordering.
This step is will fit and transform the data.
Step at index 9:
Function: percentile_encode | Module: dsds.transform | Arguments:
{'cols': ['Daily Internet Usage']}
Brief description: Encode a continuous column by percentiles.
This step is will fit and transform the 

In [5]:
# print() gives the readable summary: project name, total steps, target, and
# the function/module/arguments/description of every step.
print(builder)

Project name: my_project
Total steps: 11 | Target variable: Clicked on Ad
--- Step 1: ---
Function: var_removal | Module: dsds.prescreen | Arguments:
{'threshold': 0.5, 'target': 'Clicked on Ad'}
Brief description: Remove columns with less than 0.5 variance. (Not recommended.)
--- Step 2: ---
Function: constant_removal | Module: dsds.prescreen | Arguments:
{'include_null': True}
Brief description: Remove columns that are constants.
--- Step 3: ---
Function: binary_encode | Module: dsds.transform | Arguments:
{'cols': None}
Brief description: Automatically detect binary columns and turn them into [0,1] values by their order.
This step is will fit and transform the data.
--- Step 4: ---
Function: impute | Module: dsds.transform | Arguments:
{'cols': ['Daily Internet Usage', 'Daily Internet Usage Band', 'Area Income Band'], 'strategy': <ImputationStartegy.MEDIAN: 'MEDIAN'>, 'const': 1}
Brief description: Impute using specified the ImputationStartegy.MEDIAN imputation method.
This step is 

In [6]:
# Run all 11 steps on the data given to set_data_and_target; the log reports
# each step as it finishes. Keep the transformed frame for later comparison.
result1 = builder.build()
result1.head()

INFO:dsds.builder:Starting to build. Total steps: 11.
INFO:dsds.builder:|1/11|: Step: var_removal | is_fit: False | is_selector: False
INFO:dsds.prescreen:The following columns are dropped because they have lower than 0.5 variance. ['Male', 'Test_Constant', 'Test_BadColumn'].
Removed a total of 3 columns.
INFO:dsds.builder:|1/11|: Finished in 0.00s | Success: True
INFO:dsds.builder:|2/11|: Step: constant_removal | is_fit: False | is_selector: False
INFO:dsds.prescreen:The following columns are dropped because they are constants. ['Test_Str_Constant'].
Removed a total of 1 columns.
INFO:dsds.builder:|2/11|: Finished in 0.00s | Success: True
INFO:dsds.builder:|3/11|: Step: binary_encode | is_fit: True | is_selector: False
INFO:dsds.transform:Transforming Test_Binary into a binary column with [0, 1] ...
INFO:dsds.builder:|3/11|: Finished in 0.00s | Success: True
INFO:dsds.builder:|4/11|: Step: impute | is_fit: True | is_selector: False
INFO:dsds.builder:|4/11|: Finished in 0.00s | Success

Selected 5 features. There are 0 features the algorithm cannot process. They are also returned.


Daily Internet Usage Band,Daily Internet Usage,Daily Time Spent on Site,Area Income,Area Income Band,Clicked on Ad
f64,u8,f64,f64,f64,i64
12.0,99,68.95,0.511893,12.0,0
9.0,57,80.23,1.005471,13.0,0
11.0,91,69.47,0.358921,11.0,0
12.0,95,74.15,-0.01304,10.0,0
11.0,82,68.37,1.412418,14.0,0


In [7]:
# Save the blueprint to disk. No name is given, so (per the log) the project
# name is used and the file lands at ./blueprints/my_project.json.
builder.write()

INFO:dsds.builder:No name is specified, using project name (my_project.json) as default.
INFO:dsds.builder:Successfully saved to ./blueprints/my_project.json.


In [8]:
# Inspect the blueprint after building. Note that the fitted steps
# (binary_encode, impute, ...) now show "Module: N/A" and "Arguments: None"
# instead of the arguments they were configured with.
builder.blueprint()

--- Step 1: ---
Function: var_removal | Module: dsds.prescreen | Arguments:
{'threshold': 0.5, 'target': 'Clicked on Ad'}
Brief description: Remove columns with less than 0.5 variance. (Not recommended.)
--- Step 2: ---
Function: constant_removal | Module: dsds.prescreen | Arguments:
{'include_null': True}
Brief description: Remove columns that are constants.
--- Step 3: ---
Function: binary_encode | Module: N/A | Arguments:
None
Brief description: Automatically detect binary columns and turn them into [0,1] values by their order.
This step is will fit and transform the data.
--- Step 4: ---
Function: impute | Module: N/A | Arguments:
None
Brief description: Impute using specified the ImputationStartegy.MEDIAN imputation method.
This step is will fit and transform the data.
--- Step 5: ---
Function: impute | Module: N/A | Arguments:
None
Brief description: Impute using specified the ImputationStartegy.MEAN imputation method.
This step is will fit and transform the data.
--- Step 6: ---

In [9]:
# Drop every step from the builder; the next cell confirms nothing is left.
builder.clear()

In [10]:
# The builder was just cleared, so this reports that no step has been set.
builder.blueprint()

No step has been set.


In [11]:
# Reinitialize the (now empty) builder from the blueprint saved earlier.
# Per the log, the builder resets itself before reading; the call returns the
# builder, which is why its repr is displayed below.
builder.from_blueprint("./blueprints/my_project.json")

INFO:dsds.builder:Reading from a blueprint. The builder will reset itself.
INFO:dsds.builder:Successfully read from a blueprint.


<dsds.builder.PipeBuilder at 0x1f76dfb6290>

In [12]:
# Confirm that the steps were restored from the saved blueprint.
builder.blueprint()

--- Step 1: ---
Function: var_removal | Module: dsds.prescreen | Arguments:
{'threshold': 0.5, 'target': 'Clicked on Ad'}
Brief description: Remove columns with less than 0.5 variance. (Not recommended.)
--- Step 2: ---
Function: constant_removal | Module: dsds.prescreen | Arguments:
{'include_null': True}
Brief description: Remove columns that are constants.
--- Step 3: ---
Function: binary_encode | Module: N/A | Arguments:
None
Brief description: Automatically detect binary columns and turn them into [0,1] values by their order.
This step is will fit and transform the data.
--- Step 4: ---
Function: impute | Module: N/A | Arguments:
None
Brief description: Impute using specified the ImputationStartegy.MEDIAN imputation method.
This step is will fit and transform the data.
--- Step 5: ---
Function: impute | Module: N/A | Arguments:
None
Brief description: Impute using specified the ImputationStartegy.MEAN imputation method.
This step is will fit and transform the data.
--- Step 6: ---

In [13]:
# Reload the same CSV and rebuild the same derived/test columns as for `df`,
# giving an independent frame on which to replay the loaded blueprint.
raw2 = pl.read_csv("../data/advertising.csv")
male_col = pl.col("Male")
df2 = raw2.with_columns(
    [
        ((pl.col("Age") // 10) * 10).alias("Age Band"),
        (pl.col("Daily Internet Usage") // 20).alias("Daily Internet Usage Band"),
        (pl.col("Area Income") // 5000).alias("Area Income Band"),
        pl.lit(1).alias("Test_Constant"),
        pl.lit("SSS").alias("Test_Str_Constant"),
        pl.when(male_col == 0).then(0).otherwise(None).alias("Test_BadColumn"),
        pl.when(male_col == 0).then("A").otherwise("B").alias("Test_Binary"),
    ]
)


In [14]:
# Replay the pipeline loaded from the blueprint on df2; the log walks through
# the same 11 steps that build() ran.
result2 = builder.apply(df2)
result2.head()

INFO:dsds.builder:|1/11|: Performing Step: var_removal | is_fit: False
INFO:dsds.prescreen:The following columns are dropped because they have lower than 0.5 variance. ['Male', 'Test_Constant', 'Test_BadColumn'].
Removed a total of 3 columns.
INFO:dsds.builder:|1/11|: Finished in 0.00s | Success: True
INFO:dsds.builder:|2/11|: Performing Step: constant_removal | is_fit: False
INFO:dsds.prescreen:The following columns are dropped because they are constants. ['Test_Str_Constant'].
Removed a total of 1 columns.
INFO:dsds.builder:|2/11|: Finished in 0.00s | Success: True
INFO:dsds.builder:|3/11|: Performing Step: binary_encode | is_fit: True
INFO:dsds.builder:|3/11|: Finished in 0.00s | Success: True
INFO:dsds.builder:|4/11|: Performing Step: impute | is_fit: True
INFO:dsds.builder:|4/11|: Finished in 0.00s | Success: True
INFO:dsds.builder:|5/11|: Performing Step: impute | is_fit: True
INFO:dsds.builder:|5/11|: Finished in 0.00s | Success: True
INFO:dsds.builder:|6/11|: Performing Step: s

Daily Internet Usage Band,Daily Internet Usage,Daily Time Spent on Site,Area Income,Area Income Band,Clicked on Ad
f64,u8,f64,f64,f64,i64
12.0,99,68.95,0.511893,12.0,0
9.0,57,80.23,1.005471,13.0,0
11.0,91,69.47,0.358921,11.0,0
12.0,95,74.15,-0.01304,10.0,0
11.0,82,68.37,1.412418,14.0,0


In [15]:
# Perfectly reproduces the result: element-wise equality summed per column
# gives 1000 matches in every column (presumably the full row count; confirm).
(result1 == result2).sum()

Daily Internet Usage Band,Daily Internet Usage,Daily Time Spent on Site,Area Income,Area Income Band,Clicked on Ad
u32,u32,u32,u32,u32,u32
1000,1000,1000,1000,1000,1000


In [16]:
# The blueprint still lists the same steps after being applied to new data.
builder.blueprint()

--- Step 1: ---
Function: var_removal | Module: dsds.prescreen | Arguments:
{'threshold': 0.5, 'target': 'Clicked on Ad'}
Brief description: Remove columns with less than 0.5 variance. (Not recommended.)
--- Step 2: ---
Function: constant_removal | Module: dsds.prescreen | Arguments:
{'include_null': True}
Brief description: Remove columns that are constants.
--- Step 3: ---
Function: binary_encode | Module: N/A | Arguments:
None
Brief description: Automatically detect binary columns and turn them into [0,1] values by their order.
This step is will fit and transform the data.
--- Step 4: ---
Function: impute | Module: N/A | Arguments:
None
Brief description: Impute using specified the ImputationStartegy.MEDIAN imputation method.
This step is will fit and transform the data.
--- Step 5: ---
Function: impute | Module: N/A | Arguments:
None
Brief description: Impute using specified the ImputationStartegy.MEAN imputation method.
This step is will fit and transform the data.
--- Step 6: ---

# How to use the builder with Sklearn

In [17]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import FunctionTransformer
import pandas as pd

In [18]:
# For this to work, the builder must already hold a built (fitted) pipeline;
# here it was reloaded from the saved blueprint.
# Pickling this pipeline does not work at the moment. Until a workaround is
# found, the user has to re-instantiate the builder outside the sklearn pipeline.
# The builder is plugged in by wrapping its apply method in a FunctionTransformer.
pipe = Pipeline([('dsds_sklearn_pipe', FunctionTransformer(builder.apply))])

In [19]:
# Display the sklearn pipeline that wraps the builder.
pipe

In [20]:
# Build the same input frame once more and hand it over as a pandas DataFrame,
# which is what gets passed into the sklearn pipeline below.
data = (
    pl.read_csv("../data/advertising.csv")
    .with_columns(
        [
            ((pl.col("Age") // 10) * 10).alias("Age Band"),
            (pl.col("Daily Internet Usage") // 20).alias("Daily Internet Usage Band"),
            (pl.col("Area Income") // 5000).alias("Area Income Band"),
            pl.lit(1).alias("Test_Constant"),
            pl.lit("SSS").alias("Test_Str_Constant"),
            pl.when(pl.col("Male") == 0).then(0).otherwise(None).alias("Test_BadColumn"),
            pl.when(pl.col("Male") == 0).then("A").otherwise("B").alias("Test_Binary"),
        ]
    )
    # Turn incoming data to Pandas
    .to_pandas()
)


In [21]:
# The only pipeline stage is FunctionTransformer(builder.apply), so the log
# below is the same step-by-step log produced by calling builder.apply directly.
output = pipe.fit_transform(data)

INFO:dsds.builder:|1/11|: Performing Step: var_removal | is_fit: False
INFO:dsds.prescreen:The following columns are dropped because they have lower than 0.5 variance. ['Male', 'Test_Constant', 'Test_BadColumn'].
Removed a total of 3 columns.
INFO:dsds.builder:|1/11|: Finished in 0.00s | Success: True
INFO:dsds.builder:|2/11|: Performing Step: constant_removal | is_fit: False
INFO:dsds.prescreen:The following columns are dropped because they are constants. ['Test_Str_Constant'].
Removed a total of 1 columns.
INFO:dsds.builder:|2/11|: Finished in 0.00s | Success: True
INFO:dsds.builder:|3/11|: Performing Step: binary_encode | is_fit: True
INFO:dsds.builder:|3/11|: Finished in 0.00s | Success: True
INFO:dsds.builder:|4/11|: Performing Step: impute | is_fit: True
INFO:dsds.builder:|4/11|: Finished in 0.00s | Success: True
INFO:dsds.builder:|5/11|: Performing Step: impute | is_fit: True
INFO:dsds.builder:|5/11|: Finished in 0.00s | Success: True
INFO:dsds.builder:|6/11|: Performing Step: s

In [22]:
# Same per-column equality count as for result2: every column reports 1000
# matches against result1.
(result1 == output).sum() # Exact outputs

Daily Internet Usage Band,Daily Internet Usage,Daily Time Spent on Site,Area Income,Area Income Band,Clicked on Ad
u32,u32,u32,u32,u32,u32
1000,1000,1000,1000,1000,1000
