In [1]:
import polars as pl
import sys
# Extend the module search path with ../src before importing dsds
# (presumably where the in-repo dsds package lives -- confirm the repo layout).
sys.path.append('../src')
# NOTE: "ImputationStartegy" (sic) is the identifier as spelled by the library;
# the enum repr in the pipeline summary below uses the same spelling.
from dsds.builder import PipeBuilder, ImputationStartegy, ScalingStrategy
from dsds.transform import EncoderRecord, EncodingStrategy

In [2]:
# Load the advertising data and append banded / synthetic test columns that
# give the pipeline steps below something to act on.
male_is_zero = pl.col("Male") == 0

extra_columns = [
    # Coarse bands derived from numeric columns via floor division.
    ((pl.col("Age") // 10) * 10).alias("Age Band"),
    (pl.col("Daily Internet Usage") // 20).alias("Daily Internet Usage Band"),
    (pl.col("Area Income") // 5000).alias("Area Income Band"),
    # Constant columns (one numeric, one string).
    pl.lit(1).alias("Test_Constant"),
    pl.lit("SSS").alias("Test_Str_Constant"),
    # 0 where Male == 0, null everywhere else.
    pl.when(male_is_zero).then(0).otherwise(None).alias("Test_BadColumn"),
    # Two-valued string column: "A" where Male == 0, "B" otherwise.
    pl.when(male_is_zero).then("A").otherwise("B").alias("Test_Binary"),
]

df = pl.read_csv("../data/advertising.csv").with_columns(extra_columns)
target = "Clicked on Ad"
df.head()

Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad,One_Hot_Test,Age Band,Daily Internet Usage Band,Area Income Band,Test_Constant,Test_Str_Constant,Test_BadColumn,Test_Binary
f64,i64,f64,f64,str,str,i64,str,str,i64,str,i64,f64,f64,i32,str,i32,str
68.95,35,61833.9,256.09,"""Cloned 5thgene…","""Wrightburgh""",0,"""Tunisia""","""3/27/2016 0:53…",0,"""A""",30,12.0,12.0,1,"""SSS""",0.0,"""A"""
80.23,31,68441.85,193.77,"""Monitored nati…","""West Jodi""",1,"""Nauru""","""4/4/2016 1:39""",0,"""B""",30,9.0,13.0,1,"""SSS""",,"""B"""
69.47,26,59785.94,236.5,"""Organic bottom…","""Davidton""",0,"""San Marino""","""3/13/2016 20:3…",0,"""A""",20,11.0,11.0,1,"""SSS""",0.0,"""A"""
74.15,29,54806.18,245.89,"""Triple-buffere…","""West Terrifurt…",1,"""Italy""","""1/10/2016 2:31…",0,"""B""",20,12.0,10.0,1,"""SSS""",,"""B"""
68.37,35,73889.99,225.58,"""Robust logisti…","""South Manuel""",0,"""Iceland""","""6/3/2016 3:36""",0,"""A""",30,11.0,14.0,1,"""SSS""",0.0,"""A"""


In [3]:
# Declare the preprocessing pipeline. The set_* calls only record steps (ten in
# total here); nothing is executed until builder.build() is called.
# The target column reuses the `target` variable defined next to `df`, so the
# column name is written in exactly one place.
builder = (
    PipeBuilder()
    .set_data_and_target(df=df, target=target)
    .set_var_removal(threshold=0.5)
    .set_const_removal()
    .set_binary_encoding()
    # No strategy given: the printed summary reports MEDIAN for this step.
    .set_impute(cols=["Daily Internet Usage", "Daily Internet Usage Band", "Area Income Band"])
    .set_impute(cols=["Area Income"], strategy=ImputationStartegy.MEAN)
    .set_scaling(cols=["Area Income", "Daily Internet Usage"])
    .set_one_hot_encoding(cols=["One_Hot_Test"])
    .set_ordinal_auto_encoding(cols=["City", "Country"])
    .set_col_removal(cols=["Ad Topic Line", "Timestamp"])
    .set_percentile_encoding(cols=["Daily Internet Usage"])
)
    

In [4]:
# Print the builder's text summary: project name, step count, readiness and
# target, followed by one entry per configured step.
print(builder)

Project name: my_project
Total steps: 10 | Ready to build: True | Target variable: Clicked on Ad
--- Step 1: ---
Function: var_removal | Module: dsds.prescreen | Arguments:
{'threshold': 0.5, 'target': 'Clicked on Ad'}
Brief description: Remove columns with less than 0.5 variance. (Not recommended.)
--- Step 2: ---
Function: constant_removal | Module: dsds.prescreen | Arguments:
{'include_null': True}
Brief description: Remove columns that are constants.
--- Step 3: ---
Function: binary_encode | Module: dsds.transform | Arguments:
{'cols': None}
Brief description: Automatically detect binary columns and turn them into [0,1] values by their order.
This step is will fit and transform the data.
--- Step 4: ---
Function: impute | Module: dsds.transform | Arguments:
{'cols': ['Daily Internet Usage', 'Daily Internet Usage Band', 'Area Income Band'], 'strategy': <ImputationStartegy.MEDIAN: 'MEDIAN'>, 'const': 1}
Brief description: Impute using specified the ImputationStartegy.MEDIAN imputatio

In [5]:
# Execute the configured steps; progress is logged per step (see output).
# The resulting frame is kept as the reference for the reproduction check later.
result1 = builder.build()
result1.head()

INFO:dsds.builder:Starting to build. Total steps: 10.
INFO:dsds.builder:|1/10|: Executed Step: var_removal | is_fit: False
INFO:dsds.prescreen:The following columns are dropped because they have lower than 0.5 variance. ['Male', 'Test_Constant', 'Test_BadColumn'].
Removed a total of 3 columns.
INFO:dsds.builder:|1/10|: Finished in 0.00s | Success: True
INFO:dsds.builder:|2/10|: Executed Step: constant_removal | is_fit: False
INFO:dsds.prescreen:The following columns are dropped because they are constants. ['Test_Str_Constant'].
Removed a total of 1 columns.
INFO:dsds.builder:|2/10|: Finished in 0.00s | Success: True
INFO:dsds.builder:|3/10|: Executed Step: binary_encode | is_fit: True
INFO:dsds.transform:Transforming Test_Binary into a binary column with [0, 1] ...
INFO:dsds.builder:|3/10|: Finished in 0.00s | Success: True
INFO:dsds.builder:|4/10|: Executed Step: impute | is_fit: True
INFO:dsds.builder:|4/10|: Finished in 0.00s | Success: True
INFO:dsds.builder:|5/10|: Executed Step: 

Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,City,Country,Clicked on Ad,One_Hot_Test_A,One_Hot_Test_B,One_Hot_Test_C,Age Band,Daily Internet Usage Band,Area Income Band,Test_Binary
f64,i64,f64,u8,u32,u32,i64,u8,u8,u8,i64,f64,f64,u8
68.95,35,0.511893,99,961,215,0,1,0,0,30,12.0,12.0,0
80.23,31,1.005471,57,903,147,0,0,1,0,30,9.0,13.0,1
69.47,26,0.358921,91,111,184,0,1,0,0,20,11.0,11.0,0
74.15,29,-0.01304,95,939,103,0,0,1,0,20,12.0,10.0,1
68.37,35,1.412418,82,805,96,0,1,0,0,30,11.0,14.0,0


In [6]:
# Persist the pipeline. With no name given, the log shows it falls back to the
# project name and saves to ./blueprints/my_project.json.
builder.write()

INFO:dsds.builder:No name is specified, using project name (my_project.json) as default.
INFO:dsds.builder:Successfully saved to ./blueprints/my_project.json.


In [7]:
# List the steps again after building. In the recorded output, steps 3 onward
# now report "Module: N/A" and "Arguments: None" instead of the values printed
# before the build -- presumably fitted steps are stored differently; confirm
# against dsds.builder.
builder.blueprint()

--- Step 1: ---
Function: var_removal | Module: dsds.prescreen | Arguments:
{'threshold': 0.5, 'target': 'Clicked on Ad'}
Brief description: Remove columns with less than 0.5 variance. (Not recommended.)
--- Step 2: ---
Function: constant_removal | Module: dsds.prescreen | Arguments:
{'include_null': True}
Brief description: Remove columns that are constants.
--- Step 3: ---
Function: binary_encode | Module: N/A | Arguments:
None
Brief description: Automatically detect binary columns and turn them into [0,1] values by their order.
This step is will fit and transform the data.
--- Step 4: ---
Function: impute | Module: N/A | Arguments:
None
Brief description: Impute using specified the ImputationStartegy.MEDIAN imputation method.
This step is will fit and transform the data.
--- Step 5: ---
Function: impute | Module: N/A | Arguments:
None
Brief description: Impute using specified the ImputationStartegy.MEAN imputation method.
This step is will fit and transform the data.
--- Step 6: ---

In [8]:
# Reset the builder so the blueprint can be restored from disk below.
builder.clear()

In [9]:
# Sanity check: after clear() the output reads "No step has been set."
builder.blueprint()

No step has been set.


In [10]:
# Restore the pipeline from the JSON file written earlier. Per the log message,
# the builder resets itself before loading.
builder.from_blueprint("./blueprints/my_project.json")

INFO:dsds.builder:Reading from a blueprint. The builder will reset itself.
INFO:dsds.builder:Successfully read from a blueprint.


In [11]:
# List the restored steps; compare with the listing printed before clear().
builder.blueprint()

--- Step 1: ---
Function: var_removal | Module: dsds.prescreen | Arguments:
{'threshold': 0.5, 'target': 'Clicked on Ad'}
Brief description: Remove columns with less than 0.5 variance. (Not recommended.)
--- Step 2: ---
Function: constant_removal | Module: dsds.prescreen | Arguments:
{'include_null': True}
Brief description: Remove columns that are constants.
--- Step 3: ---
Function: binary_encode | Module: N/A | Arguments:
None
Brief description: Automatically detect binary columns and turn them into [0,1] values by their order.
This step is will fit and transform the data.
--- Step 4: ---
Function: impute | Module: N/A | Arguments:
None
Brief description: Impute using specified the ImputationStartegy.MEDIAN imputation method.
This step is will fit and transform the data.
--- Step 5: ---
Function: impute | Module: N/A | Arguments:
None
Brief description: Impute using specified the ImputationStartegy.MEAN imputation method.
This step is will fit and transform the data.
--- Step 6: ---

In [12]:
# Re-read the CSV and rebuild the same derived/test columns as for `df`, so the
# restored blueprint is applied to a frame that has not been through build().
male_is_zero = pl.col("Male") == 0

replay_columns = [
    ((pl.col("Age") // 10) * 10).alias("Age Band"),
    (pl.col("Daily Internet Usage") // 20).alias("Daily Internet Usage Band"),
    (pl.col("Area Income") // 5000).alias("Area Income Band"),
    pl.lit(1).alias("Test_Constant"),
    pl.lit("SSS").alias("Test_Str_Constant"),
    pl.when(male_is_zero).then(0).otherwise(None).alias("Test_BadColumn"),
    pl.when(male_is_zero).then("A").otherwise("B").alias("Test_Binary"),
]

df2 = pl.read_csv("../data/advertising.csv").with_columns(replay_columns)


In [13]:
# Run the pipeline restored from the blueprint on the freshly loaded frame;
# the log shows the same ten steps being performed in order.
result2 = builder.apply(df2)
result2.head()

INFO:dsds.builder:|1/10|: Performing Step: var_removal | is_fit: False
INFO:dsds.prescreen:The following columns are dropped because they have lower than 0.5 variance. ['Male', 'Test_Constant', 'Test_BadColumn'].
Removed a total of 3 columns.
INFO:dsds.builder:|1/10|: Finished in 0.00s | Success: True
INFO:dsds.builder:|2/10|: Performing Step: constant_removal | is_fit: False
INFO:dsds.prescreen:The following columns are dropped because they are constants. ['Test_Str_Constant'].
Removed a total of 1 columns.
INFO:dsds.builder:|2/10|: Finished in 0.00s | Success: True
INFO:dsds.builder:|3/10|: Performing Step: binary_encode | is_fit: True
INFO:dsds.builder:|3/10|: Finished in 0.00s | Success: True
INFO:dsds.builder:|4/10|: Performing Step: impute | is_fit: True
INFO:dsds.builder:|4/10|: Finished in 0.00s | Success: True
INFO:dsds.builder:|5/10|: Performing Step: impute | is_fit: True
INFO:dsds.builder:|5/10|: Finished in 0.00s | Success: True
INFO:dsds.builder:|6/10|: Performing Step: s

Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,City,Country,Clicked on Ad,One_Hot_Test_A,One_Hot_Test_B,One_Hot_Test_C,Age Band,Daily Internet Usage Band,Area Income Band,Test_Binary
f64,i64,f64,u8,i64,i64,i64,u8,u8,u8,i64,f64,f64,i64
68.95,35,0.511893,99,961,215,0,1,0,0,30,12.0,12.0,0
80.23,31,1.005471,57,903,147,0,0,1,0,30,9.0,13.0,1
69.47,26,0.358921,91,111,184,0,1,0,0,20,11.0,11.0,0
74.15,29,-0.01304,95,939,103,0,0,1,0,20,12.0,10.0,1
68.37,35,1.412418,82,805,96,0,1,0,0,30,11.0,14.0,0


In [14]:
# Count element-wise matches per column: every column reports 1000 matches, so
# the values are reproduced. Note the displayed schemas are NOT identical:
# City/Country are u32 in result1 but i64 in result2, and Test_Binary is u8 vs
# i64 -- the blueprint round trip preserves values but not every dtype.
(result1 == result2).sum()

Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,City,Country,Clicked on Ad,One_Hot_Test_A,One_Hot_Test_B,One_Hot_Test_C,Age Band,Daily Internet Usage Band,Area Income Band,Test_Binary
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000


In [15]:
# List the steps once more after apply(); compare with the listing shown right
# after from_blueprint().
builder.blueprint()

--- Step 1: ---
Function: var_removal | Module: dsds.prescreen | Arguments:
{'threshold': 0.5, 'target': 'Clicked on Ad'}
Brief description: Remove columns with less than 0.5 variance. (Not recommended.)
--- Step 2: ---
Function: constant_removal | Module: dsds.prescreen | Arguments:
{'include_null': True}
Brief description: Remove columns that are constants.
--- Step 3: ---
Function: binary_encode | Module: N/A | Arguments:
None
Brief description: Automatically detect binary columns and turn them into [0,1] values by their order.
This step is will fit and transform the data.
--- Step 4: ---
Function: impute | Module: N/A | Arguments:
None
Brief description: Impute using specified the ImputationStartegy.MEDIAN imputation method.
This step is will fit and transform the data.
--- Step 5: ---
Function: impute | Module: N/A | Arguments:
None
Brief description: Impute using specified the ImputationStartegy.MEAN imputation method.
This step is will fit and transform the data.
--- Step 6: ---