In [1]:
import polars as pl
import sys
# Add ../src to the module search path so the project-local `eda` package can be imported below.
sys.path.append('../src')
# NOTE(review): `ImputationStartegy` is spelled exactly as imported here; it has to match the name
# exported by eda.eda_builder, so do not "correct" it in the notebook alone.
from eda.eda_builder import TransformationBuilder, ImputationStartegy, ScalingStrategy

In [2]:
# Name of the label column in the advertising data.
target = "Clicked on Ad"

# Predicate shared by the two synthetic "Male"-derived test columns below.
male_is_zero = pl.col("Male") == 0

# Extra columns appended to the raw data: three banded numeric features plus
# synthetic Test_* columns that give the pipeline steps something to act on.
extra_columns = [
    # Age floored to the decade (35 -> 30).
    ((pl.col("Age") // 10) * 10).alias("Age Band"),
    # Bucket index: floor(value / bucket width).
    (pl.col("Daily Internet Usage") // 20).alias("Daily Internet Usage Band"),
    (pl.col("Area Income") // 5000).alias("Area Income Band"),
    # Same literal in every row (one numeric, one string).
    pl.lit(1).alias("Test_Constant"),
    pl.lit("SSS").alias("Test_Str_Constant"),
    # 0 where Male == 0, null everywhere else.
    pl.when(male_is_zero).then(0).otherwise(None).alias("Test_BadColumn"),
    # Two-valued string column: "A" where Male == 0, otherwise "B".
    pl.when(male_is_zero).then("A").otherwise("B").alias("Test_Binary"),
]

raw = pl.read_csv("../data/advertising.csv")
df = raw.with_columns(extra_columns)
df.head()

Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad,One_Hot_Test,Age Band,Daily Internet Usage Band,Area Income Band,Test_Constant,Test_Str_Constant,Test_BadColumn,Test_Binary
f64,i64,f64,f64,str,str,i64,str,str,i64,str,i64,f64,f64,i32,str,i32,str
68.95,35,61833.9,256.09,"""Cloned 5thgene…","""Wrightburgh""",0,"""Tunisia""","""3/27/2016 0:53…",0,"""A""",30,12.0,12.0,1,"""SSS""",0.0,"""A"""
80.23,31,68441.85,193.77,"""Monitored nati…","""West Jodi""",1,"""Nauru""","""4/4/2016 1:39""",0,"""B""",30,9.0,13.0,1,"""SSS""",,"""B"""
69.47,26,59785.94,236.5,"""Organic bottom…","""Davidton""",0,"""San Marino""","""3/13/2016 20:3…",0,"""A""",20,11.0,11.0,1,"""SSS""",0.0,"""A"""
74.15,29,54806.18,245.89,"""Triple-buffere…","""West Terrifurt…",1,"""Italy""","""1/10/2016 2:31…",0,"""B""",20,12.0,10.0,1,"""SSS""",,"""B"""
68.37,35,73889.99,225.58,"""Robust logisti…","""South Manuel""",0,"""Iceland""","""6/3/2016 3:36""",0,"""A""",30,11.0,14.0,1,"""SSS""",0.0,"""A"""


In [3]:
# Configure the preprocessing pipeline. Steps are executed by build() in the
# order they are registered here (see the |k/9| log lines of the build cell).
# The chain is wrapped in parentheses instead of backslash continuations, so no
# dangling `\` is left after the last call, and the label column comes from
# `target` (defined in the data-loading cell) rather than a repeated literal.
builder = (
    TransformationBuilder(target=target)
    .set_var_removal(threshold=0.5)
    .set_const_removal()
    .set_binary_encoding()
    # No strategy given: the builder's default imputation strategy applies.
    .set_impute(cols=["Daily Internet Usage", "Daily Internet Usage Band", "Area Income Band"])
    .set_impute(cols=["Area Income"], strategy=ImputationStartegy.MEAN)
    .set_scaling(cols=["Area Income", "Daily Internet Usage"])
    .set_one_hot_encoding(cols=["One_Hot_Test"])
    .set_ordinal_auto_encoding(cols=["City", "Country"])
    .set_col_removal(cols=["Ad Topic Line", "Timestamp"])
)

In [4]:
# Run every configured step against df, in registration order. The resulting frame is
# only displayed as this cell's output; it is not assigned to a variable.
builder.build(df)

INFO:eda.eda_builder:Starting to build. Total steps: 9.
INFO:eda.eda_builder:|1/9|: Step: var_removal
INFO:eda.eda_prescreen:The following columns are dropped because they have lower than 0.5 variance. ['Male', 'Test_Constant', 'Test_BadColumn'].
Removed a total of 3 columns.
INFO:eda.eda_builder:|2/9|: Step: constant_removal
INFO:eda.eda_prescreen:The following columns are dropped because they are constants. ['Test_Str_Constant'].
Removed a total of 1 columns.
INFO:eda.eda_builder:|3/9|: Step: binary_encode
INFO:eda.eda_transformations:Transforming Clicked on Ad into a binary column with [0, 1] ...
INFO:eda.eda_transformations:Transforming Test_Binary into a binary column with [0, 1] ...
INFO:eda.eda_builder:|4/9|: Step: impute
INFO:eda.eda_builder:|5/9|: Step: impute
INFO:eda.eda_builder:|6/9|: Step: scale
INFO:eda.eda_builder:|7/9|: Step: one_hot_encode
INFO:eda.eda_builder:|8/9|: Step: ordinal_auto_encode
INFO:eda.eda_builder:|9/9|: Step: remove_if_exists
INFO:eda.eda_prescreen:The

Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,City,Country,Clicked on Ad,One_Hot_Test_A,One_Hot_Test_B,One_Hot_Test_C,Age Band,Daily Internet Usage Band,Area Income Band,Test_Binary
f64,i64,f64,f64,u32,u32,u8,u8,u8,u8,i64,f64,f64,u8
68.95,35,0.511893,1.74127,961,215,0,1,0,0,30,12.0,12.0,0
80.23,31,1.005471,0.313948,903,147,0,0,1,0,30,9.0,13.0,1
69.47,26,0.358921,1.292598,111,184,0,1,0,0,20,11.0,11.0,0
74.15,29,-0.01304,1.507658,939,103,0,0,1,0,20,12.0,10.0,1
68.37,35,1.412418,1.042496,805,96,0,1,0,0,30,11.0,14.0,0
59.99,23,0.3571,1.069064,282,158,0,0,1,0,20,11.0,11.0,1
88.91,33,-1.0869e-15,0.648105,46,145,0,1,0,0,30,10.0,11.0,0
66.0,48,-2.269777,-1.106273,671,12,1,1,0,0,40,6.0,4.0,1
74.53,30,1.036854,0.94928,884,82,0,1,0,0,30,11.0,13.0,1
69.88,20,0.049415,0.086062,712,78,0,1,0,0,20,9.0,11.0,1


In [5]:
# Called without arguments; according to the log line emitted by this cell, the project
# name is then used as the default name.
# NOTE(review): what exactly is written, and where, is not visible from this notebook —
# confirm in eda.eda_builder.
builder.write()

INFO:eda.eda_builder:No name is specified, using project name as default.


In [6]:
# Bare expression: the notebook displays the repr of the returned string, so the
# line breaks in the blueprint show up as literal "\n" escapes.
builder.blueprint()

"--- Step 1: ---\nFunction: var_removal | Module: eda.eda_prescreen | Arguments:\n{'threshold': 0.5, 'target': 'Clicked on Ad'}\nBrief description: Remove columns with less than 0.5 variance. (Not recommended.)\n--- Step 2: ---\nFunction: constant_removal | Module: eda.eda_prescreen | Arguments:\n{'include_null': True}\nBrief description: Remove columns that are constants.\n--- Step 3: ---\nFunction: binary_encode | Module: None | Arguments:\nNone\nBrief description: \nThis step is a transformation step.\n--- Step 4: ---\nFunction: impute | Module: None | Arguments:\nNone\nBrief description: \nThis step is a transformation step.\n--- Step 5: ---\nFunction: impute | Module: None | Arguments:\nNone\nBrief description: \nThis step is a transformation step.\n--- Step 6: ---\nFunction: scale | Module: None | Arguments:\nNone\nBrief description: \nThis step is a transformation step.\n--- Step 7: ---\nFunction: one_hot_encode | Module: None | Arguments:\nNone\nBrief description: \nThis step i

In [7]:
# print() renders the line breaks in the blueprint string, giving one readable
# section per pipeline step.
print(builder.blueprint())

--- Step 1: ---
Function: var_removal | Module: eda.eda_prescreen | Arguments:
{'threshold': 0.5, 'target': 'Clicked on Ad'}
Brief description: Remove columns with less than 0.5 variance. (Not recommended.)
--- Step 2: ---
Function: constant_removal | Module: eda.eda_prescreen | Arguments:
{'include_null': True}
Brief description: Remove columns that are constants.
--- Step 3: ---
Function: binary_encode | Module: None | Arguments:
None
Brief description: 
This step is a transformation step.
--- Step 4: ---
Function: impute | Module: None | Arguments:
None
Brief description: 
This step is a transformation step.
--- Step 5: ---
Function: impute | Module: None | Arguments:
None
Brief description: 
This step is a transformation step.
--- Step 6: ---
Function: scale | Module: None | Arguments:
None
Brief description: 
This step is a transformation step.
--- Step 7: ---
Function: one_hot_encode | Module: None | Arguments:
None
Brief description: 
This step is a transformation step.
--- Step