In [1]:
from dsds.prescreen import (
    remove_if_exists
    , regex_removal
    , var_removal
    , null_removal
    , unique_removal
    , constant_removal
    , date_removal
    , non_numeric_removal
    , get_unique_count
    , get_string_cols
    , get_numeric_cols
)

from dsds.transform import (
    scale
    , impute
    , binary_encode
    , one_hot_encode
    , smooth_target_encode
    , ordinal_encode
    , ordinal_auto_encode
)

from dsds.sample import (
    simple_upsample
)

from dsds.fs import (
    mutual_info_selector
    , mrmr_selector
)

import polars as pl

In [2]:
# Build a lazy query: scan the advertising CSV and sort the rows by "id".
# Nothing is read until .collect() is called on it.
df = pl.scan_csv("../data/advertising.csv").sort(by="id")

In [3]:
# Materialize only the first 10 rows to preview column names, dtypes and sample values.
df.limit(10).collect()

id,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad,One_Hot_Test,Age Band,Daily Internet Usage Band,Area Income Band,Test_Constant,Test_Str_Constant,Test_BadColumn,Test_Binary
i64,f64,i64,f64,f64,str,str,i64,str,str,i64,str,i64,i64,i64,i64,str,i64,str
1,68.95,35,61833.9,256.09,"""Cloned 5thgene…","""Wrightburgh""",0,"""Tunisia""","""3/27/2016 0:53…",0,"""A""",30,12,12.0,1,"""SSS""",0.0,"""A"""
2,80.23,31,68441.85,193.77,"""Monitored nati…","""West Jodi""",1,"""Nauru""","""4/4/2016 1:39""",0,"""B""",30,9,13.0,1,"""SSS""",,"""B"""
3,69.47,26,59785.94,236.5,"""Organic bottom…","""Davidton""",0,"""San Marino""","""3/13/2016 20:3…",0,"""A""",20,11,11.0,1,"""SSS""",0.0,"""A"""
4,74.15,29,54806.18,245.89,"""Triple-buffere…","""West Terrifurt…",1,"""Italy""","""1/10/2016 2:31…",0,"""B""",20,12,10.0,1,"""SSS""",,"""B"""
5,68.37,35,73889.99,225.58,"""Robust logisti…","""South Manuel""",0,"""Iceland""","""6/3/2016 3:36""",0,"""A""",30,11,14.0,1,"""SSS""",0.0,"""A"""
6,59.99,23,59761.56,226.74,"""Sharable clien…","""Jamieberg""",1,"""Norway""","""5/19/2016 14:3…",0,"""B""",20,11,11.0,1,"""SSS""",,"""B"""
7,88.91,33,,208.36,"""Enhanced dedic…","""Brandonstad""",0,"""Myanmar""","""1/28/2016 20:5…",0,"""A""",30,10,,1,"""SSS""",0.0,"""A"""
8,66.0,48,24593.33,131.76,"""Reactive local…","""Port Jefferybu…",1,"""Australia""","""3/7/2016 1:40""",1,"""A""",40,6,4.0,1,"""SSS""",,"""B"""
9,74.53,30,68862.0,221.51,"""Configurable c…","""West Colin""",1,"""Grenada""","""4/18/2016 9:33…",0,"""A""",30,11,13.0,1,"""SSS""",,"""B"""
10,69.88,20,55642.32,183.82,"""Mandatory homo…","""Ramirezton""",1,"""Ghana""","""7/11/2016 1:42…",0,"""A""",20,9,11.0,1,"""SSS""",,"""B"""


In [4]:
# Preprocessing + feature-selection pipeline for the "Clicked on Ad" target.
# The steps run in the order written: low-variance removal, encoding, imputation,
# scaling, one-hot encoding, dropping free-text columns, then mutual-info selection.
input_df = df.lazy()
output = (
    input_df
    .pipe(var_removal, threshold=0.5, target="Clicked on Ad")
    .pipe(binary_encode)
    .pipe(ordinal_auto_encode, cols=["City", "Country"])
    .pipe(
        impute,
        cols=["Daily Internet Usage", "Daily Internet Usage Band", "Area Income Band"],
        strategy="median",
    )
    .pipe(impute, cols=["Area Income"], strategy="mean")
    .pipe(scale, cols=["Area Income", "Daily Internet Usage"])
    .pipe(one_hot_encode, cols=["One_Hot_Test"])
    .pipe(remove_if_exists, cols=["Ad Topic Line", "Timestamp"])
    .pipe(mutual_info_selector, target="Clicked on Ad", top_k=12)
)


INFO:dsds.prescreen:The following columns are dropped because they have lower than 0.5 variance. ['Male', 'Test_Constant', 'Test_BadColumn'].
Removed a total of 3 columns.
INFO:dsds.prescreen:The following columns are dropped. ['Ad Topic Line', 'Timestamp'].
Removed a total of 2 columns.
Mutual Info: 100%|██████████| 14/14 [00:00<00:00, 151.75it/s]

Selected 12 features. There are 2 columns the algorithm cannot process. They are also returned.





In [5]:
# Materialize the first 100 rows of the transformed frame to inspect the result.
output.limit(100).collect()

Daily Internet Usage,Daily Internet Usage Band,Daily Time Spent on Site,Age,Age Band,Area Income,Area Income Band,One_Hot_Test_A,id,City,Country,Test_Binary_B,Clicked on Ad,Test_Str_Constant
f64,f64,f64,i64,i64,f64,f64,u8,i64,i64,i64,u8,i64,str
1.74127,12.0,68.95,35,30,0.511893,12.0,1,1,961,215,0,0,"""SSS"""
0.313948,9.0,80.23,31,30,1.005471,13.0,0,2,903,147,1,0,"""SSS"""
1.292598,11.0,69.47,26,20,0.358921,11.0,1,3,111,184,0,0,"""SSS"""
1.507658,12.0,74.15,29,20,-0.01304,10.0,0,4,939,103,1,0,"""SSS"""
1.042496,11.0,68.37,35,30,1.412418,14.0,1,5,805,96,0,0,"""SSS"""
1.069064,11.0,59.99,23,20,0.3571,11.0,0,6,282,158,1,0,"""SSS"""
0.648105,10.0,88.91,33,30,-1.0869e-15,11.0,1,7,46,145,0,0,"""SSS"""
-1.106273,6.0,66.0,48,40,-2.269777,4.0,1,8,671,12,1,1,"""SSS"""
0.94928,11.0,74.53,30,30,1.036854,13.0,1,9,884,82,1,0,"""SSS"""
0.086062,9.0,69.88,20,20,0.049415,11.0,1,10,712,78,1,0,"""SSS"""


In [6]:
# List the Step records (drop / with_column / map_dict ...) attached to this frame's blueprint.
output.blueprint.steps

[Step(action='drop', associated_data=['Male', 'Test_Constant', 'Test_BadColumn']),
 Step(action='with_column', associated_data=[<polars.expr.expr.Expr object at 0x0000021A5D5C2E10>]),
 Step(action='drop', associated_data=['Test_Binary']),
 Step(action='map_dict', associated_data=MapDict(left_col='City', ref={'City': shape: (969,)
 Series: '' [str]
 [
 	"Adamsbury"
 	"Adamside"
 	"Adamsstad"
 	"Alanview"
 	"Alexanderfurt"
 	"Alexanderview"
 	"Alexandrafort"
 	"Alexisland"
 	"Aliciatown"
 	"Alvaradoport"
 	"Alvarezland"
 	"Amandafort"
 	…
 	"Williamsside"
 	"Williamstad"
 	"Wilsonburgh"
 	"Wintersfort"
 	"Wongland"
 	"Wrightburgh"
 	"Wrightview"
 	"Yangside"
 	"Youngburgh"
 	"Youngfort"
 	"Yuton"
 	"Zacharystad"
 	"Zacharyton"
 ], 'to': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 

In [7]:
# Save the blueprint to "blueprint.pickle"; the next cell reads this file back with pickle.
output.blueprint.preserve("blueprint.pickle")

In [8]:
import pickle

# NOTE: only unpickle files you produced yourself -- pickle can execute arbitrary
# code while loading. The context manager guarantees the file handle is closed
# even if unpickling raises.
with open("blueprint.pickle", "rb") as f:
    pipeline = pickle.load(f)

# Re-scan the raw CSV so the restored pipeline is applied to an untouched LazyFrame.
new_lf = pl.scan_csv("../data/advertising.csv").sort(by="id")

In [9]:
# Inspect the steps of the unpickled pipeline; they should mirror output.blueprint.steps.
pipeline.steps 

[Step(action='drop', associated_data=['Male', 'Test_Constant', 'Test_BadColumn']),
 Step(action='with_column', associated_data=[<polars.expr.expr.Expr object at 0x0000021A5E6DF790>]),
 Step(action='drop', associated_data=['Test_Binary']),
 Step(action='map_dict', associated_data=MapDict(left_col='City', ref={'City': shape: (969,)
 Series: '' [str]
 [
 	"Adamsbury"
 	"Adamside"
 	"Adamsstad"
 	"Alanview"
 	"Alexanderfurt"
 	"Alexanderview"
 	"Alexandrafort"
 	"Alexisland"
 	"Aliciatown"
 	"Alvaradoport"
 	"Alvarezland"
 	"Amandafort"
 	…
 	"Williamsside"
 	"Williamstad"
 	"Wilsonburgh"
 	"Wintersfort"
 	"Wongland"
 	"Wrightburgh"
 	"Wrightview"
 	"Yangside"
 	"Youngburgh"
 	"Youngfort"
 	"Yuton"
 	"Zacharystad"
 	"Zacharyton"
 ], 'to': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 

In [10]:
# Replay the restored pipeline on the freshly scanned frame and materialize the result.
res = pipeline.apply(new_lf).collect()
res

Daily Internet Usage,Daily Internet Usage Band,Daily Time Spent on Site,Age,Age Band,Area Income,Area Income Band,One_Hot_Test_A,id,City,Country,Test_Binary_B,Clicked on Ad,Test_Str_Constant
f64,f64,f64,i64,i64,f64,f64,u8,i64,i64,i64,u8,i64,str
1.74127,12.0,68.95,35,30,0.511893,12.0,1,1,961,215,0,0,"""SSS"""
0.313948,9.0,80.23,31,30,1.005471,13.0,0,2,903,147,1,0,"""SSS"""
1.292598,11.0,69.47,26,20,0.358921,11.0,1,3,111,184,0,0,"""SSS"""
1.507658,12.0,74.15,29,20,-0.01304,10.0,0,4,939,103,1,0,"""SSS"""
1.042496,11.0,68.37,35,30,1.412418,14.0,1,5,805,96,0,0,"""SSS"""
1.069064,11.0,59.99,23,20,0.3571,11.0,0,6,282,158,1,0,"""SSS"""
0.648105,10.0,88.91,33,30,-1.0869e-15,11.0,1,7,46,145,0,0,"""SSS"""
-1.106273,6.0,66.0,48,40,-2.269777,4.0,1,8,671,12,1,1,"""SSS"""
0.94928,11.0,74.53,30,30,1.036854,13.0,1,9,884,82,1,0,"""SSS"""
0.086062,9.0,69.88,20,20,0.049415,11.0,1,10,712,78,1,0,"""SSS"""


In [11]:
# Element-wise comparison of the first 100 rows produced by the restored pipeline (res)
# against the original pipeline's output; the sum is the number of matching rows per
# column, so 100 in every column means the two results agree on all compared rows.
(res.limit(100) == output.limit(100).collect()).sum()

Daily Internet Usage,Daily Internet Usage Band,Daily Time Spent on Site,Age,Age Band,Area Income,Area Income Band,One_Hot_Test_A,id,City,Country,Test_Binary_B,Clicked on Ad,Test_Str_Constant
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
100,100,100,100,100,100,100,100,100,100,100,100,100,100


# More Complicated Pipelines That Go Beyond Sklearn

In [12]:
# A more advanced example of an automated feature-selection pipeline.
# Notable points:
# 1. Selectors can be piped one after another to form a feature sieve.
# 2. Upsampling/downsampling can be done inside the pipe.
# The blueprint can still be pickled, and you choose whether the upsampling step is
# persisted in it. By default it is not persisted; persist=True below keeps it.
# This is great for feature selection methods that require more balanced data,
# and the same pipeline can be directly applied to process incoming data for models.

input_df = pl.scan_csv("../data/advertising.csv").sort(by="id")
# Parenthesized chain instead of backslash continuations: the original ended with a
# dangling "\" that only parsed because the following line happened to be a comment.
output = (
    input_df
    .pipe(var_removal, threshold=0.5, target="Clicked on Ad")
    .pipe(constant_removal)
    .pipe(simple_upsample, subgroup=pl.col("One_Hot_Test") != 'A', count=200, persist=True)
    .pipe(binary_encode)
    .pipe(one_hot_encode, cols=["One_Hot_Test"])
    .pipe(impute, cols=["Area Income", "Daily Internet Usage"], strategy="median")
    .pipe(scale, cols=["Area Income", "Daily Internet Usage"])
    .pipe(mrmr_selector, target="Clicked on Ad", top_k=12, strategy="fscore")
    # Optional further sieve step, currently disabled:
    # .pipe(mutual_info_selector, target = "Clicked on Ad", top_k=3)
)

# TODO: debug the nan issue

INFO:dsds.prescreen:The following columns are dropped because they have lower than 0.5 variance. ['Male', 'Test_Constant', 'Test_BadColumn'].
Removed a total of 3 columns.
INFO:dsds.prescreen:The following columns are dropped because they are constants. ['Test_Str_Constant'].
Removed a total of 1 columns.


Running fscore to determine feature relevance...
Found 12 total features to select from. Proceeding to select top 12 features.


MRMR, fscore: 100%|██████████| 12/12 [00:00<00:00, 11992.29it/s]

Output is sorted in order of selection (max relevance min redundancy).
Selected 12 features. There are 5 columns the algorithm cannot process. They are also returned.





In [13]:
# Inspect the recorded steps; the simple_upsample call appears as an 'apply_func' step
# because it was piped with persist=True.
output.blueprint.steps

[Step(action='drop', associated_data=['Male', 'Test_Constant', 'Test_BadColumn']),
 Step(action='drop', associated_data=['Test_Str_Constant']),
 Step(action='apply_func', associated_data={'module': 'dsds.sample', 'name': 'simple_upsample', 'kwargs': {'subgroup': <polars.expr.expr.Expr object at 0x0000021A5E6DC350>, 'count': 200, 'epsilon': 0.01, 'include': None, 'exclude': None, 'positive': False, 'seed': 42}}),
 Step(action='with_column', associated_data=[<polars.expr.expr.Expr object at 0x0000021A5E6E49D0>]),
 Step(action='drop', associated_data=['Test_Binary']),
 Step(action='with_column', associated_data=[<polars.expr.expr.Expr object at 0x0000021A5E6E4FD0>, <polars.expr.expr.Expr object at 0x0000021A5E6E52D0>, <polars.expr.expr.Expr object at 0x0000021A5E6E5350>]),
 Step(action='drop', associated_data=['One_Hot_Test']),
 Step(action='with_column', associated_data=[<polars.expr.expr.Expr object at 0x0000021A5E6DCC10>, <polars.expr.expr.Expr object at 0x0000021A5DA6C190>]),
 Step(ac