In [1]:
from dsds.prescreen import (
    remove_if_exists
    , regex_removal
    , var_removal
    , null_removal
    , unique_removal
    , constant_removal
    , date_removal
    , non_numeric_removal
    , get_unique_count
    , get_string_cols
)

from dsds.transform import (
    scale
    , impute
    , binary_encode
    , one_hot_encode
    , smooth_target_encode
    , ordinal_encode
    , ordinal_auto_encode
)

from dsds.fs import (
    mutual_info_selector
)

import polars as pl
from polars.testing import assert_frame_equal

In [2]:
# Open the advertising dataset lazily; rows are only read when a later cell
# calls `.collect()`.
df = pl.scan_csv("../data/advertising.csv")

In [3]:
# Peek at the first ten rows to check the column names and inferred dtypes.
df.head(10).collect()

Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad,One_Hot_Test,Age Band,Daily Internet Usage Band,Area Income Band,Test_Constant,Test_Str_Constant,Test_BadColumn,Test_Binary
f64,i64,f64,f64,str,str,i64,str,str,i64,str,i64,f64,f64,i64,str,i64,str
68.95,35,61833.9,256.09,"""Cloned 5thgene…","""Wrightburgh""",0,"""Tunisia""","""3/27/2016 0:53…",0,"""A""",30,12.0,12.0,1,"""SSS""",0.0,"""A"""
80.23,31,68441.85,193.77,"""Monitored nati…","""West Jodi""",1,"""Nauru""","""4/4/2016 1:39""",0,"""B""",30,9.0,13.0,1,"""SSS""",,"""B"""
69.47,26,59785.94,236.5,"""Organic bottom…","""Davidton""",0,"""San Marino""","""3/13/2016 20:3…",0,"""A""",20,11.0,11.0,1,"""SSS""",0.0,"""A"""
74.15,29,54806.18,245.89,"""Triple-buffere…","""West Terrifurt…",1,"""Italy""","""1/10/2016 2:31…",0,"""B""",20,12.0,10.0,1,"""SSS""",,"""B"""
68.37,35,73889.99,225.58,"""Robust logisti…","""South Manuel""",0,"""Iceland""","""6/3/2016 3:36""",0,"""A""",30,11.0,14.0,1,"""SSS""",0.0,"""A"""
59.99,23,59761.56,226.74,"""Sharable clien…","""Jamieberg""",1,"""Norway""","""5/19/2016 14:3…",0,"""B""",20,11.0,11.0,1,"""SSS""",,"""B"""
88.91,33,,208.36,"""Enhanced dedic…","""Brandonstad""",0,"""Myanmar""","""1/28/2016 20:5…",0,"""A""",30,10.0,,1,"""SSS""",0.0,"""A"""
66.0,48,24593.33,131.76,"""Reactive local…","""Port Jefferybu…",1,"""Australia""","""3/7/2016 1:40""",1,"""A""",40,6.0,4.0,1,"""SSS""",,"""B"""
74.53,30,68862.0,221.51,"""Configurable c…","""West Colin""",1,"""Grenada""","""4/18/2016 9:33…",0,"""A""",30,11.0,13.0,1,"""SSS""",,"""B"""
69.88,20,55642.32,183.82,"""Mandatory homo…","""Ramirezton""",1,"""Ghana""","""7/11/2016 1:42…",0,"""A""",20,9.0,11.0,1,"""SSS""",,"""B"""


In [4]:
# Assemble the preprocessing "blueprint": a lazy frame built by piping the data
# through dsds prescreen, transform and feature-selection steps, in this order.
# NOTE(review): the CSV is collected and turned lazy again before the first
# step -- presumably so data-dependent steps start from in-memory data; confirm
# this is required.
blueprint = (
    df.collect()
    .lazy()
    # Prescreen: drop low-variance columns (target excluded), then constants.
    .pipe(var_removal, threshold=0.5, target="Clicked on Ad")
    .pipe(constant_removal)
    # Encode categorical columns.
    .pipe(binary_encode)
    .pipe(ordinal_auto_encode, cols=["City", "Country"])
    # Fill missing values: median for the usage/band columns, mean for income.
    .pipe(
        impute,
        cols=["Daily Internet Usage", "Daily Internet Usage Band", "Area Income Band"],
        strategy="median",
    )
    .pipe(impute, cols=["Area Income"], strategy="mean")
    # Scale the two continuous columns after imputation.
    .pipe(scale, cols=["Area Income", "Daily Internet Usage"])
    .pipe(one_hot_encode, cols=["One_Hot_Test"])
    # Drop the free-text and timestamp columns if they are still present.
    .pipe(remove_if_exists, cols=["Ad Topic Line", "Timestamp"])
    # Keep the 12 features ranked highest by mutual information with the target.
    .pipe(mutual_info_selector, target="Clicked on Ad", top_k=12)
)



INFO:dsds.prescreen:The following columns are dropped because they have lower than 0.5 variance. ['Male', 'Test_Constant', 'Test_BadColumn'].
Removed a total of 3 columns.
INFO:dsds.prescreen:The following columns are dropped because they are constants. ['Test_Str_Constant'].
Removed a total of 1 columns.
INFO:dsds.prescreen:The following columns are dropped. ['Timestamp', 'Ad Topic Line'].
Removed a total of 2 columns.
Mutual Info: 100%|██████████| 13/13 [00:00<00:00, 156.59it/s]

Selected 12 features. There are 1 columns the algorithm cannot process. They are also returned.





In [5]:
# Execute the lazy pipeline and display the transformed frame.
blueprint.collect()

Daily Internet Usage,Daily Internet Usage Band,Daily Time Spent on Site,Age Band,Area Income,Area Income Band,Age,One_Hot_Test_B,Country,Test_Binary,One_Hot_Test_C,One_Hot_Test_A,Clicked on Ad
f64,f64,f64,i64,f64,f64,i64,i32,i64,u8,i32,i32,i64
1.74127,12.0,68.95,30,0.511893,12.0,35,0,215,0,0,1,0
0.313948,9.0,80.23,30,1.005471,13.0,31,1,147,1,0,0,0
1.292598,11.0,69.47,20,0.358921,11.0,26,0,184,0,0,1,0
1.507658,12.0,74.15,20,-0.01304,10.0,29,1,103,1,0,0,0
1.042496,11.0,68.37,30,1.412418,14.0,35,0,96,0,0,1,0
1.069064,11.0,59.99,20,0.3571,11.0,23,1,158,1,0,0,0
0.648105,10.0,88.91,30,-1.0869e-15,11.0,33,0,145,0,0,1,0
-1.106273,6.0,66.0,40,-2.269777,4.0,48,0,12,1,0,1,1
0.94928,11.0,74.53,30,1.036854,13.0,30,0,82,1,0,1,0
0.086062,9.0,69.88,20,0.049415,11.0,20,0,78,1,0,1,0


In [6]:
# Persist the blueprint as JSON in pipe.json (relative to the notebook's working
# directory) so it can be rebuilt from the file alone.
blueprint.write_json("pipe.json")

In [7]:
# Read the serialized pipeline back from disk. The context manager closes the
# file even if reading raises, and the explicit encoding avoids depending on the
# platform's default text encoding.
with open("pipe.json", "r", encoding="utf-8") as f:
    json_str = f.read()

# `from_json` is a classmethod that rebuilds a LazyFrame from the JSON text
# alone, so it is called on the class. Chaining it onto a fresh
# `pl.scan_csv(...)` only built a scan that was immediately discarded and
# wrongly suggested the pipeline was being re-applied to that scan.
new_lf = pl.LazyFrame.from_json(json_str)

In [8]:
# Round-trip check: the frame rebuilt from pipe.json should match the original.
restored_df = new_lf.collect()
expected_df = blueprint.collect()

# Exact `==` on floats is too strict here: in the recorded run the two scaled
# columns ("Daily Internet Usage", "Area Income") differ only in their last few
# digits. Assert equality within floating-point tolerance instead, so the cell
# fails loudly on a real discrepancy and passes on round-off.
assert_frame_equal(restored_df, expected_df, check_exact=False)

# Exact element-wise mask, kept as the cell output for inspection.
restored_df == expected_df

Daily Internet Usage,Daily Internet Usage Band,Daily Time Spent on Site,Age Band,Area Income,Area Income Band,Age,One_Hot_Test_B,Country,Test_Binary,One_Hot_Test_C,One_Hot_Test_A,Clicked on Ad
bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool
false,true,true,true,true,true,true,true,true,true,true,true,true
false,true,true,true,true,true,true,true,true,true,true,true,true
false,true,true,true,false,true,true,true,true,true,true,true,true
false,true,true,true,false,true,true,true,true,true,true,true,true
false,true,true,true,false,true,true,true,true,true,true,true,true
false,true,true,true,false,true,true,true,true,true,true,true,true
false,true,true,true,true,true,true,true,true,true,true,true,true
false,true,true,true,false,true,true,true,true,true,true,true,true
false,true,true,true,false,true,true,true,true,true,true,true,true
false,true,true,true,false,true,true,true,true,true,true,true,true


In [13]:
# First 20 scaled "Daily Internet Usage" values from the restored pipeline,
# printed at full precision to expose the round-off.
new_lf.collect().get_column("Daily Internet Usage").head(20).to_list()

[1.7412697431103392,
 0.3139482114806544,
 1.2925979073493394,
 1.5076580675226823,
 1.042495995837848,
 1.0690635981595813,
 0.6481045199928033,
 -1.1062733574596026,
 0.9492803566572823,
 0.08606231225543946,
 -1.3293496045403654,
 1.1636534236671328,
 -1.5331872430433209,
 0.7825457489829544,
 -0.8360167131523129,
 -0.9028937810656426,
 -1.1600956552665629,
 0.07690107007553121,
 -1.4124878773230316,
 -1.0203867120229642]

In [14]:
# The same 20 values from the original blueprint, for a side-by-side comparison
# with the restored pipeline's output.
blueprint.collect().get_column("Daily Internet Usage").head(20).to_list()

[1.7412697431103386,
 0.31394821148065377,
 1.2925979073493388,
 1.5076580675226816,
 1.0424959958378472,
 1.0690635981595806,
 0.6481045199928026,
 -1.1062733574596033,
 0.9492803566572816,
 0.08606231225543881,
 -1.329349604540366,
 1.163653423667132,
 -1.5331872430433215,
 0.7825457489829538,
 -0.8360167131523135,
 -0.9028937810656432,
 -1.1600956552665636,
 0.07690107007553057,
 -1.4124878773230323,
 -1.0203867120229648]

In [11]:
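# The mismatches reported by `==` are floating-point round-off: the "Daily Internet Usage"
# values printed above differ only around the 16th significant digit, so the pipeline
# round-trips correctly up to float precision.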