In [1]:
import pandas as pd 
import polars as pl
import numpy as np
from sklearn.datasets import fetch_openml
from category_encoders import TargetEncoder

# Columns selected from the raw dataset for this demo.
display_cols = [
    "Id",
    "MSSubClass",
    "MSZoning",
    "LotFrontage",
    "YearBuilt",
    "Heating",
    "CentralAir",
]

In [2]:
# Load the OpenML "house_prices" dataset as pandas objects (as_frame=True).
data = fetch_openml(name="house_prices", as_frame=True, parser="auto")

# Work on an independent copy restricted to the demo columns.
df_pd = data.data[display_cols].copy()

# Exact duplicate of MSZoning stored under a second column name.
df_pd["MSZoning2"] = df_pd["MSZoning"]

# Binary target: 1 where the dataset's target value exceeds 200000, else 0.
# Vectorized comparison instead of a Python-level loop; `.to_numpy()` keeps the
# assignment positional, exactly like assigning a plain list.
df_pd["target"] = (data.target.to_numpy() > 200000).astype("int64")

# The raw bunch is no longer needed once df_pd has been built.
del data

In [3]:
# Categorical columns that both encoders will target-encode.
to_be_encoded = [
    "MSZoning",
    "CentralAir",
    "Heating",
    "MSZoning2",
]

In [4]:
# Show the prepared pandas frame (the rendered output elides the middle rows).
df_pd 

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,YearBuilt,Heating,CentralAir,MSZoning2,target
0,1,60,RL,65.0,2003,GasA,Y,RL,1
1,2,20,RL,80.0,1976,GasA,Y,RL,0
2,3,60,RL,68.0,2001,GasA,Y,RL,1
3,4,70,RL,60.0,1915,GasA,Y,RL,0
4,5,60,RL,84.0,2000,GasA,Y,RL,1
...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,1999,GasA,Y,RL,0
1456,1457,20,RL,85.0,1978,GasA,Y,RL,1
1457,1458,70,RL,66.0,1941,GasA,Y,RL,1
1458,1459,20,RL,68.0,1950,GasA,Y,RL,0


In [5]:
# Reference implementation: fit category_encoders' TargetEncoder on the
# categorical columns against the binary target.
enc = TargetEncoder(
    cols=to_be_encoded,
    min_samples_leaf=20,
    smoothing=10,
).fit(df_pd[to_be_encoded], df_pd["target"])

# Encode the same columns and preview the last ten rows.
enc.transform(df_pd[to_be_encoded]).tail(10)

Unnamed: 0,MSZoning,CentralAir,Heating,MSZoning2
1450,0.333623,0.30989,0.294818,0.333623
1451,0.333623,0.30989,0.294818,0.333623
1452,0.045872,0.30989,0.294818,0.045872
1453,0.333623,0.30989,0.294818,0.333623
1454,0.505328,0.30989,0.294818,0.505328
1455,0.333623,0.30989,0.294818,0.333623
1456,0.333623,0.30989,0.294818,0.333623
1457,0.333623,0.30989,0.294818,0.333623
1458,0.333623,0.30989,0.294818,0.333623
1459,0.333623,0.30989,0.294818,0.333623


In [6]:
import sys
# Add ../src (relative to the working directory) to the module search path so
# the import on the next line can resolve; presumably that is where the local
# `dsds` package lives. This must run before the import.
sys.path.append('../src')
from dsds.transform import smooth_target_encode # Currently this only works for binary targets

In [7]:
# Convert the pandas frame to polars so the polars-based encoder can consume it.
df: pl.DataFrame = pl.from_pandas(df_pd)

# Preview the last ten rows of the polars frame.
df.tail(10)

Id,MSSubClass,MSZoning,LotFrontage,YearBuilt,Heating,CentralAir,MSZoning2,target
i64,i64,str,f64,i64,str,str,str,i64
1451,90,"""RL""",60.0,1974,"""GasA""","""Y""","""RL""",0
1452,20,"""RL""",78.0,2008,"""GasA""","""Y""","""RL""",1
1453,180,"""RM""",35.0,2005,"""GasA""","""Y""","""RM""",0
1454,20,"""RL""",90.0,2006,"""GasA""","""Y""","""RL""",0
1455,20,"""FV""",62.0,2004,"""GasA""","""Y""","""FV""",0
1456,60,"""RL""",62.0,1999,"""GasA""","""Y""","""RL""",0
1457,20,"""RL""",85.0,1978,"""GasA""","""Y""","""RL""",1
1458,70,"""RL""",66.0,1941,"""GasA""","""Y""","""RL""",1
1459,20,"""RL""",68.0,1950,"""GasA""","""Y""","""RL""",0
1460,20,"""RL""",75.0,1965,"""GasA""","""Y""","""RL""",0


In [8]:
# Encode with the polars implementation, using the same smoothing and
# min_samples_leaf values that were passed to TargetEncoder.
df_transf = smooth_target_encode(
    df,
    cols=to_be_encoded,
    target="target",
    smoothing=10,
    min_samples_leaf=20,
)

In [9]:
# Exact equality check: the polars result must match the TargetEncoder output
# for every encoded column.
# NOTE(review): newer polars releases deprecate `frame_equal` in favour of
# `equals`; confirm against the polars version this project pins.
df_transf[to_be_encoded].frame_equal(
    pl.from_pandas(enc.transform(df_pd[to_be_encoded]))
)

True

# Time Comparison

In [10]:
%%timeit 

enc = TargetEncoder(cols=to_be_encoded, min_samples_leaf=20, smoothing=10)\
    .fit(df_pd[to_be_encoded], df_pd["target"])

enc.transform(df_pd[to_be_encoded])

23.9 ms ± 115 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [11]:
%%timeit 
smooth_target_encode(df, cols=to_be_encoded
                     , target="target"
                     , smoothing=10
                     , min_samples_leaf=20
                     , check_binary = False)

76.9 ms ± 13.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [12]:
# On this small dataset (1,460 rows), smooth_target_encode is slower than TargetEncoder.
# Let's repeat the comparison on a bigger, more realistically sized dataset.

In [13]:
# Build a 50x larger frame by stacking df_pd on itself.
# pd.concat already produces a new frame, so no explicit .copy() is needed, and
# ignore_index=True gives unique row labels instead of repeating each original
# label 50 times.
df_pd_bigger = pd.concat([df_pd] * 50, ignore_index=True)
df_pd_bigger.shape

(73000, 9)

In [14]:
# Polars copy of the enlarged frame, used as input for the polars benchmark.
df_pl = pl.from_pandas(df_pd_bigger)
# Confirm the converted frame has the expected dimensions.
df_pl.shape 

(73000, 9)

In [15]:
%%timeit 

enc = TargetEncoder(cols=to_be_encoded, min_samples_leaf=20, smoothing=10)\
    .fit(df_pd_bigger[to_be_encoded], df_pd_bigger["target"])

enc.transform(df_pd_bigger[to_be_encoded])[to_be_encoded]

215 ms ± 1.67 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
%%timeit 
smooth_target_encode(df_pl, cols=to_be_encoded
                     , target="target"
                     , smoothing=10
                     , min_samples_leaf=20
                     , check_binary = False)

78.6 ms ± 1.76 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [17]:
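# On the bigger dataset (73,000 rows), smooth_target_encode is much faster:
# roughly 79 ms vs. 215 ms per run in the timings recorded above.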