In [1]:
import pandas as pd 
import polars as pl
from sklearn.datasets import fetch_openml
from category_encoders import TargetEncoder

# Subset of dataset columns used throughout this notebook.
display_cols = [
    "Id",
    "MSSubClass",
    "MSZoning",
    "LotFrontage",
    "YearBuilt",
    "Heating",
    "CentralAir",
]

In [2]:
# Download the "house_prices" dataset from OpenML as pandas objects and keep
# only the columns of interest.
data = fetch_openml(name="house_prices", as_frame=True, parser="auto")
df_pd = data.data[display_cols].copy()

# Binary target: 1 when the dataset's target value (presumably the sale price)
# exceeds the threshold, else 0. The comparison is vectorized instead of
# looping over the Series in Python; `.to_numpy()` keeps the assignment
# positional, exactly like assigning a plain list.
PRICE_THRESHOLD = 200_000
df_pd["target"] = (data.target > PRICE_THRESHOLD).astype("int64").to_numpy()
del data  # the raw download is no longer needed once df_pd is built

In [3]:
# Categorical columns that both encoders are asked to target-encode.
to_be_encoded = [
    "MSZoning",
    "CentralAir",
    "Heating",
]

In [4]:
# Inspect the prepared pandas frame (selected columns plus the binary target).
df_pd

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,YearBuilt,Heating,CentralAir,target
0,1,60,RL,65.0,2003,GasA,Y,1
1,2,20,RL,80.0,1976,GasA,Y,0
2,3,60,RL,68.0,2001,GasA,Y,1
3,4,70,RL,60.0,1915,GasA,Y,0
4,5,60,RL,84.0,2000,GasA,Y,1
...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,1999,GasA,Y,0
1456,1457,20,RL,85.0,1978,GasA,Y,1
1457,1458,70,RL,66.0,1941,GasA,Y,1
1458,1459,20,RL,68.0,1950,GasA,Y,0


In [5]:
# Baseline: fit category_encoders' TargetEncoder on the pandas frame, then
# show the encoded values of the categorical columns.
enc = TargetEncoder(
    cols=to_be_encoded,
    min_samples_leaf=20,
    smoothing=10,
)
enc.fit(df_pd[display_cols], df_pd["target"])  # fit() returns the encoder itself

enc.transform(df_pd[display_cols])[to_be_encoded]

Unnamed: 0,MSZoning,CentralAir,Heating
0,0.333623,0.30989,0.294818
1,0.333623,0.30989,0.294818
2,0.333623,0.30989,0.294818
3,0.333623,0.30989,0.294818
4,0.333623,0.30989,0.294818
...,...,...,...
1455,0.333623,0.30989,0.294818
1456,0.333623,0.30989,0.294818
1457,0.333623,0.30989,0.294818
1458,0.333623,0.30989,0.294818


In [6]:
# Make the project's `src` directory importable so the local
# `smooth_target_encode` helper can be loaded. The relative path is resolved
# against the kernel's working directory, so this assumes the notebook is run
# from its own folder — TODO confirm.
import sys
sys.path.append('../src')
from eda.eda_transformations import smooth_target_encode

In [7]:
# Convert the pandas frame to a polars DataFrame, the input passed to
# smooth_target_encode in the following cells.
df = pl.from_pandas(df_pd)
df.head()

Id,MSSubClass,MSZoning,LotFrontage,YearBuilt,Heating,CentralAir,target
i64,i64,str,f64,i64,str,str,i64
1,60,"""RL""",65.0,2003,"""GasA""","""Y""",1
2,20,"""RL""",80.0,1976,"""GasA""","""Y""",0
3,60,"""RL""",68.0,2001,"""GasA""","""Y""",1
4,70,"""RL""",60.0,1915,"""GasA""","""Y""",0
5,60,"""RL""",84.0,2000,"""GasA""","""Y""",1


In [8]:
# Encode the same columns with the project's smooth_target_encode on the
# polars frame, using the same smoothing / min_samples_leaf settings as the
# TargetEncoder baseline. The call returns the transformed frame and a mapping.
df_transf, mapping = smooth_target_encode(
    df,
    cat_cols=to_be_encoded,
    target="target",
    smoothing=10,
    min_samples_leaf=20,
)

In [9]:
# Show the encoded columns for comparison with the TargetEncoder output above.
df_transf[to_be_encoded]

MSZoning,CentralAir,Heating
f64,f64,f64
0.333623,0.30989,0.294818
0.333623,0.30989,0.294818
0.333623,0.30989,0.294818
0.333623,0.30989,0.294818
0.333623,0.30989,0.294818
0.333623,0.30989,0.294818
0.333623,0.30989,0.294818
0.333623,0.30989,0.294818
0.226146,0.30989,0.294818
0.333623,0.30989,0.294818


# Time Comparison

In [10]:
%%timeit 

# Time the full fit + transform round trip of the TargetEncoder baseline
# on the small pandas frame.
enc = TargetEncoder(
    cols=to_be_encoded,
    min_samples_leaf=20,
    smoothing=10,
)
enc.fit(df_pd[display_cols], df_pd["target"])  # fit() returns the encoder itself

enc.transform(df_pd[display_cols])[to_be_encoded]

18.6 ms ± 175 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [11]:
%%timeit 
# Time smooth_target_encode on the small polars frame with the same settings.
smooth_target_encode(
    df,
    cat_cols=to_be_encoded,
    target="target",
    smoothing=10,
    min_samples_leaf=20,
)

56 ms ± 429 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [12]:
# On the small dataset, smooth_target_encode is slower than TargetEncoder (56 ms vs. 18.6 ms).
# Let's test it on a bigger, more realistic dataset size.

In [13]:
# Enlarge the dataset by stacking 50 copies of the original frame.
# `pd.concat` already builds a new frame, so no explicit `.copy()` is needed;
# `ignore_index=True` gives the result a fresh 0..n-1 index instead of
# repeating the original row labels 50 times.
df_pd_bigger = pd.concat([df_pd] * 50, ignore_index=True)
df_pd_bigger.shape

(73000, 8)

In [14]:
# Polars copy of the enlarged frame, used as input for the polars timing below.
df_pl = pl.from_pandas(df_pd_bigger)
df_pl.shape 

(73000, 8)

In [15]:
%%timeit 

# Time the full fit + transform round trip of the TargetEncoder baseline
# on the enlarged pandas frame.
enc = TargetEncoder(
    cols=to_be_encoded,
    min_samples_leaf=20,
    smoothing=10,
)
enc.fit(df_pd_bigger[display_cols], df_pd_bigger["target"])  # fit() returns the encoder itself

enc.transform(df_pd_bigger[display_cols])[to_be_encoded]

173 ms ± 3.31 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [16]:
%%timeit 
# Time smooth_target_encode on the enlarged polars frame with the same settings.
smooth_target_encode(
    df_pl,
    cat_cols=to_be_encoded,
    target="target",
    smoothing=10,
    min_samples_leaf=20,
)

53 ms ± 138 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
# NOTE(review): leftover scratch cell — it was never executed (In [None]) and
# its result is not used anywhere in the visible notebook. Presumably `pl.when`
# expects at least one predicate, so this bare call may raise; verify, and
# consider deleting the cell.
pl.when()