In [1]:
import polars as pl
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from time import perf_counter


In [2]:
# Synthetic classification data: 100,000 rows, 10 features (5 informative, 5 redundant).
# NOTE(review): no random_state is passed, so the generated data differs between runs.
orig_x, orig_y = make_classification(
    n_samples=100_000,
    n_features=10,
    n_informative=5,
    n_redundant=5,
)
target = "target"
# dsds relies heavily on Polars and works on Polars dataframes; any other dataframe
# format must be converted to Polars before it is handed to dsds.
df = pl.from_numpy(orig_x).insert_at_idx(0, pl.Series(target, orig_y))
# Pandas version of the same data, used for the sklearn side of each comparison.
df_pd = df.to_pandas()
# Every column except the target is a feature.
features = [name for name in df.columns if name != target]

In [3]:
df.head()

target,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9
i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,0.322708,-1.365312,0.207284,0.033649,2.276149,2.031665,-1.918109,-1.421197,-0.930567,-1.314837
1,1.994452,-2.281401,-2.205605,0.764004,-1.842724,1.452908,1.305384,-0.72655,1.32201,0.996695
0,0.114551,1.667566,0.360641,0.443316,-2.956367,-1.976026,2.513012,2.370084,0.644884,1.43369
0,-3.552024,-2.05964,-3.664615,0.96977,6.943418,0.059459,-0.43525,-2.961083,-1.556114,1.082131
0,-0.254937,-0.601362,-0.084797,0.881116,3.100794,0.708959,-0.631499,-0.924934,-0.407681,-1.701868


## Comparisons

This notebook compares the results and performance of the dsds package against sklearn and a few other packages, covering feature selection and other transformations that are common in data science pipelines.

### Methods Compared:
1. Scaling and Imputation
2. Fscore
3. Mutual Information Score
4. MRMR feature selection strategies
5. Power Transform

You may restart the kernel after each section, but remember to rerun the setup cells at the top of the notebook (imports and data creation) first. If you are concerned about memory usage while running this notebook, go to the end and run the GC cell.

# Scaling and Imputation

In [4]:
import dsds.transform as t

In [5]:
# Feature names: every column of df except the target.
features = [name for name in df.columns if name != "target"]

In [6]:
scaled = t.scale(df, cols=features, strategy="standard")
scaled.head()

target,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9
i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,-0.09254,-0.389583,0.107522,0.016957,0.573029,0.446283,-0.697402,-0.632834,-0.533205,-0.500247
1,0.796765,-0.858435,-1.081281,0.464247,-0.611884,0.234959,0.932315,-0.151914,0.753089,0.919369
0,-0.203272,1.162635,0.183079,0.267849,-0.932256,-1.017059,1.542862,1.991961,0.366429,1.187748
0,-2.153755,-0.744939,-1.800119,0.590264,1.915704,-0.273835,0.052294,-1.698935,-0.890413,0.97184
0,-0.399826,0.001404,-0.036384,0.53597,0.810262,-0.036681,-0.046924,-0.28926,-0.23462,-0.73794


In [7]:
from sklearn.preprocessing import StandardScaler

In [8]:
# Any difference from the dsds result comes from the variance estimator: dsds uses the
# sample variance (ddof = 1) while sklearn uses ddof = 0. With 100,000 rows the two
# differ only by a factor of about sqrt(n / (n - 1)), which is invisible at the
# precision displayed here.

# sklearn needs several steps for the same scaling: fit/transform, wrap the resulting
# NumPy array back into a DataFrame, then re-attach the target column.
std = StandardScaler()
scaled2 = pd.DataFrame(
    std.fit_transform(df_pd[features], df_pd[target]),  # fit_transform returns a NumPy array
    columns=features,
)
scaled2[target] = df_pd[target]
scaled2.head()

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,target
0,-0.09254,-0.389583,0.107522,0.016957,0.573029,0.446283,-0.697402,-0.632834,-0.533205,-0.500247,1
1,0.796765,-0.858435,-1.081281,0.464247,-0.611884,0.234959,0.932315,-0.151914,0.753089,0.919369,1
2,-0.203272,1.162635,0.183079,0.267849,-0.932256,-1.017059,1.542862,1.991961,0.366429,1.187748,0
3,-2.153755,-0.744939,-1.800119,0.590264,1.915704,-0.273835,0.052294,-1.698935,-0.890413,0.97184,0
4,-0.399826,0.001404,-0.036384,0.53597,0.810262,-0.036681,-0.046924,-0.28926,-0.23462,-0.73794,0


In [9]:
%%timeit
scaled = t.scale(df, cols=features, strategy="standard")

2.49 ms ± 44.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [10]:
%%timeit
std = StandardScaler()
scaled2 = std.fit_transform(df_pd[features], df_pd[target])
scaled2 = pd.DataFrame(scaled2, columns=features)
scaled2[target] = df_pd[target]


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dty

15.3 ms ± 175 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [11]:
import dsds.transform as t
t.impute(df, cols=features, strategy="median").head(3) 

target,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9
i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,0.322708,-1.365312,0.207284,0.033649,2.276149,2.031665,-1.918109,-1.421197,-0.930567,-1.314837
1,1.994452,-2.281401,-2.205605,0.764004,-1.842724,1.452908,1.305384,-0.72655,1.32201,0.996695
0,0.114551,1.667566,0.360641,0.443316,-2.956367,-1.976026,2.513012,2.370084,0.644884,1.43369


In [12]:
from sklearn.impute import SimpleImputer

# Median imputation with sklearn. fit_transform returns a NumPy array, so the result is
# wrapped back into a DataFrame carrying the original column names.
# NOTE(review): the whole frame (target included) goes through the imputer here, whereas
# the dsds call above is limited to `features` -- keep this in mind when comparing timings.
imputer = SimpleImputer(strategy="median")
imputed = pd.DataFrame(
    data=imputer.fit_transform(df_pd, df_pd[target]),
    columns=df.columns,
)
imputed.head(3)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,target,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9
0,1.0,0.322708,-1.365312,0.207284,0.033649,2.276149,2.031665,-1.918109,-1.421197,-0.930567,-1.314837
1,1.0,1.994452,-2.281401,-2.205605,0.764004,-1.842724,1.452908,1.305384,-0.72655,1.32201,0.996695
2,0.0,0.114551,1.667566,0.360641,0.443316,-2.956367,-1.976026,2.513012,2.370084,0.644884,1.43369


In [13]:
%%timeit
imputer = SimpleImputer(strategy = "median")
imputed = pd.DataFrame(imputer.fit_transform(df_pd, df_pd[target]), columns=df.columns)


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dty

129 ms ± 1.46 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [14]:
%%timeit
t.impute(df, cols=features, strategy="median")

1.17 ms ± 3.89 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


# Fscore

In [15]:
import dsds.fs as fs # fs = feature_selection
from sklearn.feature_selection import mutual_info_classif, f_classif, f_regression

In [16]:
fs._f_score(df, target=target, num_list = features)

array([7.60205007e+03, 1.03499565e+04, 3.83776974e-01, 1.86230775e-01,
       6.17015927e+02, 9.68595220e+03, 8.16841090e+03, 1.33735354e+04,
       1.86739150e+00, 1.02356242e+04])

In [17]:
# The more CPU cores available, the bigger the speed difference. The data here is not big enough to show it.
start = perf_counter()
res = fs.f_classif(df, target=target)
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing Fscore.")
res.head(10)

Spent 0.00s in computing Fscore.


feature,f_value,p_value
str,f64,f64
"""column_0""",7602.050073,0.0
"""column_1""",10349.956513,0.0
"""column_2""",0.383777,0.53559
"""column_3""",0.186231,0.666073
"""column_4""",617.015927,8.6255e-136
"""column_5""",9685.952196,0.0
"""column_6""",8168.410901,0.0
"""column_7""",13373.535375,0.0
"""column_8""",1.867392,0.171777
"""column_9""",10235.624156,0.0


In [18]:
start = perf_counter()
f, pv = f_classif(df_pd[features], df_pd[target])
res = pd.DataFrame({"feature":features, "f_value":f, "p_value":pv})
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing Fscore.")
res.head(10)

Spent 0.02s in computing Fscore.


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,feature,f_value,p_value
0,column_0,7602.050073,0.0
1,column_1,10349.956513,0.0
2,column_2,0.383777,0.5355902
3,column_3,0.186231,0.6660733
4,column_4,617.015927,8.6255e-136
5,column_5,9685.952196,0.0
6,column_6,8168.410901,0.0
7,column_7,13373.535375,0.0
8,column_8,1.867392,0.1717772
9,column_9,10235.624156,0.0


In [19]:
# Same comparison with sklearn's f_regression: on this binary target it produces the same
# F statistics and p-values as f_classif in the cell above.
start = perf_counter()
f, pv = f_regression(df_pd[features], df_pd[target])
res = pd.DataFrame({"feature":features, "f_value":f, "p_value":pv})
end = perf_counter()
# ":.2f" without the space sign flag, so the message is formatted like the other timing cells.
print(f"Spent {end - start:.2f}s in computing Fscore.")
res.head(10)

Spent  0.01s in computing Fscore.


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,feature,f_value,p_value
0,column_0,7602.050073,0.0
1,column_1,10349.956513,0.0
2,column_2,0.383777,0.5355902
3,column_3,0.186231,0.6660733
4,column_4,617.015927,8.6255e-136
5,column_5,9685.952196,0.0
6,column_6,8168.410901,0.0
7,column_7,13373.535375,0.0
8,column_8,1.867392,0.1717772
9,column_9,10235.624156,0.0


# Mutual Information Score

In [20]:
# Vastly faster than sklearn. Finished in 0.7s in this run
fs.mutual_info(df, target=target, conti_cols=features).sort(by="estimated_mi", descending=True).limit(10)

Mutual Info: 100%|██████████| 10/10 [00:00<00:00, 12.89it/s]


feature,estimated_mi
str,f64
"""column_5""",0.134916
"""column_1""",0.1307
"""column_6""",0.093667
"""column_7""",0.065708
"""column_9""",0.058458
"""column_0""",0.050345
"""column_3""",0.034937
"""column_4""",0.024143
"""column_8""",0.01585
"""column_2""",0.010962


In [21]:
# A wrapper for more apples to apples comparison
def estimate_mi_sklearn(df:pd.DataFrame, cols:list[str], target:str, k=3, random_state:int=42):
    """Estimate the mutual information between each column in `cols` and `target` with sklearn.

    Calls sklearn's mutual_info_classif with `n_neighbors=k`, the given `random_state`
    and `discrete_features=False`.

    Returns a Polars DataFrame with the columns "feature" and "estimated_mi", sorted by
    "estimated_mi" in descending order.
    """
    scores = mutual_info_classif(
        df[cols],
        df[target],
        n_neighbors=k,
        random_state=random_state,
        discrete_features=False,
    )
    # Two column-wise records: the feature names and their estimated scores.
    result = pl.from_records([cols, scores], schema=["feature", "estimated_mi"])
    return result.sort("estimated_mi", descending=True)

In [22]:
# sklearn's implementation is slow because it does not turn on multithreading for its KD-trees.
# sklearn also provides no option to turn it on, even though its KD-trees
# do have this functionality. Finished in 4.4s in this run
estimate_mi_sklearn(df_pd, cols=features, target=target).limit(10)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


feature,estimated_mi
str,f64
"""column_5""",0.134915
"""column_1""",0.1307
"""column_6""",0.093669
"""column_7""",0.065709
"""column_9""",0.058458
"""column_0""",0.050345
"""column_3""",0.034933
"""column_4""",0.024143
"""column_8""",0.015851
"""column_2""",0.010965


# MRMR Feature selection Strategy

In [23]:
from mrmr import mrmr_classif # This is currently the most starred MRMR Python package on github

In [24]:
# Need to wrap it so that we get apples to apples comparison
def mrmr_package(df:pd.DataFrame, target:str, k:int) -> list[str]:
    """Run mrmr_classif from the mrmr package on a Pandas frame and report the time taken.

    Every column of `df` other than `target` is used as a candidate feature, and `k` is
    forwarded as mrmr_classif's `K` argument. The elapsed time of the mrmr_classif call
    alone is printed. Returns whatever mrmr_classif returns.
    """
    candidate_cols = df.columns.tolist()
    candidate_cols.remove(target)
    # Build the inputs before starting the clock so that only mrmr_classif is timed.
    X, y = df[candidate_cols], df[target]
    t_begin = perf_counter()
    selected = mrmr_classif(X, y, K = k)
    elapsed = perf_counter() - t_begin
    print(f"Spent {elapsed:.2f}s to compute mrmr.")
    return selected

In [25]:
mrmr_package(df_pd, "target", 50)

100%|██████████| 10/10 [00:05<00:00,  1.68it/s]

Spent 8.77s to compute mrmr.





['column_7',
 'column_0',
 'column_9',
 'column_6',
 'column_1',
 'column_5',
 'column_4',
 'column_8',
 'column_2',
 'column_3']

In [26]:
start = perf_counter()
res = fs.mrmr(df, target="target", k = 50, low_memory=False) # mrmr from dsds package
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing.")
res

INFO:dsds.fs:Running fscore to determine feature relevance...
INFO:dsds.fs:Found 10 total features to select from. Proceeding to select top 10 features.
MRMR, fscore: 100%|██████████| 10/10 [00:00<00:00, 3332.25it/s]

Output is sorted in order of selection (max relevance min redundancy).
Spent 0.02s in computing.





['column_7',
 'column_0',
 'column_9',
 'column_6',
 'column_1',
 'column_5',
 'column_4',
 'column_8',
 'column_2',
 'column_3']

In [27]:
start = perf_counter()
res = fs.mrmr(df, target="target", k = 50, low_memory=True)
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing.")
res

INFO:dsds.fs:Running fscore to determine feature relevance...
INFO:dsds.fs:Found 10 total features to select from. Proceeding to select top 10 features.
MRMR, fscore: 100%|██████████| 10/10 [00:00<00:00, 769.09it/s]

Output is sorted in order of selection (max relevance min redundancy).
Spent 0.02s in computing.





['column_7',
 'column_0',
 'column_9',
 'column_6',
 'column_1',
 'column_5',
 'column_4',
 'column_8',
 'column_2',
 'column_3']

# Power Transform

In [28]:
start = perf_counter()
res_eager = t.power_transform(df, cols=features, strategy="yeo_johnson")
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing.")
res_eager.head() 

Spent 0.21s in computing.


target,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9
i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,0.321941,-1.466481,0.207136,0.0337,2.302898,2.203265,-1.891525,-1.462403,-0.921433,-1.402246
1,1.973602,-2.529884,-2.216911,0.785923,-1.824261,1.548389,1.319252,-0.738849,1.339324,0.946991
0,0.114448,1.537318,0.360211,0.451238,-2.915753,-1.827537,2.555675,2.275477,0.649611,1.340007
0,-3.606981,-2.267986,-3.690671,1.003674,7.102352,0.059672,-0.433403,-3.105273,-1.533497,1.024665
0,-0.255427,-0.623899,-0.084823,0.90959,3.145446,0.734925,-0.6278,-0.944059,-0.405693,-1.839853


In [29]:
from sklearn.preprocessing import power_transform

In [30]:
# Sklearn with Pandas

# Only the power_transform call itself is timed. standardize=False is passed explicitly;
# the output below matches the dsds result above.
start = perf_counter()
transformed = power_transform(df_pd[features], method = "yeo-johnson", standardize=False)
end = perf_counter()
# Store the result in a copy instead of overwriting df_pd: assigning back into df_pd made
# this cell non-idempotent (a re-run would transform already-transformed data) and left
# every earlier sklearn cell working on altered data if it was run again.
df_pd_transformed = df_pd.copy()
df_pd_transformed[features] = transformed
print(f"Spent {end - start:.2f}s in computing.")
df_pd_transformed.head()


Spent 0.70s in computing.


Unnamed: 0,target,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9
0,1,0.321941,-1.466481,0.207136,0.0337,2.302898,2.203265,-1.891525,-1.462403,-0.921433,-1.402246
1,1,1.973602,-2.529884,-2.216911,0.785923,-1.824261,1.548389,1.319252,-0.738849,1.339324,0.946991
2,0,0.114448,1.537318,0.360211,0.451238,-2.915753,-1.827537,2.555675,2.275477,0.649611,1.340007
3,0,-3.606981,-2.267985,-3.690671,1.003674,7.102352,0.059672,-0.433403,-3.105273,-1.533497,1.024665
4,0,-0.255427,-0.623899,-0.084823,0.90959,3.145446,0.734925,-0.6278,-0.944059,-0.405693,-1.839853


# GC

In [31]:
import gc 
gc.collect()

131