In [1]:
import polars as pl
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from time import perf_counter


In [2]:
# Fix the seed so the synthetic dataset is the same on every run; without it
# every kernel restart produces different data and different numbers below.
orig_x, orig_y = make_classification(
    n_samples = 100_000,
    n_features = 10,
    n_informative = 5,
    n_redundant = 5,
    random_state = 42,
)
# dsds relies heavily on Polars and expects a Polars dataframe.
# Any other dataframe format must be converted to Polars before calling dsds.
df = pl.from_numpy(orig_x).insert_at_idx(0, pl.Series("target", orig_y))
# Pandas version of the same data, used for the sklearn / mrmr comparisons.
df_pd = df.to_pandas()
target = "target"
# Feature names: every column except the target.
features = df.columns
features.remove(target)

In [3]:
# Preview the first rows of the Polars frame built above.
df.head()

target,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9
i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,-0.298269,-0.243838,1.000231,0.250288,-0.850341,-0.335285,-0.774818,0.630436,0.082388,-0.397133
0,0.342461,1.261295,1.189653,5.469217,-0.330301,2.873605,-2.769486,-0.562839,-3.155674,-0.199987
1,0.310876,-2.109248,-0.639685,-0.930077,-2.753375,1.072716,-2.559143,2.1285,1.831962,-1.168917
0,0.183092,0.37344,1.554368,4.383568,-1.565337,2.311002,-3.457305,0.644087,-2.052496,-0.863394
1,1.984526,-2.025175,-1.068103,-1.500969,-1.059816,0.660551,-1.753644,1.912164,0.884035,-2.244608


## Comparisons

This notebook compares the results and performance of the dsds package against sklearn and a few other packages, for feature selection and other transformations that are common in data science pipelines.

### Methods Compared:
1. Scaling and Imputation
2. Fscore
3. Mutual Information Score
4. MRMR feature selection strategies
5. Power Transform

You may restart the kernel after each section, but remember to rerun the cells above (imports and data creation) before continuing. If you are concerned about memory usage while running this notebook, run the GC cell at the end.

# Scaling and Imputation

In [4]:
import dsds.transform as t

In [5]:
# Feature names: every column of df other than the target column.
features = [name for name in df.columns if name != "target"]

In [6]:
# Standard-scale the feature columns with dsds; the target column is not in
# `features`, so it is passed through unchanged.
scaled = t.scale(df, cols=features, strategy="standard")
scaled.head()

target,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9
i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,-0.729039,-0.153744,0.728232,-0.266937,-0.374598,-0.715868,0.176412,-0.038076,0.378098,0.475553
0,-0.281205,0.814441,0.866095,2.347346,-0.059052,1.630085,-1.417297,-0.668289,-1.69498,0.630421
1,-0.303281,-1.35368,-0.46532,-0.858209,-1.529307,0.313493,-1.249236,0.753106,1.498213,-0.130721
0,-0.392595,0.243323,1.131539,1.803519,-0.808438,1.218778,-1.966854,-0.030867,-0.988701,0.109282
1,0.866506,-1.2996,-0.777128,-1.144182,-0.501701,0.012168,-0.605654,0.638851,0.89133,-0.97573


In [7]:
from sklearn.preprocessing import StandardScaler

In [8]:
# sklearn equivalent of the dsds scaling. Any small difference between the two
# results comes from the variance estimator: dsds uses the sample variance
# (ddof = 1) while sklearn uses ddof = 0.
#
# Noticeably more code for the same scaling: fit_transform returns a NumPy
# array, so it has to be wrapped back into a DataFrame and the target re-attached.
std = StandardScaler()
scaled2 = pd.DataFrame(
    std.fit_transform(df_pd[features], df_pd[target]),
    columns=features,
)
scaled2[target] = df_pd[target]
scaled2.head()

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,target
0,-0.729039,-0.153744,0.728232,-0.266937,-0.374598,-0.715868,0.176412,-0.038076,0.378098,0.475553,1
1,-0.281205,0.814441,0.866095,2.347346,-0.059052,1.630085,-1.417297,-0.668289,-1.69498,0.630421,0
2,-0.303281,-1.35368,-0.46532,-0.858209,-1.529307,0.313493,-1.249236,0.753106,1.498213,-0.130721,1
3,-0.392595,0.243323,1.131539,1.803519,-0.808438,1.218778,-1.966854,-0.030867,-0.988701,0.109282,0
4,0.866506,-1.2996,-0.777128,-1.144182,-0.501701,0.012168,-0.605654,0.638851,0.89133,-0.97573,1


In [9]:
%%timeit
# Time the dsds standard scaling on the Polars frame.
scaled = t.scale(df, cols=features, strategy="standard")

3 ms ± 502 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [10]:
%%timeit
std = StandardScaler()
scaled2 = std.fit_transform(df_pd[features], df_pd[target])
scaled2 = pd.DataFrame(scaled2, columns=features)
scaled2[target] = df_pd[target]


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dty

22.6 ms ± 1.11 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dty

In [11]:
import dsds.transform as t
# Median-impute the feature columns with dsds and preview the first 3 rows.
t.impute(df, cols=features, strategy="median").head(3) 

target,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9
i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,-0.298269,-0.243838,1.000231,0.250288,-0.850341,-0.335285,-0.774818,0.630436,0.082388,-0.397133
0,0.342461,1.261295,1.189653,5.469217,-0.330301,2.873605,-2.769486,-0.562839,-3.155674,-0.199987
1,0.310876,-2.109248,-0.639685,-0.930077,-2.753375,1.072716,-2.559143,2.1285,1.831962,-1.168917


In [12]:
from sklearn.impute import SimpleImputer 
# sklearn equivalent. The imputer is fitted on the whole pandas frame, target
# column included, and its output is wrapped back into a DataFrame that reuses
# the Polars frame's column names.
imputer = SimpleImputer(strategy = "median")
imputed = pd.DataFrame(imputer.fit_transform(df_pd, df_pd[target]), columns=df.columns)
imputed.head(3)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,target,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9
0,1.0,-0.298269,-0.243838,1.000231,0.250288,-0.850341,-0.335285,-0.774818,0.630436,0.082388,-0.397133
1,0.0,0.342461,1.261295,1.189653,5.469217,-0.330301,2.873605,-2.769486,-0.562839,-3.155674,-0.199987
2,1.0,0.310876,-2.109248,-0.639685,-0.930077,-2.753375,1.072716,-2.559143,2.1285,1.831962,-1.168917


In [13]:
%%timeit
# Time the sklearn median imputation, including rebuilding the DataFrame.
imputer = SimpleImputer(strategy = "median")
imputed = pd.DataFrame(imputer.fit_transform(df_pd, df_pd[target]), columns=df.columns)


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dty

129 ms ± 4.71 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [14]:
%%timeit
# Time the dsds median imputation on the Polars frame.
t.impute(df, cols=features, strategy="median")

1.28 ms ± 50.6 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


# Fscore

In [15]:
import dsds.fs as fs # fs = feature_selection
from sklearn.feature_selection import mutual_info_classif, f_classif, f_regression

In [16]:
# Call the underscore-prefixed helper directly. As the output shows, it returns
# one F statistic per feature as a bare array (no feature names, no p-values).
fs._f_score(df, target=target, num_list = features)

array([1.90352311e+03, 6.77915923e+04, 1.99931667e-01, 4.11440394e+04,
       1.69224087e+03, 8.43065971e+03, 2.66949949e-02, 1.40517362e+02,
       1.11763545e+04, 1.25595736e+00])

In [17]:
# The more cores available, the bigger the speed difference; the data here is
# not big enough to show it.
start = perf_counter()
res = fs.f_classif(df, target=target)
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing Fscore.")
res.head(10)

Spent 0.00s in computing Fscore.


feature,f_value,p_value
str,f64,f64
"""column_0""",1903.523105,0.0
"""column_1""",67791.592265,0.0
"""column_2""",0.199932,0.654777
"""column_3""",41144.039363,0.0
"""column_4""",1692.240868,0.0
"""column_5""",8430.659706,0.0
"""column_6""",0.026695,0.870215
"""column_7""",140.517362,2.1567999999999999e-32
"""column_8""",11176.354518,0.0
"""column_9""",1.255957,0.26242


In [18]:
# sklearn's f_classif returns the F statistics and p-values as two separate
# outputs; assemble them into a table shaped like the dsds result.
# The timed region includes building that table.
start = perf_counter()
f, pv = f_classif(df_pd[features], df_pd[target])
res = pd.DataFrame({"feature":features, "f_value":f, "p_value":pv})
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing Fscore.")
res.head(10)

Spent 0.02s in computing Fscore.


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,feature,f_value,p_value
0,column_0,1903.523105,0.0
1,column_1,67791.592265,0.0
2,column_2,0.199932,0.654777
3,column_3,41144.039363,0.0
4,column_4,1692.240868,0.0
5,column_5,8430.659706,0.0
6,column_6,0.026695,0.8702148
7,column_7,140.517362,2.1567630000000002e-32
8,column_8,11176.354518,0.0
9,column_9,1.255957,0.2624204


In [19]:
# Same comparison using sklearn's f_regression. On this binary target the
# F statistics and p-values it reports match the f_classif results.
start = perf_counter()
f, pv = f_regression(df_pd[features], df_pd[target])
res = pd.DataFrame({"feature":features, "f_value":f, "p_value":pv})
end = perf_counter()
# Use ":.2f" (no space sign flag) so the message matches the other timing cells
# instead of printing a double space before the number.
print(f"Spent {end - start:.2f}s in computing Fscore.")
res.head(10)

Spent  0.01s in computing Fscore.


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,feature,f_value,p_value
0,column_0,1903.523105,0.0
1,column_1,67791.592265,0.0
2,column_2,0.199932,0.654777
3,column_3,41144.039363,0.0
4,column_4,1692.240868,0.0
5,column_5,8430.659706,0.0
6,column_6,0.026695,0.8702148
7,column_7,140.517362,2.1567630000000002e-32
8,column_8,11176.354518,0.0
9,column_9,1.255957,0.2624204


# Mutual Information Score

In [20]:
# Vastly faster than sklearn. Finished in 0.7s in this run.
# All features are passed as continuous columns; show the 10 largest estimates.
fs.mutual_info(df, target=target, conti_cols=features).sort(by="estimated_mi", descending=True).limit(10)

Mutual Info: 100%|██████████| 10/10 [00:00<00:00, 13.06it/s]


feature,estimated_mi
str,f64
"""column_1""",0.293065
"""column_3""",0.220958
"""column_5""",0.058333
"""column_8""",0.056483
"""column_2""",0.047116
"""column_0""",0.028254
"""column_9""",0.011597
"""column_4""",0.009725
"""column_7""",0.008533
"""column_6""",0.000833


In [21]:
# A wrapper for more apples to apples comparison
def estimate_mi_sklearn(df:pd.DataFrame, cols:list[str], target:str, k:int=3, random_state:int=42):
    """Estimate feature/target mutual information with sklearn.

    Wraps sklearn's mutual_info_classif so that its result is shaped like the
    dsds output: a Polars frame with a "feature" and an "estimated_mi" column,
    sorted by descending estimate.

    Parameters
    ----------
    df
        Pandas frame holding both the feature columns and the target column.
    cols
        Names of the feature columns; all are treated as continuous
        (discrete_features=False).
    target
        Name of the target column.
    k
        Passed to mutual_info_classif as n_neighbors.
    random_state
        Passed through to mutual_info_classif.
    """
    mi_estimates = mutual_info_classif(
        df[cols],
        df[target],
        n_neighbors=k,
        random_state=random_state,
        discrete_features=False,
    )
    # `cols` and `mi_estimates` are two columns, not two rows. State the
    # orientation explicitly rather than relying on Polars' inference, which
    # depends on the shape and types of the input.
    return pl.from_records(
        [cols, mi_estimates],
        schema=["feature", "estimated_mi"],
        orient="col",
    ).sort("estimated_mi", descending=True)

In [22]:
# sklearn's implementation is slow because it does not turn on multithreading
# for its KD-trees, and it offers no option to turn it on even though sklearn's
# KD-trees support it. Finished in 4.4s in this run.
estimate_mi_sklearn(df_pd, cols=features, target=target).limit(10)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


feature,estimated_mi
str,f64
"""column_1""",0.293058
"""column_3""",0.220958
"""column_5""",0.058333
"""column_8""",0.056483
"""column_2""",0.047116
"""column_0""",0.028254
"""column_9""",0.011595
"""column_4""",0.009725
"""column_7""",0.008533
"""column_6""",0.000823


# MRMR Feature Selection Strategy

In [23]:
from mrmr import mrmr_classif # This is currently the most starred MRMR Python package on github

In [24]:
# Need to wrap it so that we get apples to apples comparison
def mrmr_package(df:pd.DataFrame, target:str, k:int) -> list[str]:
    """Run mrmr_classif on a pandas frame and print how long the call took.

    Every column of `df` other than `target` is used as a candidate feature.
    Only the mrmr_classif call itself is timed; preparing the inputs is not.
    The value returned by mrmr_classif (called with K = k) is returned as is.
    """
    predictors = list(df.columns)
    predictors.remove(target)
    inputs, labels = df[predictors], df[target]
    t0 = perf_counter()
    selected = mrmr_classif(inputs, labels, K = k)
    elapsed = perf_counter() - t0
    print(f"Spent {elapsed:.2f}s to compute mrmr.")
    return selected

In [27]:
# Ask for up to 50 features. The data only has 10, so all 10 come back,
# listed in selection order.
out1 = mrmr_package(df_pd, "target", 50)
out1 

100%|██████████| 10/10 [00:06<00:00,  1.62it/s]

Spent 9.04s to compute mrmr.





['column_1',
 'column_8',
 'column_3',
 'column_5',
 'column_4',
 'column_0',
 'column_7',
 'column_9',
 'column_2',
 'column_6']

In [28]:
start = perf_counter()
# mrmr from the dsds package. This is actually the slow version, but it is much
# less memory intensive and more practical for real-life data. See the docstring
# for more details.
out2 = fs.mrmr(df, target="target", k = 50) 
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing.")
out2

INFO:dsds.fs:Running fscore to determine feature relevance...
INFO:dsds.fs:Found 10 total features to select from. Proceeding to select top 10 features.
MRMR, fscore: 100%|██████████| 10/10 [00:00<00:00, 769.07it/s]

Output is sorted in order of selection (max relevance min redundancy).
Spent 0.02s in computing.





['column_1',
 'column_8',
 'column_3',
 'column_5',
 'column_4',
 'column_0',
 'column_7',
 'column_9',
 'column_2',
 'column_6']

In [29]:
# Check whether both packages selected the same features in the same order
# (list equality is order-sensitive).
out1 == out2 

True

In [30]:
# The dsds package provides another kind of MRMR: knock-out MRMR.
fs.knock_out_mrmr(df, target="target", k = 50) 

INFO:dsds.fs:Running fscore to determine feature relevance...
Knock out MRMR, fscore:  80%|████████  | 8/10 [00:00<00:00, 258.00it/s]

Found only 8/50 number of values because most of them are highly correlated and the knock out rule eliminates most of them.
Output is sorted in order of selection (max relevance min redundancy).





['column_1',
 'column_3',
 'column_8',
 'column_5',
 'column_0',
 'column_9',
 'column_2',
 'column_6']

In [32]:
# Time the same knock-out MRMR call; the result itself is discarded here.
start = perf_counter()
fs.knock_out_mrmr(df, target="target", k = 50) 
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing.")

INFO:dsds.fs:Running fscore to determine feature relevance...
Knock out MRMR, fscore:  80%|████████  | 8/10 [00:00<00:00, 258.00it/s]

Found only 8/50 number of values because most of them are highly correlated and the knock out rule eliminates most of them.
Output is sorted in order of selection (max relevance min redundancy).
Spent 0.04s in computing.





# Power Transform

In [33]:
# Yeo-Johnson power transform of the feature columns with dsds, timed.
start = perf_counter()
res_eager = t.power_transform(df, cols=features, strategy="yeo_johnson")
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing.")
res_eager.head() 

Spent 0.22s in computing.


target,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9
i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,-0.291056,-0.253004,1.086412,0.240437,-0.82934,-0.330533,-0.77679,0.629706,0.082261,-0.398976
0,0.35218,1.088472,1.307916,3.644092,-0.326673,3.108431,-2.78762,-0.563431,-3.263988,-0.20048
1,0.318943,-2.637178,-0.604741,-1.058455,-2.595465,1.11509,-2.575055,2.122199,1.789716,-1.182391
0,0.185976,0.353891,1.745719,3.060092,-1.503868,2.473689,-3.483348,0.643328,-2.105306,-0.871208
1,2.23259,-2.516223,-0.980903,-1.811442,-1.028683,0.677985,-1.762048,1.906909,0.872257,-2.286343


In [34]:
from sklearn.preprocessing import power_transform

In [35]:
# Sklearn with Pandas

# Only the power_transform call itself is timed.
start = perf_counter()
transformed = power_transform(df_pd[features], method = "yeo-johnson", standardize=False)
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing.")
# Write the result into a copy instead of overwriting df_pd in place: df_pd is
# shared by every sklearn comparison in this notebook, so mutating it would make
# this cell non-idempotent and feed already-transformed data to any cell that is
# re-run afterwards.
df_pd_transformed = df_pd.copy()
df_pd_transformed[features] = transformed
df_pd_transformed.head()


Spent 0.75s in computing.


Unnamed: 0,target,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9
0,1,-0.291056,-0.253004,1.086412,0.240437,-0.82934,-0.330533,-0.77679,0.629706,0.082261,-0.398976
1,0,0.35218,1.088472,1.307916,3.644092,-0.326673,3.108431,-2.78762,-0.563431,-3.263988,-0.20048
2,1,0.318943,-2.637178,-0.604741,-1.058455,-2.595465,1.11509,-2.575055,2.122199,1.789716,-1.182391
3,0,0.185976,0.353891,1.745719,3.060092,-1.503868,2.473689,-3.483348,0.643328,-2.105306,-0.871208
4,1,2.23259,-2.516223,-0.980903,-1.811442,-1.028683,0.677985,-1.762048,1.906909,0.872257,-2.286343


# GC

In [36]:
import gc 
# Force a full garbage-collection pass to release memory held by the cells above;
# the number displayed is gc.collect()'s return value.
gc.collect()

231