In [1]:
import polars as pl
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from time import perf_counter


In [2]:
# Fix the seed so the synthetic dataset -- and every result computed from it below --
# is the same on each Restart & Run All.
orig_x, orig_y = make_classification(
    n_samples = 100_000, n_features = 10, n_informative = 5, n_redundant = 5, random_state = 42
)
# dsds is built on Polars and expects Polars dataframes, so data held in any other
# dataframe format must be converted to Polars first.
df = pl.from_numpy(orig_x).insert_at_idx(0, pl.Series("target", orig_y))
# Pandas copy of the same data, used by the sklearn / mrmr baselines.
df_pd = df.to_pandas()
target = "target"
# Feature names = every column except the target.
features = df.columns
features.remove(target)

In [3]:
df.head()

target,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9
i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0,-1.915925,-4.003424,-1.320002,0.946308,2.492093,-2.156989,-1.736505,-4.783813,2.565997,-5.554686
1,4.529332,-0.092049,2.554487,-1.177019,-2.501587,1.37687,-2.787249,0.094123,3.026037,1.029492
1,0.61663,1.247445,0.826013,1.086297,1.907177,-0.86425,0.341153,-1.756902,0.42313,-0.433211
0,3.971382,0.54888,0.32097,1.947411,-3.258481,-0.690176,-1.487264,2.274612,1.568011,0.470549
0,2.568516,-0.770287,1.58601,0.021154,-1.176578,-1.006707,-2.02971,-0.823046,1.851651,0.017566


## Comparisons

This notebook compares the results and performance of the dsds package against sklearn and a few other packages, covering feature selection and other transformations that are common in data science pipelines.

### Methods Compared:
1. Scaling and Imputation
2. Fscore
3. Mutual Information Score
4. MRMR feature selection strategies
5. Power Transform

You may restart the kernel after each section, but remember to rerun the cells above first. If you are concerned about memory usage while running this notebook, run the GC cell at the end.

# Scaling and Imputation

In [4]:
import dsds.transform as t

In [5]:
# Rebuild the list of feature names (every column except the target).
# NOTE(review): this yields the same list as the setup cell at the top and looks
# redundant -- confirm before removing.
features = df.columns
features.remove("target")

In [6]:
scaled = t.scale(df, cols=features, strategy="standard")
scaled.head()

target,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9
i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0,-1.346559,-2.309483,-0.829968,0.434469,1.456466,-1.396473,-0.987196,-2.121938,1.377123,-2.8757
1,2.220117,0.270753,1.605781,-0.873723,-1.463483,0.89528,-1.937195,0.37219,1.782193,1.21585
1,0.054906,1.154383,0.519153,0.520717,1.114449,-0.558115,0.891258,-0.574254,-0.509698,0.306895
0,1.911359,0.693557,0.201651,1.051253,-1.90606,-0.445226,-0.761852,1.487091,0.498385,0.86851
0,1.135041,-0.176664,0.996935,-0.135523,-0.688712,-0.6505,-1.252289,-0.096766,0.748132,0.587017


In [7]:
from sklearn.preprocessing import StandardScaler

In [8]:
# dsds and sklearn disagree in the last few digits because dsds uses the sample
# variance (ddof = 1) while sklearn uses ddof = 0.

# sklearn hands back a bare NumPy array, so the frame has to be rebuilt by hand:
# restore the column names, then put the target column back.
std = StandardScaler()
scaled2 = pd.DataFrame(
    std.fit_transform(df_pd[features], df_pd[target]),
    columns=features,
).assign(**{target: df_pd[target]})
scaled2.head()

Unnamed: 0,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,target
0,-1.346566,-2.309494,-0.829972,0.434471,1.456473,-1.39648,-0.987201,-2.121949,1.37713,-2.875714,0
1,2.220128,0.270754,1.605789,-0.873727,-1.46349,0.895285,-1.937205,0.372191,1.782202,1.215856,1
2,0.054906,1.154389,0.519155,0.520719,1.114454,-0.558118,0.891262,-0.574257,-0.5097,0.306896,1
3,1.911368,0.69356,0.201652,1.051258,-1.90607,-0.445228,-0.761856,1.487098,0.498387,0.868515,0
4,1.135046,-0.176665,0.99694,-0.135524,-0.688715,-0.650503,-1.252295,-0.096767,0.748136,0.58702,0


In [9]:
%%timeit
scaled = t.scale(df, cols=features, strategy="standard")

2.76 ms ± 25.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [10]:
%%timeit
std = StandardScaler()
scaled2 = std.fit_transform(df_pd[features], df_pd[target])
scaled2 = pd.DataFrame(scaled2, columns=features)
scaled2[target] = df_pd[target]


13.1 ms ± 96.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [11]:
t.impute(df, cols=features, strategy="median").head() 

target,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9
i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0,-1.915925,-4.003424,-1.320002,0.946308,2.492093,-2.156989,-1.736505,-4.783813,2.565997,-5.554686
1,4.529332,-0.092049,2.554487,-1.177019,-2.501587,1.37687,-2.787249,0.094123,3.026037,1.029492
1,0.61663,1.247445,0.826013,1.086297,1.907177,-0.86425,0.341153,-1.756902,0.42313,-0.433211
0,3.971382,0.54888,0.32097,1.947411,-3.258481,-0.690176,-1.487264,2.274612,1.568011,0.470549
0,2.568516,-0.770287,1.58601,0.021154,-1.176578,-1.006707,-2.02971,-0.823046,1.851651,0.017566


In [12]:
from sklearn.impute import SimpleImputer 

# NOTE(review): the whole frame is passed in, so the target column also goes through
# the imputer and comes back as float (0.0 / 1.0), whereas the dsds call above only
# touches `features`.
imputer = SimpleImputer(strategy="median")
imputed_values = imputer.fit_transform(df_pd, df_pd[target])
imputed = pd.DataFrame(imputed_values, columns=df.columns)
imputed.head()

Unnamed: 0,target,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9
0,0.0,-1.915925,-4.003424,-1.320002,0.946308,2.492093,-2.156989,-1.736505,-4.783813,2.565997,-5.554686
1,1.0,4.529332,-0.092049,2.554487,-1.177019,-2.501587,1.37687,-2.787249,0.094123,3.026037,1.029492
2,1.0,0.61663,1.247445,0.826013,1.086297,1.907177,-0.86425,0.341153,-1.756902,0.42313,-0.433211
3,0.0,3.971382,0.54888,0.32097,1.947411,-3.258481,-0.690176,-1.487264,2.274612,1.568011,0.470549
4,0.0,2.568516,-0.770287,1.58601,0.021154,-1.176578,-1.006707,-2.02971,-0.823046,1.851651,0.017566


In [13]:
%%timeit
t.impute(df, cols=features, strategy="median")

1.27 ms ± 8.01 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [14]:
%%timeit
imputer = SimpleImputer(strategy = "median")
imputed = pd.DataFrame(imputer.fit_transform(df_pd, df_pd[target]), columns=df.columns)

119 ms ± 1.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


# Fscore

In [15]:
import dsds.fs as fs # fs = feature_selection
from sklearn.feature_selection import mutual_info_classif, f_classif, f_regression

In [16]:
fs._f_score(df, target=target, num_list = features)

array([2.22045228e+02, 1.18428679e+04, 1.77695826e+00, 4.39617346e+03,
       1.41954302e-01, 7.02406170e+04, 5.31111372e+03, 3.20924452e+03,
       1.54537682e+00, 3.03490700e+03])

In [17]:
# The more CPU cores available, the bigger the difference in run time. The data here is not big enough to show it.
start = perf_counter()
res = fs.f_classif(df, target=target)
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing Fscore.")
res.head(10)

Spent 0.00s in computing Fscore.


feature,f_value,p_value
str,f64,f64
"""column_0""",222.045228,3.6660999999999995e-50
"""column_1""",11842.867863,0.0
"""column_2""",1.776958,0.182526
"""column_3""",4396.173462,0.0
"""column_4""",0.141954,0.706347
"""column_5""",70240.617017,0.0
"""column_6""",5311.113716,0.0
"""column_7""",3209.244521,0.0
"""column_8""",1.545377,0.213822
"""column_9""",3034.907,0.0


In [18]:
# sklearn baseline: time f_classif on the Pandas copy. Building the result frame
# (feature, f_value, p_value) is inside the timed region as well.
start = perf_counter()
f, pv = f_classif(df_pd[features], df_pd[target])
res = pd.DataFrame({"feature":features, "f_value":f, "p_value":pv})
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing Fscore.")
res.head(10)

Spent 0.02s in computing Fscore.


Unnamed: 0,feature,f_value,p_value
0,column_0,222.045228,3.6661499999999997e-50
1,column_1,11842.867863,0.0
2,column_2,1.776958,0.1825263
3,column_3,4396.173462,0.0
4,column_4,0.141954,0.7063466
5,column_5,70240.617017,0.0
6,column_6,5311.113716,0.0
7,column_7,3209.244521,0.0
8,column_8,1.545377,0.2138222
9,column_9,3034.907,0.0


In [19]:
# Same measurement with sklearn's f_regression on the 0/1 target; in this run it
# reports the same F values and p-values as f_classif above.
start = perf_counter()
f, pv = f_regression(df_pd[features], df_pd[target])
res = pd.DataFrame({"feature":features, "f_value":f, "p_value":pv})
end = perf_counter()
# Format spec is ":.2f" (no space flag), so the message matches the other timing cells.
print(f"Spent {end - start:.2f}s in computing Fscore.")
res.head(10)

Spent  0.01s in computing Fscore.


Unnamed: 0,feature,f_value,p_value
0,column_0,222.045228,3.6661499999999997e-50
1,column_1,11842.867863,0.0
2,column_2,1.776958,0.1825263
3,column_3,4396.173462,0.0
4,column_4,0.141954,0.7063466
5,column_5,70240.617017,0.0
6,column_6,5311.113716,0.0
7,column_7,3209.244521,0.0
8,column_8,1.545377,0.2138222
9,column_9,3034.907,0.0


# Mutual Information Score

In [20]:
# Vastly faster than sklearn. Finished in 0.7s in this run
fs.mutual_info(df, target=target, conti_cols=features).sort(by="estimated_mi", descending=True).limit(10)

Mutual Info: 100%|██████████| 10/10 [00:00<00:00, 13.10it/s]


feature,estimated_mi
str,f64
"""column_5""",0.26089
"""column_1""",0.099094
"""column_7""",0.06397
"""column_9""",0.055507
"""column_6""",0.029568
"""column_3""",0.0295
"""column_4""",0.014645
"""column_2""",0.013376
"""column_0""",0.008127
"""column_8""",4.6e-05


In [21]:
def estimate_mi_sklearn(df:pd.DataFrame, cols:list[str], target:str, k=3, random_state:int=42):
    """Estimate the mutual information between each column in `cols` and `target` with sklearn.

    Calls `mutual_info_classif` with `discrete_features=False`, `n_neighbors=k`
    and the given `random_state`.

    Returns a Polars dataframe with the columns "feature" and "estimated_mi",
    sorted by "estimated_mi" in descending order.
    """
    X = df[cols]
    y = df[target]
    mi_estimates = mutual_info_classif(
        X, y, n_neighbors=k, random_state=random_state, discrete_features=False
    )
    # Two column-wise records: the feature names and their estimated scores.
    result = pl.from_records([cols, mi_estimates], schema=["feature", "estimated_mi"])
    return result.sort("estimated_mi", descending=True)

In [22]:
# sklearn's implementation is slow because it does not turn on multithreading for its KD-trees.
# sklearn also provides no option to turn it on, even though its KD-trees
# do support this. Finished in 4.4s in this run.
estimate_mi_sklearn(df_pd, cols=features, target=target).limit(10)

feature,estimated_mi
str,f64
"""column_5""",0.260888
"""column_1""",0.099094
"""column_7""",0.06397
"""column_9""",0.055507
"""column_6""",0.029568
"""column_3""",0.0295
"""column_4""",0.014645
"""column_2""",0.013376
"""column_0""",0.008127
"""column_8""",4.4e-05


# MRMR Feature selection Strategy

In [23]:
from mrmr import mrmr_classif # This is currently the most starred MRMR Python package on github

In [24]:
# Need to wrap it so that we get apples to apples comparison
def mrmr_package(df:pd.DataFrame, target:str, k:int) -> list[str]:
    """Run `mrmr_classif` on a Pandas dataframe and print how long it took.

    Every column other than `target` is used as a feature. Only the
    `mrmr_classif` call itself is timed.

    Returns the output of `mrmr_classif(X, y, K = k)`.
    """
    feature_names = list(df.columns)
    feature_names.remove(target)
    X, y = df[feature_names], df[target]
    t0 = perf_counter()
    selected = mrmr_classif(X, y, K = k)
    t1 = perf_counter()
    print(f"Spent {t1 - t0:.2f}s to compute mrmr.")
    return selected

In [25]:
mrmr_package(df_pd, "target", 50)

100%|██████████| 10/10 [00:05<00:00,  1.67it/s]

Spent 8.78s to compute mrmr.





['column_5',
 'column_1',
 'column_3',
 'column_6',
 'column_7',
 'column_9',
 'column_0',
 'column_2',
 'column_8',
 'column_4']

In [26]:
# dsds MRMR with low_memory=False. k = 50 is more than the 10 available features;
# per the log output of this run, dsds then proceeds to select the top 10.
start = perf_counter()
res = fs.mrmr(df, target="target", k = 50, low_memory=False)
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing.")
res

INFO:dsds.fs:Running fscore to determine feature relevance...


Found 10 total features to select from. Proceeding to select top 10 features.


MRMR, fscore: 100%|██████████| 10/10 [00:00<00:00, 1999.86it/s]

Output is sorted in order of selection (max relevance min redundancy).
Spent 0.02s in computing.





['column_5',
 'column_1',
 'column_3',
 'column_6',
 'column_7',
 'column_9',
 'column_0',
 'column_2',
 'column_8',
 'column_4']

In [27]:
# Same dsds MRMR call with low_memory=True. In this run the selected features and
# their order are identical to the low_memory=False result.
start = perf_counter()
res = fs.mrmr(df, target="target", k = 50, low_memory=True)
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing.")
res

INFO:dsds.fs:Running fscore to determine feature relevance...


Found 10 total features to select from. Proceeding to select top 10 features.


MRMR, fscore: 100%|██████████| 10/10 [00:00<00:00, 624.85it/s]

Output is sorted in order of selection (max relevance min redundancy).
Spent 0.02s in computing.





['column_5',
 'column_1',
 'column_3',
 'column_6',
 'column_7',
 'column_9',
 'column_0',
 'column_2',
 'column_8',
 'column_4']

# Power Transform

In [28]:
# Eager transform.
start = perf_counter()
res_eager = t.power_transform(df, cols=features, strategy="yeo_johnson")
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing.")
res_eager.head() 

Spent 0.24s in computing.


target,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9
i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0,-1.907149,-3.82021,-1.28803,0.985707,2.612754,-2.207456,-1.862715,-4.865948,2.613976,-3.344589
1,4.56549,-0.091858,2.657185,-1.12199,-2.386905,1.354094,-3.071196,0.094058,3.089043,1.22814
1,0.617799,1.274414,0.840205,1.136894,1.983762,-0.874359,0.334994,-1.772729,0.425041,-0.399006
0,4.000742,0.554909,0.323381,2.089364,-3.08234,-0.696871,-1.583403,2.250382,1.588767,0.51525
0,2.582966,-0.759197,1.631511,0.021178,-1.144841,-1.020029,-2.195168,-0.827176,1.879302,0.017633


In [29]:
from sklearn.preprocessing import power_transform

In [30]:
# Sklearn with Pandas

# Only the power_transform call is timed; building the output frame is not.
start = perf_counter()
transformed = power_transform(df_pd[features], method = "yeo-johnson", standardize=False)
end = perf_counter()
# Write the result into a copy rather than back into df_pd. Overwriting df_pd made
# this cell non-idempotent (a re-run would transform already-transformed data) and
# changed the input seen by every sklearn cell that is re-run afterwards.
df_pd_transformed = df_pd.copy()
df_pd_transformed[features] = transformed
print(f"Spent {end - start:.2f}s in computing.")
df_pd_transformed.head()


Spent 0.73s in computing.


Unnamed: 0,target,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9
0,0,-1.907149,-3.82021,-1.28803,0.985707,2.612754,-2.207456,-1.862715,-4.865948,2.613976,-3.344589
1,1,4.56549,-0.091858,2.657185,-1.12199,-2.386905,1.354094,-3.071196,0.094058,3.089043,1.22814
2,1,0.617799,1.274414,0.840205,1.136894,1.983762,-0.874359,0.334994,-1.772729,0.425041,-0.399006
3,0,4.000742,0.554909,0.323381,2.089364,-3.08234,-0.696871,-1.583403,2.250382,1.588767,0.51525
4,0,2.582966,-0.759197,1.631511,0.021178,-1.144841,-1.020029,-2.195168,-0.827176,1.879302,0.017633


# GC

In [31]:
import gc 
gc.collect()

20