In [1]:
import polars as pl
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from time import perf_counter


In [2]:
orig_x, orig_y = make_classification(n_samples = 100_000, n_features = 10, n_informative = 5, n_redundant = 5)
# This is a Polars dataframe. This is dsds package's favored dataframe. dsds relies on Polars heavily.
# You must turn other dataframe formats into Polars for dsds to work.
df = pl.from_numpy(orig_x).insert_at_idx(0, pl.Series("target", orig_y)) 
# Turn it into Pandas.
df_pd = df.to_pandas()
target = "target"
features = df.columns
features.remove(target)

In [3]:
df.head()

target,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,-1.45995,-0.969715,-0.494001,-0.358022,0.438434,0.759413,2.563082,0.144977,-1.577424,1.743793
1,1.476236,0.935761,-1.121335,0.877504,-0.838653,-0.359677,0.870463,1.727277,1.318132,-1.939887
0,-1.752199,1.662673,2.001634,-3.115826,1.115382,0.232854,-4.867587,-0.219266,-0.571483,-1.412335
0,-2.474012,0.039799,1.474338,-2.908336,0.833263,0.10983,-1.9125,0.218773,-0.955522,0.248039
1,0.900821,-0.278962,0.537015,0.319047,0.184227,-0.218132,-0.698765,-1.101311,0.814358,1.652277


## Comparisons

This notebook compares results and performance between the dsds package, sklearn and some other packages for feature selection and some other transformations common in the data science pipeline.

### Methods Compared:
1. Scaling and Imputation
2. Fscore
3. Mutual Information Score
4. MRMR feature selection strategies
5. Power Transform

You may restart the kernel after each section. But remember to rerun the cells above. If you are concerned about memory usage when running this notebook, go to the end and run the gc cell.

# Scaling and Imputation

In [4]:
import dsds.transform as t

In [5]:
features = df.columns
features.remove("target")

In [6]:
scaled = t.scale(df, cols=features, strategy="normal")
scaled.head()

target,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,-0.606423,-0.571995,-0.602506,0.323383,0.23861,0.446446,1.520396,-0.22874,-0.702465,0.830077
1,1.246387,0.552232,-1.097964,1.01344,-0.959214,-0.183929,0.516765,0.780415,0.494686,-1.62206
0,-0.79084,0.981109,1.368508,-1.216887,0.873544,0.14984,-2.885586,-0.461045,-0.286565,-1.270881
0,-1.246323,0.023616,0.952057,-1.101001,0.608934,0.080541,-1.133381,-0.181674,-0.445344,-0.16561
1,0.883285,-0.164452,0.211774,0.701535,0.000181,-0.104197,-0.413702,-1.023593,0.286403,0.769158


In [7]:
from sklearn.preprocessing import StandardScaler

In [8]:
# The difference in result is caused by using ddof = 1 for sample variance in dsds
# and using ddof = 0 in sklearn.

# Long and convoluted code just to do some scaling...
std = StandardScaler()
scaled2 = std.fit_transform(df_pd[features], df_pd[target])
# scaled2[:5, :] # scaled2 is a numpy matrix
scaled2 = pd.DataFrame(scaled2, columns=features)
scaled2[target] = df_pd[target]
scaled2.head()

Unnamed: 0,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,target
0,-0.606426,-0.571998,-0.60251,0.323385,0.238612,0.446448,1.520403,-0.228741,-0.702469,0.830082,1
1,1.246393,0.552235,-1.09797,1.013445,-0.959219,-0.183929,0.516768,0.780419,0.494688,-1.622068,1
2,-0.790844,0.981114,1.368515,-1.216893,0.873548,0.149841,-2.8856,-0.461047,-0.286566,-1.270887,0
3,-1.246329,0.023616,0.952062,-1.101006,0.608937,0.080541,-1.133386,-0.181675,-0.445346,-0.165611,0
4,0.88329,-0.164453,0.211775,0.701538,0.000181,-0.104198,-0.413704,-1.023598,0.286405,0.769161,1


In [9]:
%%timeit
scaled = t.scale(df, cols=features, strategy="normal")

3.73 ms ± 193 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [10]:
%%timeit
std = StandardScaler()
scaled2 = std.fit_transform(df_pd[features], df_pd[target])
scaled2 = pd.DataFrame(scaled2, columns=features)
scaled2[target] = df_pd[target]


12.3 ms ± 122 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [11]:
t.impute(df, cols=features, strategy="median").head() # 8.29 ms

target,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,-1.45995,-0.969715,-0.494001,-0.358022,0.438434,0.759413,2.563082,0.144977,-1.577424,1.743793
1,1.476236,0.935761,-1.121335,0.877504,-0.838653,-0.359677,0.870463,1.727277,1.318132,-1.939887
0,-1.752199,1.662673,2.001634,-3.115826,1.115382,0.232854,-4.867587,-0.219266,-0.571483,-1.412335
0,-2.474012,0.039799,1.474338,-2.908336,0.833263,0.10983,-1.9125,0.218773,-0.955522,0.248039
1,0.900821,-0.278962,0.537015,0.319047,0.184227,-0.218132,-0.698765,-1.101311,0.814358,1.652277


In [12]:
from sklearn.impute import SimpleImputer # 1.25s

imputer = SimpleImputer(strategy = "median")
imputed = pd.DataFrame(imputer.fit_transform(df_pd, df_pd[target]), columns=df.columns)
imputed.head()

Unnamed: 0,target,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9
0,1.0,-1.45995,-0.969715,-0.494001,-0.358022,0.438434,0.759413,2.563082,0.144977,-1.577424,1.743793
1,1.0,1.476236,0.935761,-1.121335,0.877504,-0.838653,-0.359677,0.870463,1.727277,1.318132,-1.939887
2,0.0,-1.752199,1.662673,2.001634,-3.115826,1.115382,0.232854,-4.867587,-0.219266,-0.571483,-1.412335
3,0.0,-2.474012,0.039799,1.474338,-2.908336,0.833263,0.10983,-1.9125,0.218773,-0.955522,0.248039
4,1.0,0.900821,-0.278962,0.537015,0.319047,0.184227,-0.218132,-0.698765,-1.101311,0.814358,1.652277


In [13]:
%%timeit
t.impute(df, cols=features, strategy="median")

4.61 ms ± 983 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [14]:
%%timeit
imputer = SimpleImputer(strategy = "median")
imputed = pd.DataFrame(imputer.fit_transform(df_pd, df_pd[target]), columns=df.columns)

151 ms ± 7.91 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


# Fscore

In [15]:
import dsds.fs as fs # fs = feature_selection
from sklearn.feature_selection import mutual_info_classif, f_classif, f_regression

In [16]:
# The more core, the bigger the difference
start = perf_counter()
res = fs.f_classif(df, target=target)
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing Fscore.")
res.head(10)

Spent 0.01s in computing Fscore.


feature,f_value,p_value
str,f64,f64
"""column_0""",10570.97025,0.0
"""column_1""",0.019707,0.888358
"""column_2""",74561.483267,0.0
"""column_3""",201460.783031,0.0
"""column_4""",1990.353824,0.0
"""column_5""",6548.386941,0.0
"""column_6""",53363.521346,0.0
"""column_7""",11172.433201,0.0
"""column_8""",9094.343735,0.0
"""column_9""",12143.982969,0.0


In [17]:
start = perf_counter()
f, pv = f_classif(df_pd[features], df_pd[target])
res = pd.DataFrame({"feature":features, "f_value":f, "p_value":pv})
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing Fscore.")
res.head(10)

Spent 0.02s in computing Fscore.


Unnamed: 0,feature,f_value,p_value
0,column_0,10570.97025,0.0
1,column_1,0.019707,0.888358
2,column_2,74561.483267,0.0
3,column_3,201460.783031,0.0
4,column_4,1990.353824,0.0
5,column_5,6548.386941,0.0
6,column_6,53363.521346,0.0
7,column_7,11172.433201,0.0
8,column_8,9094.343735,0.0
9,column_9,12143.982969,0.0


In [18]:
start = perf_counter()
f, pv = f_regression(df_pd[features], df_pd[target])
res = pd.DataFrame({"feature":features, "f_value":f, "p_value":pv})
end = perf_counter()
print(f"Spent {end - start: .2f}s in computing Fscore.")
res.head(10)

Spent  0.01s in computing Fscore.


Unnamed: 0,feature,f_value,p_value
0,column_0,10570.97025,0.0
1,column_1,0.019707,0.888358
2,column_2,74561.483267,0.0
3,column_3,201460.783031,0.0
4,column_4,1990.353824,0.0
5,column_5,6548.386941,0.0
6,column_6,53363.521346,0.0
7,column_7,11172.433201,0.0
8,column_8,9094.343735,0.0
9,column_9,12143.982969,0.0


# Mutual Information Score

In [19]:
# Vastly faster than sklearn
fs.mutual_info(df, target=target, conti_cols=features).sort(by="estimated_mi", descending=True).limit(10)

Mutual Info: 100%|██████████| 10/10 [00:01<00:00,  9.85it/s]


feature,estimated_mi
str,f64
"""column_3""",0.494338
"""column_2""",0.274444
"""column_6""",0.218141
"""column_8""",0.089253
"""column_0""",0.087992
"""column_7""",0.064688
"""column_9""",0.063232
"""column_5""",0.054517
"""column_4""",0.027601
"""column_1""",0.005256


In [20]:
def estimate_mi_sklearn(df:pd.DataFrame, cols:list[str], target:str, k=3, random_state:int=42):
    mi_estimates = mutual_info_classif(df[cols], df[target]
                        , n_neighbors=k, random_state=random_state, discrete_features=False)

    return pl.from_records([cols, mi_estimates], schema=["feature", "estimated_mi"]).sort("estimated_mi", descending=True)

In [21]:
# The reason sklearn's impl is slow is that it did not turn on multithreading for KDtrees.
# Sklearn also did not provide an option to turn it on, despite the fact that sklearn's KDtrees
# does have this functionality.
estimate_mi_sklearn(df_pd, cols=features, target=target).limit(10)

feature,estimated_mi
str,f64
"""column_3""",0.494338
"""column_2""",0.274444
"""column_6""",0.218141
"""column_8""",0.089256
"""column_0""",0.087992
"""column_7""",0.064685
"""column_9""",0.063232
"""column_5""",0.054517
"""column_4""",0.027601
"""column_1""",0.005256


# MRMR Feature selection Strategy

In [22]:
from mrmr import mrmr_classif # This is currently the most starred MRMR Python package on github

In [23]:
# Need to wrap it so that we get apples to apples comparison
def mrmr_package(df:pd.DataFrame, target:str, k:int) -> list[str]:
    features = list(df.columns)
    features.remove(target)
    X = df[features]
    y = df[target]
    start = perf_counter()
    output = mrmr_classif(X, y, K = k)
    end = perf_counter()
    print(f"Spent {end - start:.2f}s to compute mrmr.")
    return output

In [24]:
mrmr_package(df_pd, "target", 50)

100%|██████████| 10/10 [00:02<00:00,  3.61it/s]

Spent 4.59s to compute mrmr.





['column_3',
 'column_6',
 'column_2',
 'column_7',
 'column_0',
 'column_9',
 'column_8',
 'column_5',
 'column_4',
 'column_1']

In [25]:
start = perf_counter()
res = fs.mrmr(df, target="target", k = 50, low_memory=False)
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing.")
res

Running fscore to determine feature relevance...
Found 10 total features to select from. Proceeding to select top 10 features.


MRMR, fscore: 100%|██████████| 10/10 [00:00<00:00, 1720.74it/s]

Output is sorted in order of selection (relevance).
Spent 0.03s in computing.





['column_3',
 'column_6',
 'column_2',
 'column_7',
 'column_0',
 'column_9',
 'column_8',
 'column_5',
 'column_4',
 'column_1']

In [26]:
start = perf_counter()
res = fs.mrmr(df, target="target", k = 50, low_memory=True)
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing.")
res

Running fscore to determine feature relevance...
Found 10 total features to select from. Proceeding to select top 10 features.


MRMR, fscore: 100%|██████████| 10/10 [00:00<00:00, 201.76it/s]

Output is sorted in order of selection (relevance).
Spent 0.06s in computing.





['column_3',
 'column_6',
 'column_2',
 'column_7',
 'column_0',
 'column_9',
 'column_8',
 'column_5',
 'column_4',
 'column_1']

# Power Transform

In [27]:
# Eager transform.
start = perf_counter()
res_eager = t.power_transform(df, cols=features, strategy="yeo_johnson")
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing.")
res_eager.head() 

Inferring best paramters: 100%|██████████| 10/10 [00:00<00:00, 19.09it/s]

Spent 0.55s in computing.





target,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,-1.522558,-0.9207,-0.466685,-0.355237,0.439938,0.733635,2.764655,0.146589,-1.418265,1.829884
1,1.415649,0.984821,-1.002934,0.8925,-0.833681,-0.366278,0.900587,1.89783,1.450042,-1.842743
0,-1.838329,1.800208,2.389305,-2.987505,1.123757,0.230044,-4.360761,-0.215723,-0.544952,-1.355781
0,-2.629427,0.039907,1.698894,-2.793563,0.838214,0.109178,-1.799121,0.222376,-0.888546,0.250375
1,0.875408,-0.274069,0.571497,0.321304,0.184512,-0.220648,-0.679376,-1.031075,0.869076,1.73066


In [28]:
from sklearn.preprocessing import power_transform

In [29]:
# Sklearn with Pandas

start = perf_counter()
transformed = power_transform(df_pd[features], method = "yeo-johnson", standardize=False)
end = perf_counter()
df_pd[features] = transformed
print(f"Spent {end - start:.2f}s in computing.")
df_pd.head()


Spent 0.89s in computing.


Unnamed: 0,target,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9
0,1,-1.522558,-0.9207,-0.466685,-0.355237,0.439938,0.733635,2.764655,0.146589,-1.418265,1.829884
1,1,1.415649,0.984821,-1.002934,0.8925,-0.833681,-0.366278,0.900587,1.89783,1.450042,-1.842743
2,0,-1.838329,1.800208,2.389305,-2.987505,1.123757,0.230044,-4.360761,-0.215723,-0.544952,-1.355781
3,0,-2.629427,0.039907,1.698894,-2.793563,0.838214,0.109178,-1.799121,0.222376,-0.888546,0.250375
4,1,0.875408,-0.274069,0.571497,0.321304,0.184512,-0.220648,-0.679376,-1.031075,0.869076,1.73066


# GC

In [30]:
import gc 
gc.collect()

194