In [1]:
import polars as pl
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from time import perf_counter

import dsds.transform as t
import dsds.fs as fs

from sklearn.preprocessing import power_transform
from sklearn.feature_selection import mutual_info_classif, f_classif, f_regression

In [2]:
# Synthetic benchmark data. The seed is fixed so that every Restart & Run All compares the
# packages on exactly the same dataset.
orig_x, orig_y = make_classification(
    n_samples = 100_000, n_features = 100, n_informative = 60, n_redundant = 40, random_state = 42
)
# This is a Polars dataframe, the format the dsds package favors. dsds relies on Polars heavily.
# You must turn other dataframe formats into Polars for dsds to work.
df = pl.from_numpy(orig_x).insert_at_idx(0, pl.Series("target", orig_y))
# Pandas version of the same data, used by the sklearn / mrmr baselines below.
df_pd = df.to_pandas()
target = "target"
# Every column except the target is treated as a feature.
features = [col for col in df.columns if col != target]

In [None]:
# Preview the first rows of the generated dataset (the target column was inserted at index 0).
df.head()

## Comparisons

This notebook compares the results and performance of the dsds package against sklearn and a few other packages, covering feature selection and other transformations that are common in data science pipelines.

### Methods Compared:
1. Fscore
2. Mutual Information Score
3. MRMR feature selection strategies
4. Power Transform

You may restart the kernel after each section, but remember to rerun the setup cells at the top of the notebook before continuing. If you are concerned about memory usage when running this notebook, go to the end and run the GC cell.

# Fscore

In [None]:
# dsds F-score on the Polars frame. Only the fs.f_classif call sits inside the timed region.
start = perf_counter()
res = fs.f_classif(df, target=target)
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing Fscore.")
# Show the first 10 rows of the result.
res.head(10)

In [None]:
# sklearn baseline: f_classif on the Pandas frame.
# Note that building the result DataFrame is also inside the timed region.
start = perf_counter()
f, pv = f_classif(df_pd[features], df_pd[target])
res = pd.DataFrame({"feature":features, "f_value":f, "p_value":pv})
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing Fscore.")
# Show the first 10 rows of the result.
res.head(10)

In [None]:
# sklearn baseline: f_regression on the same Pandas features / target.
# Building the result DataFrame is inside the timed region, as in the f_classif cell.
start = perf_counter()
f, pv = f_regression(df_pd[features], df_pd[target])
res = pd.DataFrame({"feature":features, "f_value":f, "p_value":pv})
end = perf_counter()
# Format spec is ":.2f" (no space flag) so the message matches every other timing cell.
print(f"Spent {end - start:.2f}s in computing Fscore.")
# Show the first 10 rows of the result.
res.head(10)

# Mutual Information Score

In [None]:
# Took 16s
# dsds mutual information with every feature passed as a continuous column;
# display the 10 rows with the highest estimate.
(
    fs.mutual_info(df, target=target, conti_cols=features)
    .sort(by="estimated_mi", descending=True)
    .limit(10)
)

In [None]:
def estimate_mi_sklearn(df:pd.DataFrame, cols:list[str], target:str, k=3, random_state:int=42):
    """Rank features by sklearn's mutual information estimate against `target`.

    Parameters
    ----------
    df
        Pandas dataframe holding both the feature columns and the target column.
    cols
        Names of the feature columns to score.
    target
        Name of the target column.
    k
        Forwarded to `mutual_info_classif` as `n_neighbors`.
    random_state
        Forwarded to `mutual_info_classif` as `random_state`.

    Returns
    -------
    A Polars dataframe with columns "feature" and "estimated_mi", sorted by
    "estimated_mi" in descending order.
    """
    # All columns are scored as continuous features (discrete_features=False).
    scores = mutual_info_classif(
        df[cols],
        df[target],
        n_neighbors=k,
        random_state=random_state,
        discrete_features=False,
    )
    ranking = pl.from_records([cols, scores], schema=["feature", "estimated_mi"])
    return ranking.sort("estimated_mi", descending=True)

In [None]:
# Took 1m38s using sklearn. The author attributes this to sklearn's implementation not turning on
# multithreading for its KD-trees, and not providing an option to turn it on, even though
# sklearn's KD-trees do have this functionality.
# NOTE(review): newer scikit-learn releases appear to accept an n_jobs argument in
# mutual_info_classif — confirm against the installed version before relying on this claim.
estimate_mi_sklearn(df_pd, cols=features, target=target).limit(10)

# MRMR Feature Selection Strategy

In [None]:
from mrmr import mrmr_classif # This is currently the most starred MRMR Python package on github

In [None]:
# Wrapper around the mrmr package so that we get an apples-to-apples comparison
def mrmr_package(df:pd.DataFrame, target:str, k:int) -> list[str]:
    """Run `mrmr_classif` on every non-target column of `df` and report the elapsed time.

    Parameters
    ----------
    df
        Pandas dataframe containing the features and the target column.
    target
        Name of the target column; it must be present in `df`.
    k
        Forwarded to `mrmr_classif` as `K`.

    Returns
    -------
    Whatever `mrmr_classif` returns.
    """
    candidate_cols = list(df.columns)
    candidate_cols.remove(target)
    # Slice the inputs before starting the clock: only the mrmr_classif call is timed.
    predictors = df[candidate_cols]
    labels = df[target]
    start = perf_counter()
    selected = mrmr_classif(predictors, labels, K = k)
    end = perf_counter()
    print(f"Spent {end - start:.2f}s to compute mrmr.")
    return selected

In [None]:
# Baseline: run the mrmr package on the Pandas frame with k=50 (the wrapper prints the elapsed time).
mrmr_package(df_pd, "target", 50)

In [None]:
# dsds MRMR on the Polars frame with low_memory=False; only the fs.mrmr call is timed.
start = perf_counter()
res = fs.mrmr(df, target="target", k = 50, low_memory=False)
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing.")
res

In [None]:
# Same dsds MRMR call as the previous cell, but with low_memory=True, to compare the two modes.
start = perf_counter()
res = fs.mrmr(df, target="target", k = 50, low_memory=True)
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing.")
res

# Power Transform

In [None]:
# Eager transform.
# dsds power transform with the "yeo_johnson" strategy on the Polars frame;
# only the t.power_transform call is inside the timed region.
start = perf_counter()
res_eager = t.power_transform(df, target = target, strategy="yeo_johnson")
end = perf_counter()
print(f"Spent {end - start:.2f}s in computing.")
res_eager.head() 

In [None]:
# Sklearn with Pandas

# In this case, the benefits of using dsds are:
# 1. Much better performance.
# 2. No extra steps are needed to put the data back in dataframe format (with sklearn the
# transformed values have to be reassembled into a dataframe by hand, as done below).
# 3. Dsds can save the plan using LazyFrame.blueprint.preserve_as_pickle()... But in Pandas,
# you have to use the PowerTransformer class, which makes code longer and uglier (and less organized)

start = perf_counter()
transformed = power_transform(df_pd[features], method = "yeo-johnson", standardize=False)
end = perf_counter()
# Assemble the result in a NEW frame instead of overwriting df_pd. Writing back into df_pd made
# this cell non-idempotent (a re-run would transform already-transformed data) and silently
# changed the input of every earlier sklearn / mrmr cell that might be re-run afterwards.
# As before, this reassembly step is kept outside the timed region.
res_pd = df_pd[[target]].join(pd.DataFrame(transformed, columns=features, index=df_pd.index))
print(f"Spent {end - start:.2f}s in computing.")
res_pd.head()


# GC

In [None]:
# Optional cleanup cell: force a garbage-collection pass if memory usage is a concern.
# The displayed cell output is the return value of gc.collect().
import gc 
gc.collect()