In [1]:
import numpy as np
import pandas as pd
import polars as pl
from sklearn.datasets import make_classification

import dsds.fs as fs
import dsds.metrics as me

In [2]:
# Build a synthetic classification dataset: 10,000 rows, 500 features
# (60 informative + 440 redundant).
# The seed is fixed so that Restart & Run All regenerates the same data.
# NOTE: this only pins the dataset; any randomness inside later model
# tuning is not controlled from here.
SEED = 42
target = "target"
orig_x, orig_y = make_classification(
    n_samples=10_000,
    n_features=500,
    n_informative=60,
    n_redundant=440,
    random_state=SEED,
)
# dsds relies heavily on Polars and expects a Polars dataframe, so any other
# dataframe format must be converted to Polars before being passed to dsds.
# The label column is inserted at position 0, in front of the feature columns.
df = pl.from_numpy(orig_x).insert_at_idx(0, pl.Series(target, orig_y))
# Pandas copy of the same data.
df_pd = df.to_pandas()
# Every column except the label is a candidate feature.
features = [col for col in df.columns if col != target]

In [3]:
# Model-based feature relevance.
# The LGBM hyperparameters are suggested automatically by Optuna through a
# 30-trial tuning run. More user control over this tuning process will be
# added in the future.
# Requests the top 50 features for the "target" column of df.
f1 = fs.mrmr(df, "target", 50, relevance="lgbm", mrmr_strategy="accum_corr")

INFO:dsds.fs:Running lgbm to determine feature relevance...
  from .autonotebook import tqdm as notebook_tqdm
[I 2023-10-20 23:37:29,161] A new study created in memory with name: no-name-708c78e4-d418-4f84-86ea-e27940a1dc8b
[I 2023-10-20 23:37:33,397] Trial 1 finished with value: 0.3953450255482715 and parameters: {'max_depth': 2, 'num_iterations': 96, 'lambda_l1': 0.007803892710811089, 'lambda_l2': 1.1557216581786632e-05, 'num_leaves': 243, 'feature_fraction': 0.9072374278648352, 'bagging_fraction': 0.6750668298315728, 'bagging_freq': 3, 'min_child_samples': 57}. Best is trial 1 with value: 0.3953450255482715.
[I 2023-10-20 23:37:33,874] Trial 3 finished with value: 0.37208456988288874 and parameters: {'max_depth': 4, 'num_iterations': 85, 'lambda_l1': 0.027679527229476256, 'lambda_l2': 0.010502806072540817, 'num_leaves': 5, 'feature_fraction': 0.5484625734994388, 'bagging_fraction': 0.5879998769630729, 'bagging_freq': 4, 'min_child_samples': 63}. Best is trial 3 with value: 0.3720845

Best params: {'max_depth': 16, 'num_iterations': 194, 'lambda_l1': 0.14040535233539092, 'lambda_l2': 3.102097013830383e-08, 'num_leaves': 86, 'feature_fraction': 0.9839214535525003, 'bagging_fraction': 0.8105623385618879, 'bagging_freq': 1, 'min_child_samples': 98}.
Found at trial: 26.
Time took: 26s.


INFO:dsds.fs:Found 500 total features to select from. Proceeding to select top 50 features.
MRMR: 100%|██████████| 50/50 [00:01<00:00, 38.69it/s]

Output is sorted in order of selection (max relevance min redundancy).





In [4]:
# Second MRMR selection on the same dataframe and target, again asking for the
# top 50 features, but with relevance="f" instead of the LGBM-based relevance.
# NOTE(review): "f" is presumably an F-statistic score - confirm in the dsds docs.
f2 = fs.mrmr(df, "target", 50, relevance="f", mrmr_strategy="accum_corr")

INFO:dsds.fs:Running f to determine feature relevance...
INFO:dsds.fs:Found 500 total features to select from. Proceeding to select top 50 features.
MRMR: 100%|██████████| 50/50 [00:01<00:00, 40.35it/s]

Output is sorted in order of selection (max relevance min redundancy).





In [5]:
# Compare the two selections: f1 (relevance="lgbm") vs. f2 (relevance="f").
# NOTE(review): presumably |A ∩ B| / |A ∪ B| over the selected features -
# confirm against the dsds.metrics documentation.
# `me` (dsds.metrics) is imported in the import cell at the top of the notebook.
me.jaccard_similarity(f1, f2)

0.26582278481012656