In [1]:
from time import perf_counter

import numpy as np
import pandas as pd
import polars as pl
from sklearn.datasets import make_classification

import dsds.fs as fs
import dsds.metrics as me

In [2]:
# Fixed seed so the synthetic dataset is identical on every Restart & Run All.
# (The downstream Optuna/LGBM tuning may still carry its own randomness.)
SEED = 42

# Synthetic binary-classification data: 500 features, of which 60 are
# informative and 440 are linear combinations of the informative ones.
orig_x, orig_y = make_classification(
    n_samples=50_000,
    n_features=500,
    n_informative=60,
    n_redundant=440,
    random_state=SEED,
)

# dsds is built on Polars and expects Polars dataframes; convert any other
# dataframe format to Polars before passing it to dsds.
# NOTE(review): recent Polars releases deprecate `insert_at_idx` in favour of
# `insert_column` -- switch once the pinned Polars version supports it.
df = pl.from_numpy(orig_x).insert_at_idx(0, pl.Series("target", orig_y))

# Pandas copy of the same data.
df_pd = df.to_pandas()

target = "target"
# Every column except the target is a candidate feature.
features = [col for col in df.columns if col != target]

In [5]:
# Model-based feature importance.
# The LGBM hyperparameters are suggested automatically by Optuna using a
# 30-trial tuning run. More user control over this tuning process will be
# added in the future.

# MRMR selection of 50 features, with relevance scored by the "lgbm" strategy.
f1 = fs.mrmr(df, "target", 50, strategy="lgbm")

INFO:dsds.fs:Running lgbm to determine feature relevance...
[I 2023-10-16 02:30:58,446] A new study created in memory with name: no-name-6413e073-a1a9-493f-a9f4-19e7a259d42e
[I 2023-10-16 02:31:12,680] Trial 0 finished with value: 0.13441584666706316 and parameters: {'max_depth': 5, 'num_iterations': 163, 'lambda_l1': 0.0001983796110978997, 'lambda_l2': 3.041083014918056e-08, 'num_leaves': 18, 'feature_fraction': 0.9005068136092236, 'bagging_fraction': 0.4840227048354576, 'bagging_freq': 3, 'min_child_samples': 62}. Best is trial 0 with value: 0.13441584666706316.
[I 2023-10-16 02:31:12,865] Trial 3 finished with value: 0.13064693964665283 and parameters: {'max_depth': 6, 'num_iterations': 110, 'lambda_l1': 0.01660000124836358, 'lambda_l2': 4.6769465250198944e-06, 'num_leaves': 222, 'feature_fraction': 0.7241699886794994, 'bagging_fraction': 0.7888439559555096, 'bagging_freq': 4, 'min_child_samples': 9}. Best is trial 3 with value: 0.13064693964665283.
[I 2023-10-16 02:31:18,977] Trial

Best trial: FrozenTrial(number=13, state=1, values=[0.056804927116138174], datetime_start=datetime.datetime(2023, 10, 16, 2, 31, 19, 162552), datetime_complete=datetime.datetime(2023, 10, 16, 2, 33, 7, 755552), params={'max_depth': 13, 'num_iterations': 194, 'lambda_l1': 9.297739299900537e-08, 'lambda_l2': 0.000298387954914091, 'num_leaves': 125, 'feature_fraction': 0.8035751518124656, 'bagging_fraction': 0.44616390164319486, 'bagging_freq': 1, 'min_child_samples': 85}, user_attrs={}, system_attrs={}, intermediate_values={0: 0.653016007274746, 1: 0.6175564153723667, 2: 0.5846464014793729, 3: 0.5556292379692533, 4: 0.5298899579881154, 5: 0.5063739419653597, 6: 0.4857057164488975, 7: 0.46613448509237926, 8: 0.4466016349521028, 9: 0.4283005407534769, 10: 0.41184150465710206, 11: 0.3966881033942841, 12: 0.38263593485960706, 13: 0.369578828719885, 14: 0.3571040371091509, 15: 0.34510605921818627, 16: 0.3341457347279029, 17: 0.3232401052516767, 18: 0.31356291461771085, 19: 0.30394293892532875

INFO:dsds.fs:Found 500 total features to select from. Proceeding to select top 50 features.
MRMR, lgbm: 100%|██████████| 50/50 [00:01<00:00, 38.50it/s]

Output is sorted in order of selection (max relevance min redundancy).





In [6]:
# Same MRMR selection of 50 features, but relevance is scored by the "f"
# strategy (presumably an F-statistic based score -- confirm in the dsds docs).
f2 = fs.mrmr(df, "target", 50, strategy="f")

INFO:dsds.fs:Running f to determine feature relevance...
INFO:dsds.fs:Found 500 total features to select from. Proceeding to select top 50 features.
MRMR, f: 100%|██████████| 50/50 [00:01<00:00, 39.78it/s]

Output is sorted in order of selection (max relevance min redundancy).





In [7]:
# How much do the two selections agree? Jaccard similarity of the feature
# names chosen by the "lgbm" strategy (f1) and the "f" strategy (f2).
# `me` (dsds.metrics) is imported in the notebook's import cell.
me.jaccard_similarity(f1, f2, expected_dtype="str")

0.12359550561797752