In [1]:
from time import perf_counter

import numpy as np
import pandas as pd
import polars as pl
from sklearn.datasets import make_classification

import dsds.fs as fs
import dsds.metrics as me

In [2]:
# Fixed seed so the synthetic dataset is identical on every Restart & Run All.
# NOTE(review): the LGBM/Optuna tuning inside fs.mrmr may carry its own randomness;
# confirm whether dsds exposes a seed for it before relying on exact reproducibility.
SEED = 42
# Name of the label column; defined first so it is reused instead of repeating the literal.
target = "target"

# 10,000 rows x 500 features: 60 informative + 440 redundant (linear combinations of the
# informative ones), which is the setting where redundancy-aware selection matters.
orig_x, orig_y = make_classification(
    n_samples=10_000,
    n_features=500,
    n_informative=60,
    n_redundant=440,
    random_state=SEED,
)
# This is a Polars dataframe, the dataframe type that dsds favors. dsds relies on Polars heavily.
# You must turn other dataframe formats into Polars for dsds to work.
# NOTE(review): `insert_at_idx` was renamed to `insert_column` in later Polars releases;
# confirm against the Polars version pinned for dsds before upgrading.
df = pl.from_numpy(orig_x).insert_at_idx(0, pl.Series(target, orig_y))
# Turn it into Pandas.
df_pd = df.to_pandas()
# Every column except the target is a candidate feature. Built without mutating a list
# in place, so re-running this cell always yields the same result.
features = [col for col in df.columns if col != target]

In [3]:
# Model-based feature importance: an LGBM model supplies the relevance scores.
# The LGBM hyperparameters are suggested automatically by Optuna using 30-trial tuning.
# More user control over this tuning process will be added in the future.
# Selects the top 50 features; the result is ordered by selection
# (max relevance, min redundancy).
f1 = fs.mrmr(df, "target", 50, relevance="lgbm", mrmr_strategy="accum_corr")

INFO:dsds.fs:Running lgbm to determine feature relevance...
  from .autonotebook import tqdm as notebook_tqdm
[I 2023-10-20 18:12:17,586] A new study created in memory with name: no-name-2b8de4b1-3ef2-462c-9531-9c87709eebe3
[I 2023-10-20 18:12:22,600] Trial 2 finished with value: 0.30813068239193947 and parameters: {'max_depth': 3, 'num_iterations': 70, 'lambda_l1': 4.3003037659047855e-06, 'lambda_l2': 5.132396374292552e-05, 'num_leaves': 28, 'feature_fraction': 0.5833776465828601, 'bagging_fraction': 0.8886136516425738, 'bagging_freq': 3, 'min_child_samples': 40}. Best is trial 2 with value: 0.30813068239193947.
[I 2023-10-20 18:12:22,604] Trial 0 finished with value: 0.2680084801253281 and parameters: {'max_depth': 3, 'num_iterations': 112, 'lambda_l1': 6.591353066809201, 'lambda_l2': 0.0005345753502586614, 'num_leaves': 118, 'feature_fraction': 0.7315192463335448, 'bagging_fraction': 0.8476437866032045, 'bagging_freq': 6, 'min_child_samples': 67}. Best is trial 0 with value: 0.26800

Best params: {'max_depth': 10, 'num_iterations': 197, 'lambda_l1': 0.00015708897675919718, 'lambda_l2': 4.749416557527707e-08, 'num_leaves': 56, 'feature_fraction': 0.7199045301990853, 'bagging_fraction': 0.6767599703172633, 'bagging_freq': 2, 'min_child_samples': 48}.
Found at trial: 28.
Time took: 0:00:24.446533


INFO:dsds.fs:Found 500 total features to select from. Proceeding to select top 50 features.
MRMR: 100%|██████████| 50/50 [00:01<00:00, 40.09it/s]

Output is sorted in order of selection (max relevance min redundancy).





In [6]:
# Second selection on the same data, again picking the top 50 features, but with
# relevance="f" instead of the LGBM model.
# NOTE(review): "f" presumably refers to an F-statistic based score; confirm in the dsds docs.
# No mrmr_strategy is passed here, so the library default applies.
f2 = fs.mrmr(df, "target", 50, relevance="f")

INFO:dsds.fs:Running f to determine feature relevance...
INFO:dsds.fs:Found 500 total features to select from. Proceeding to select top 50 features.
MRMR: 100%|██████████| 50/50 [00:01<00:00, 40.91it/s]

Output is sorted in order of selection (max relevance min redundancy).





In [7]:
# Compare the two selections (LGBM-based relevance vs. "f"-based relevance).
# `me` (dsds.metrics) is imported in the notebook's top import cell.
# NOTE(review): assuming the usual Jaccard definition |A ∩ B| / |A ∪ B| over the two
# selected-feature collections, the recorded value 0.1905 equals 16/84, i.e. it is
# consistent with 16 shared features out of 50 picked by each run. Confirm against dsds docs.
me.jaccard_similarity(f1, f2)

0.19047619047619047