In [1]:
import polars as pl
import dsds.fs as fs
from sklearn.datasets import make_classification

In [2]:
# Fixed seed so the synthetic dataset -- and every correlation / selection result derived
# from it further down -- is reproducible under Restart Kernel -> Run All.
SEED = 42
target = "target"

# Synthetic binary-classification problem: 10,000 rows x 500 features, of which 60 are
# informative and the remaining 440 are redundant (60 + 440 = 500, so no pure-noise columns).
orig_x, orig_y = make_classification(
    n_samples=10_000,
    n_features=500,
    n_informative=60,
    n_redundant=440,
    random_state=SEED,
)

# Feature columns get polars' default names (column_0 ... column_499); the label is
# inserted as the first column under the name held in `target`.
df = pl.from_numpy(orig_x).insert_at_idx(0, pl.Series(target, orig_y))

# Every column except the label, built without mutating a list in place.
features = [col for col in df.columns if col != target]

# Correlation Among Features

In [3]:
import dsds.prescreen as ps

In [4]:
# Preview, per feature, which other columns correlate with it above +0.5 or below -0.5.
(
    ps.infer_highly_correlated(df, threshold=0.5)
    .head()
)

features,corr > 0.50,corr < -0.50
str,list[str],list[str]
"""column_241""","[""column_357""]",[]
"""column_38""","[""column_361""]",[]
"""column_31""",[],"[""column_426""]"
"""column_260""","[""column_162""]",[]
"""column_462""",[],"[""column_355""]"


In [5]:
# Correlation of every feature with the target column, strongest positive correlation first.
(
    ps.corr_report(df, features, ["target"])
    .sort("target", descending=True)
    .head()
)

features,target
str,f64
"""column_84""",0.235109
"""column_446""",0.231165
"""column_475""",0.226831
"""column_144""",0.219128
"""column_296""",0.21785


# MRMR w/ Feature Relevance from LGBM

1. Good first option for models with fewer columns.

In [6]:
# MRMR with feature relevance taken from LGBM-based feature importance.
# The LGBM hyperparameters are suggested automatically by Optuna using a 30-trial tuning run.
# More user control over this tuning process will be added in the future.

# Positional arguments: the data frame, the name of the target column, and the number of
# features to select (50). The result is kept in `f1` so it can be compared with the
# F-score based selection later in the notebook.
f1 = fs.mrmr(
    df,
    "target",
    50,
    relevance="lgbm",
    mrmr_strategy="accum_corr"
)

INFO:dsds.fs:Running lgbm to determine feature relevance...
  from .autonotebook import tqdm as notebook_tqdm
[I 2023-10-21 15:52:59,297] A new study created in memory with name: no-name-d7991179-fb99-4f59-87f1-9b9ed4a1a836
[I 2023-10-21 15:53:04,265] Trial 2 finished with value: 0.3915189573229743 and parameters: {'max_depth': 2, 'num_iterations': 95, 'lambda_l1': 2.5330921736559406, 'lambda_l2': 0.0667369300771651, 'num_leaves': 246, 'feature_fraction': 0.7667114801160662, 'bagging_fraction': 0.6861971205162487, 'bagging_freq': 2, 'min_child_samples': 29}. Best is trial 2 with value: 0.3915189573229743.
[I 2023-10-21 15:53:06,336] Trial 1 finished with value: 0.2549140038621587 and parameters: {'max_depth': 4, 'num_iterations': 89, 'lambda_l1': 1.9924060584270356e-08, 'lambda_l2': 1.7043535867973099, 'num_leaves': 104, 'feature_fraction': 0.7800630956450376, 'bagging_fraction': 0.6203489992275716, 'bagging_freq': 4, 'min_child_samples': 70}. Best is trial 1 with value: 0.254914003862

Best params: {'max_depth': 15, 'num_iterations': 165, 'lambda_l1': 0.00045899233706944215, 'lambda_l2': 2.700749887572202e-06, 'num_leaves': 199, 'feature_fraction': 0.9458938205876221, 'bagging_fraction': 0.7458611228429407, 'bagging_freq': 2, 'min_child_samples': 64}.
Found at trial: 10.
Time took: 38s.


INFO:dsds.fs:Found 500 total features to select from. Proceeding to select top 50 features.
MRMR: 100%|██████████| 50/50 [00:01<00:00, 38.23it/s]


# MRMR w/ Feature Relevance From F-score

1. Super fast, but may suffer from the usual univariate feature selection problem: each feature is scored on its own, so interactions between features are ignored.

In [7]:
# MRMR with feature relevance taken from the F-score (relevance="f").
# The frame, target column, number of features (50) and MRMR strategy are the same as in
# the LGBM-based run, so `f1` and `f2` differ only in how relevance is measured.
f2 = fs.mrmr(
    df,
    "target",
    50,
    relevance="f",
    mrmr_strategy= "accum_corr"
)

INFO:dsds.fs:Running f to determine feature relevance...
INFO:dsds.fs:Found 500 total features to select from. Proceeding to select top 50 features.
MRMR: 100%|██████████| 50/50 [00:01<00:00, 38.17it/s]


# MRMR w/ Custom Score

1. Use `fs.mrmr_engine` and pass your own relevance scores through its `relevance` argument.

In [8]:
# Made-up (deterministic, not actually random) feature importance for demonstration:
# each feature's relevance is simply its position in `features`, so the last feature
# in the list receives the highest score.
relevance = {name: position for position, name in enumerate(features)}

In [9]:
# Run MRMR directly on the custom relevance mapping and select 50 features.
# Because relevance equals the feature's index, column_499 has the highest relevance and is
# expected to come out on top, followed by features in the 400s that are less correlated
# with the features already selected, and so on.
# verbose=True makes the engine log intermediate scores (see the INFO output of this cell).
fs.mrmr_engine(
    df,
    50,
    relevance=relevance,
    strategy="weighted_accum_corr",
    verbose = True 
)

INFO:dsds.fs:Found 500 total features to select from. Proceeding to select top 50 features.
INFO:dsds.fs:Round 2: The top 20 features, relative score, and the accumulated correlation are the following:
[('column_311', 586358.0420327485, 0.0005303926572267093), ('column_331', 179138.3996991347, 0.0018477333757358492), ('column_338', 167615.6924406437, 0.002016517636734359), ('column_330', 147210.92843257933, 0.0022416813990215113), ('column_387', 137475.34808832582, 0.0028150501554021047), ('column_317', 114249.5471788471, 0.0027746280648602117), ('column_470', 113132.33715116352, 0.004154426681488974), ('column_437', 112718.9968886523, 0.0038768975244845697), ('column_454', 86460.96192119618, 0.005250924693780216), ('column_378', 73179.52435753861, 0.005165379295896725), ('column_277', 66129.1779979393, 0.004188771256292219), ('column_293', 61694.10812286465, 0.004749237956669809), ('column_307', 58418.22062354865, 0.005255209705518605), ('column_333', 58193.15877925877, 0.005722322124

['column_499',
 'column_311',
 'column_276',
 'column_336',
 'column_496',
 'column_380',
 'column_452',
 'column_331',
 'column_471',
 'column_472',
 'column_396',
 'column_494',
 'column_405',
 'column_377',
 'column_454',
 'column_462',
 'column_408',
 'column_371',
 'column_460',
 'column_456',
 'column_440',
 'column_482',
 'column_475',
 'column_479',
 'column_483',
 'column_429',
 'column_493',
 'column_480',
 'column_469',
 'column_425',
 'column_406',
 'column_485',
 'column_464',
 'column_433',
 'column_455',
 'column_458',
 'column_423',
 'column_498',
 'column_484',
 'column_473',
 'column_486',
 'column_418',
 'column_487',
 'column_481',
 'column_495',
 'column_426',
 'column_491',
 'column_497',
 'column_434',
 'column_489']

In [11]:
import dsds.metrics as me

# Overlap between the LGBM-based selection (f1) and the F-score-based selection (f2),
# measured as Jaccard similarity.
me.jaccard_similarity(f1, f2, parallel=False)

0.19047619047619047