In [1]:
import polars as pl 
import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import f_regression
import pandas as pd
from time import perf_counter
import sys
sys.path.append('../src')
from eda.eda_selection import _f_score, mrmr, MRMR_STRATEGY, knock_out_mrmr

In [2]:
# Create synthetic classification data.
# A fixed random_state keeps the generated data -- and therefore every feature
# selection below -- identical across kernel restarts.
orig_x, orig_y = make_classification(n_samples = 600_000, n_features = 100, n_informative = 20, n_redundant = 80, random_state = 42)

In [3]:
# Build the Polars frame from the feature matrix and insert the labels as the
# first column, named "target". df_pd is a pandas copy of the same data for the
# pandas-based implementations below.
df_pl = pl.from_numpy(orig_x).insert_at_idx(0, pl.Series("target", orig_y))
df_pd = df_pl.to_pandas()

In [4]:
def mrmr_my_rewrite(df:pl.DataFrame, target:str, k:int) -> list[str]:
    """Greedy MRMR feature selection that keeps the data in Polars.

    Relevance of a feature is its score from ``_f_score``. Redundancy is the
    running mean of the absolute correlation between the candidate and the
    features selected so far. Each round picks the not-yet-selected feature
    with the largest relevance / redundancy ratio. The time spent in each
    phase is printed.

    Args:
        df: Frame holding the target column and the candidate feature columns.
        target: Name of the target column; every other column is a candidate.
        k: Number of features requested.

    Returns:
        Feature names in selection order. At most ``min(k, number of
        candidates)`` names are returned; the list is empty when ``k <= 0`` or
        there are no candidate features.

    Raises:
        ValueError: If ``target`` is not a column of ``df``.
    """
    features = list(df.columns)
    features.remove(target)

    # Never run more rounds than there are candidates: once everything has been
    # picked there is nothing left to score and no valid index to append.
    n_select = min(k, len(features))
    if n_select <= 0:
        return []

    start = perf_counter()
    # Assumes _f_score returns one score per entry of `features`, in order.
    f_scores = _f_score(df, target, features)
    end_1 = perf_counter()
    print(f"Spent {end_1 - start:.2f}s to compute f score.")

    # Standardize every feature so the mean of a product of two columns is their correlation.
    df_scaled = df.select(features).with_columns(
        (pl.col(f) - pl.col(f).mean())/pl.col(f).std() for f in features
    )

    cumulating_sum = np.zeros(len(features)) # For each feature at index i, we keep a cumulating sum
    is_selected = np.zeros(len(features), dtype=bool) # O(1) membership test by index
    top_idx = int(np.argmax(f_scores))
    is_selected[top_idx] = True
    selected_features = [features[top_idx]]
    for j in range(1, n_select):
        argmax = -1
        current_max = -1
        # Fetch the most recently selected column once per round, not once per candidate.
        last_selected = df_scaled.get_column(selected_features[-1])
        for i, f in enumerate(features):
            if is_selected[i]:
                continue
            # Left = cumulating sum of abs corr
            # Right = abs correlation btw last_selected and f
            cumulating_sum[i] += np.abs((last_selected * df_scaled.get_column(f)).mean())
            denominator = cumulating_sum[i] / j
            new_score = f_scores[i] / denominator
            if new_score > current_max:
                current_max = new_score
                argmax = i

        if argmax < 0:
            # No candidate produced a comparable score (e.g. all NaN); stop
            # instead of indexing features[-1] by accident.
            break

        is_selected[argmax] = True
        selected_features.append(features[argmax])

    end_2 = perf_counter()
    print(f"Took {end_2 - end_1:.2f}s to compute mrmr.")
    return selected_features
    

In [5]:
def mrmr_np_rewrite(df:pl.DataFrame, target:str, k:int) -> list[str]:
    """Greedy MRMR feature selection that copies the features to NumPy.

    Relevance of a feature is its score from ``_f_score``. Redundancy is the
    running mean of the absolute correlation between the candidate and the
    features selected so far, computed on a standardized NumPy copy of the
    feature columns. Each round picks the not-yet-selected feature with the
    largest relevance / redundancy ratio. The time spent in each phase is
    printed.

    Args:
        df: Frame holding the target column and the candidate feature columns.
        target: Name of the target column; every other column is a candidate.
        k: Number of features requested.

    Returns:
        Feature names in selection order. At most ``min(k, number of
        candidates)`` names are returned; the list is empty when ``k <= 0`` or
        there are no candidate features.

    Raises:
        ValueError: If ``target`` is not a column of ``df``.
    """
    features = list(df.columns)
    features.remove(target)

    # Never run more rounds than there are candidates: once everything has been
    # picked there is nothing left to score and no valid index to append.
    n_select = min(k, len(features))
    if n_select <= 0:
        return []

    start = perf_counter()
    # Assumes _f_score returns one score per entry of `features`, in order.
    f_scores = _f_score(df, target, features)
    end_1 = perf_counter()
    print(f"Spent {end_1 - start:.2f}s to compute f score.")

    # Standardize column-wise so the mean of a product of two columns is their correlation.
    x = df.select(features).to_numpy()
    x_scaled = (x - x.mean(axis=0)) / x.std(axis=0)

    cumulating_sum = np.zeros(len(features)) # For each feature at index i, we keep a cumulating sum
    is_selected = np.zeros(len(features), dtype=bool) # O(1) membership test by index
    top_idx = int(np.argmax(f_scores))
    is_selected[top_idx] = True
    selected_features = [features[top_idx]]
    last_selected_idx = top_idx
    for j in range(1, n_select):
        argmax = -1
        current_max = -1
        # Slice the most recently selected column once per round, not once per candidate.
        last_selected = x_scaled[:, last_selected_idx]
        for i in range(len(features)):
            if is_selected[i]:
                continue
            # Left = cumulating sum of abs corr
            # Right = abs correlation btw last_selected and feature i
            cumulating_sum[i] += np.abs(np.mean(last_selected * x_scaled[:, i]))
            denominator = cumulating_sum[i] / j
            new_score = f_scores[i] / denominator
            if new_score > current_max:
                current_max = new_score
                argmax = i

        if argmax < 0:
            # No candidate produced a comparable score (e.g. all NaN); stop
            # instead of indexing features[-1] by accident.
            break

        is_selected[argmax] = True
        selected_features.append(features[argmax])
        last_selected_idx = argmax

    end_2 = perf_counter()
    print(f"Took {end_2 - end_1:.2f}s to compute mrmr.")
    return selected_features

In [6]:
def mrmr_medium(df:pd.DataFrame, target:str, k:int) -> list[str]:
    """Pandas baseline for greedy MRMR selection (F-statistic / mean abs correlation).

    Relevance is the F-statistic from ``f_regression``. Redundancy is the mean
    absolute correlation between a candidate and the already selected
    features, with each correlation floored at 1e-5. Each round picks the
    not-yet-selected feature with the largest ratio. The time spent in each
    phase is printed.

    Args:
        df: Frame holding the target column and the candidate feature columns.
        target: Name of the target column; every other column is a candidate.
        k: Number of features requested.

    Returns:
        Feature names in selection order. At most ``min(k, number of
        candidates)`` names are returned; the list is empty when ``k <= 0``.

    Raises:
        ValueError: If ``target`` is not a column of ``df``.
    """
    features = list(df.columns)
    features.remove(target)

    X = df[features]
    y = df[target]

    # compute F-statistics and initialize correlation matrix
    start = perf_counter()
    F = pd.Series(f_regression(X, y)[0], index = X.columns)
    end_1 = perf_counter()
    print(f"Spent {end_1 - start:.2f}s to compute f score.")

    corr = pd.DataFrame(.00001, index = X.columns, columns = X.columns)

    # initialize list of selected features and list of excluded features
    selected = []
    not_selected = X.columns.to_list()

    # Never run more rounds than there are candidates: with nothing left to
    # score, `score` would be empty and argmax would raise.
    n_rounds = min(k, len(not_selected))
    for i in range(n_rounds):

        # compute (absolute) correlations between the last selected feature and all the (currently) excluded features
        if i > 0:
            last_selected = selected[-1]
            corr.loc[not_selected, last_selected] = X[not_selected].corrwith(X[last_selected]).abs().clip(.00001)

        # compute FCQ score for all the (currently) excluded features (this is Formula 2)
        # In the first round `selected` is empty, so the mean is NaN and fillna supplies the floor.
        score = F.loc[not_selected] / corr.loc[not_selected, selected].mean(axis = 1).fillna(.00001)

        # find best feature, add it to selected and remove it from not_selected
        best = score.index[score.argmax()]
        selected.append(best)
        not_selected.remove(best)

    end_2 = perf_counter()
    print(f"Took {end_2 - end_1:.2f}s to compute mrmr.")
    return selected

In [7]:
# Polars-based rewrite: select 50 features; the function prints its own timings.
mrmr_my_rewrite(df_pl, "target", 50)

Spent 0.07s to compute f score.
Took 1.49s to compute mrmr.


['column_82',
 'column_16',
 'column_84',
 'column_14',
 'column_26',
 'column_36',
 'column_38',
 'column_23',
 'column_70',
 'column_41',
 'column_94',
 'column_3',
 'column_7',
 'column_57',
 'column_44',
 'column_76',
 'column_42',
 'column_98',
 'column_71',
 'column_92',
 'column_62',
 'column_91',
 'column_56',
 'column_96',
 'column_79',
 'column_97',
 'column_40',
 'column_17',
 'column_27',
 'column_99',
 'column_30',
 'column_66',
 'column_2',
 'column_33',
 'column_1',
 'column_24',
 'column_58',
 'column_80',
 'column_72',
 'column_47',
 'column_59',
 'column_52',
 'column_50',
 'column_9',
 'column_37',
 'column_12',
 'column_45',
 'column_68',
 'column_25',
 'column_77']

In [8]:
# Use my version of the f_score computation,
# then pure NumPy for the rest of the computation.
# This shows that copying to NumPy does not pay off; staying in Polars is the better idea.
mrmr_np_rewrite(df_pl, "target", 50)

Spent 0.07s to compute f score.
Took 5.50s to compute mrmr.


['column_82',
 'column_16',
 'column_84',
 'column_14',
 'column_26',
 'column_36',
 'column_38',
 'column_23',
 'column_70',
 'column_41',
 'column_94',
 'column_3',
 'column_7',
 'column_57',
 'column_44',
 'column_76',
 'column_42',
 'column_98',
 'column_71',
 'column_92',
 'column_62',
 'column_91',
 'column_56',
 'column_96',
 'column_79',
 'column_97',
 'column_40',
 'column_17',
 'column_27',
 'column_99',
 'column_30',
 'column_66',
 'column_2',
 'column_33',
 'column_1',
 'column_24',
 'column_58',
 'column_80',
 'column_72',
 'column_47',
 'column_59',
 'column_52',
 'column_50',
 'column_9',
 'column_37',
 'column_12',
 'column_45',
 'column_68',
 'column_25',
 'column_77']

In [9]:
# Pandas baseline on the pandas copy of the same data, for comparison.
mrmr_medium(df_pd, "target", 50)

Spent 0.17s to compute f score.
Took 25.46s to compute mrmr.


['column_82',
 'column_16',
 'column_84',
 'column_14',
 'column_26',
 'column_36',
 'column_38',
 'column_23',
 'column_70',
 'column_41',
 'column_94',
 'column_3',
 'column_7',
 'column_57',
 'column_44',
 'column_76',
 'column_42',
 'column_98',
 'column_71',
 'column_92',
 'column_62',
 'column_91',
 'column_56',
 'column_96',
 'column_79',
 'column_97',
 'column_40',
 'column_17',
 'column_27',
 'column_99',
 'column_30',
 'column_66',
 'column_2',
 'column_33',
 'column_1',
 'column_24',
 'column_58',
 'column_80',
 'column_72',
 'column_47',
 'column_59',
 'column_52',
 'column_50',
 'column_9',
 'column_37',
 'column_12',
 'column_45',
 'column_68',
 'column_25',
 'column_77']

In [10]:
# mrmr from my yet-to-be-published package, called without a strategy argument.
# Author's note: 9.2s -- presumably the cell's wall time on an earlier run; TODO confirm.
mrmr(df_pl, target="target", k = 50)

Running F-score to determine feature relevance...
Top 5 feature importance by MRMR_STRATEGY.F_SCORE is:
shape: (5, 2)
┌───────────┬───────────────────────┐
│ feature   ┆ MRMR_STRATEGY.F_SCORE │
│ ---       ┆ ---                   │
│ str       ┆ f64                   │
╞═══════════╪═══════════════════════╡
│ column_82 ┆ 109313.720853         │
│ column_14 ┆ 105998.288316         │
│ column_84 ┆ 84413.59341           │
│ column_26 ┆ 83190.662637          │
│ column_70 ┆ 83042.301099          │
└───────────┴───────────────────────┘
Found 100 total features to select from. Proceeding to select top 50 features.


100%|██████████| 50/50 [00:01<00:00, 35.06it/s]


mrmr_rank,feature
i64,str
1,"""column_82"""
2,"""column_16"""
3,"""column_84"""
4,"""column_14"""
5,"""column_26"""
6,"""column_36"""
7,"""column_38"""
8,"""column_23"""
9,"""column_70"""
10,"""column_41"""


In [11]:
# Low-memory mode tries to reduce memory usage during the selection process.
# Even in low-memory mode, this is faster than the other packages out there.
mrmr(df_pl, target="target", k = 50, low_memory=True)

Running F-score to determine feature relevance...
Top 5 feature importance by MRMR_STRATEGY.F_SCORE is:
shape: (5, 2)
┌───────────┬───────────────────────┐
│ feature   ┆ MRMR_STRATEGY.F_SCORE │
│ ---       ┆ ---                   │
│ str       ┆ f64                   │
╞═══════════╪═══════════════════════╡
│ column_82 ┆ 109313.720853         │
│ column_14 ┆ 105998.288316         │
│ column_84 ┆ 84413.59341           │
│ column_26 ┆ 83190.662637          │
│ column_70 ┆ 83042.301099          │
└───────────┴───────────────────────┘
Found 100 total features to select from. Proceeding to select top 50 features.


100%|██████████| 50/50 [00:10<00:00,  4.55it/s]


mrmr_rank,feature
i64,str
1,"""column_82"""
2,"""column_16"""
3,"""column_84"""
4,"""column_14"""
5,"""column_26"""
6,"""column_36"""
7,"""column_38"""
8,"""column_23"""
9,"""column_70"""
10,"""column_41"""


In [12]:
from mrmr import mrmr_classif

In [13]:
#
def mrmr_package(df:pd.DataFrame, target:str, k:int) -> list[str]:
    """Time ``mrmr_classif`` on ``df`` and return whatever it returns.

    Every column except ``target`` is passed as a predictor, ``df[target]`` is
    passed as the response and ``k`` is forwarded as ``K``. The elapsed time of
    the call is printed.
    """
    predictor_names = list(df.columns)
    predictor_names.remove(target)

    predictors = df[predictor_names]
    response = df[target]

    t0 = perf_counter()
    result = mrmr_classif(predictors, response, K = k)
    elapsed = perf_counter() - t0

    print(f"Spent {elapsed:.2f}s to compute mrmr.")
    return result

In [14]:
# Third-party mrmr_classif (via the timing wrapper) on the pandas copy of the same data.
mrmr_package(df_pd, "target", 50)

100%|██████████| 50/50 [00:34<00:00,  1.46it/s]

Spent 38.50s to compute mrmr.





['column_82',
 'column_16',
 'column_84',
 'column_14',
 'column_26',
 'column_36',
 'column_38',
 'column_23',
 'column_70',
 'column_41',
 'column_94',
 'column_3',
 'column_7',
 'column_57',
 'column_44',
 'column_76',
 'column_42',
 'column_98',
 'column_71',
 'column_92',
 'column_62',
 'column_91',
 'column_56',
 'column_96',
 'column_79',
 'column_97',
 'column_40',
 'column_17',
 'column_27',
 'column_99',
 'column_30',
 'column_66',
 'column_2',
 'column_33',
 'column_1',
 'column_24',
 'column_58',
 'column_80',
 'column_72',
 'column_47',
 'column_59',
 'column_52',
 'column_50',
 'column_9',
 'column_37',
 'column_12',
 'column_45',
 'column_68',
 'column_25',
 'column_77']

# Demo of MRMR from my Package.

In [15]:
# No num_cols is provided, so numerical columns are detected and used automatically.
# Relevance comes from the RF strategy; params carries the model settings.
mrmr(df_pl, target="target", k=20, strategy=MRMR_STRATEGY.RF, params={"n_estimators":10, "max_depth":5, "n_jobs":-1})

Running Random Forest to determine feature relevance...
Random forest is not deterministic by default. Results may vary.
Top 5 feature importance by MRMR_STRATEGY.RF is:
shape: (5, 2)
┌───────────┬──────────────────┐
│ feature   ┆ MRMR_STRATEGY.RF │
│ ---       ┆ ---              │
│ str       ┆ f64              │
╞═══════════╪══════════════════╡
│ column_82 ┆ 0.146003         │
│ column_14 ┆ 0.074457         │
│ column_70 ┆ 0.063351         │
│ column_84 ┆ 0.044327         │
│ column_94 ┆ 0.043542         │
└───────────┴──────────────────┘
Found 100 total features to select from. Proceeding to select top 20 features.


100%|██████████| 20/20 [00:00<00:00, 28.88it/s]


mrmr_rank,feature
i64,str
1,"""column_82"""
2,"""column_87"""
3,"""column_44"""
4,"""column_84"""
5,"""column_14"""
6,"""column_96"""
7,"""column_70"""
8,"""column_26"""
9,"""column_23"""
10,"""column_7"""


In [16]:
# Same selection, with relevance taken from the XGB strategy.
mrmr(df_pl, target="target", k=20, strategy=MRMR_STRATEGY.XGB, params={"n_estimators":10, "max_depth":5})

Running XGBoost to determine feature relevance...
XGB is not deterministic by default. Results may vary.
Top 5 feature importance by MRMR_STRATEGY.XGB is:
shape: (5, 2)
┌───────────┬───────────────────┐
│ feature   ┆ MRMR_STRATEGY.XGB │
│ ---       ┆ ---               │
│ str       ┆ f64               │
╞═══════════╪═══════════════════╡
│ column_84 ┆ 0.086082          │
│ column_82 ┆ 0.082796          │
│ column_50 ┆ 0.04561           │
│ column_59 ┆ 0.043795          │
│ column_35 ┆ 0.040338          │
└───────────┴───────────────────┘
Found 100 total features to select from. Proceeding to select top 20 features.


100%|██████████| 20/20 [00:00<00:00, 28.88it/s]


mrmr_rank,feature
i64,str
1,"""column_84"""
2,"""column_35"""
3,"""column_0"""
4,"""column_44"""
5,"""column_82"""
6,"""column_50"""
7,"""column_21"""
8,"""column_87"""
9,"""column_59"""
10,"""column_16"""


In [17]:
# Same selection, with relevance taken from the MIS strategy.
mrmr(df_pl, target="target", k=20, strategy=MRMR_STRATEGY.MIS)

Running Mutual Information Score to determine feature relevance...


100%|██████████| 100/100 [00:48<00:00,  2.06it/s]


Top 5 feature importance by MRMR_STRATEGY.MIS is:
shape: (5, 2)
┌───────────┬───────────────────┐
│ feature   ┆ MRMR_STRATEGY.MIS │
│ ---       ┆ ---               │
│ str       ┆ f64               │
╞═══════════╪═══════════════════╡
│ column_82 ┆ 0.091703          │
│ column_14 ┆ 0.081395          │
│ column_84 ┆ 0.07264           │
│ column_70 ┆ 0.066328          │
│ column_26 ┆ 0.065226          │
└───────────┴───────────────────┘
Found 100 total features to select from. Proceeding to select top 20 features.


100%|██████████| 20/20 [00:00<00:00, 29.72it/s]


mrmr_rank,feature
i64,str
1,"""column_82"""
2,"""column_16"""
3,"""column_46"""
4,"""column_94"""
5,"""column_36"""
6,"""column_14"""
7,"""column_96"""
8,"""column_26"""
9,"""column_38"""
10,"""column_84"""


In [18]:
# Knock-out variant, inspired by Featurewiz's SULOV.
# Per its own log message, it may stop before finding k features.
knock_out_mrmr(df_pl, target="target", k=20, strategy=MRMR_STRATEGY.F_SCORE)

Running F-score to determine feature relevance...
Top 5 feature importance by MRMR_STRATEGY.F_SCORE is:
shape: (5, 2)
┌───────────┬───────────────────────┐
│ feature   ┆ MRMR_STRATEGY.F_SCORE │
│ ---       ┆ ---                   │
│ str       ┆ f64                   │
╞═══════════╪═══════════════════════╡
│ column_82 ┆ 109313.720853         │
│ column_14 ┆ 105998.288316         │
│ column_84 ┆ 84413.59341           │
│ column_26 ┆ 83190.662637          │
│ column_70 ┆ 83042.301099          │
└───────────┴───────────────────────┘
Found 100 total features to select from. Proceeding to select top 20 features.
For knock out MRMR, it is possible for the process to end without finding enough variables. Try making the correlation threshold higher to make more variables qualify.


100%|██████████| 20/20 [00:00<?, ?it/s]


mrmr_rank,feature
i64,str
1,"""column_82"""
2,"""column_14"""
3,"""column_84"""
4,"""column_26"""
5,"""column_70"""
6,"""column_36"""
7,"""column_38"""
8,"""column_23"""
9,"""column_94"""
10,"""column_57"""
