In [1]:
import polars as pl 
import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import f_regression
import pandas as pd
from time import perf_counter
import sys
sys.path.append('../src')
from eda.eda_selection import _f_score, mrmr, MRMR_STRATEGY, knock_out_mrmr

In [2]:
# create some data
# A fixed random_state keeps the synthetic data set (and therefore every feature
# ranking produced further down) identical across "Restart & Run All".
orig_x, orig_y = make_classification(n_samples = 500_000, n_features = 100, n_informative = 20, n_redundant = 80, random_state = 42)

In [3]:
# Build the Polars frame from the feature matrix and insert the labels as the
# first column, named "target".
df_pl = pl.from_numpy(orig_x).insert_at_idx(0, pl.Series("target", orig_y))
# pandas copy of the same data, used by the pandas-based implementations below.
df_pd = df_pl.to_pandas()

In [4]:
# The raw NumPy arrays are not referenced again in the cells shown here;
# drop the names so only df_pl / df_pd remain.
del orig_x
del orig_y

In [5]:
def mrmr_my_rewrite(df:pl.DataFrame, target:str, k:int) -> list[str]:
    """Select up to ``k`` features with a greedy MRMR loop that stays in Polars.

    Relevance of every candidate comes from ``_f_score(df, target, features)``.
    The candidate with the highest relevance is picked first; every later pick
    maximises ``relevance / mean(|corr| with the features picked so far)``,
    where the correlation is computed as the mean of the product of two
    standardised columns.

    Parameters
    ----------
    df : pl.DataFrame
        Frame holding the target column and the candidates. Every column other
        than ``target`` is treated as a candidate.
    target : str
        Name of the target column. ``ValueError`` is raised (by ``list.remove``)
        when it is not a column of ``df``.
    k : int
        Number of features requested. Values larger than the number of
        candidates are capped; values <= 0 give an empty list.

    Returns
    -------
    list[str]
        Feature names in order of selection, without duplicates. The list can
        be shorter than ``k`` when no remaining candidate has a comparable
        (non-NaN) score.
    """
    features = list(df.columns)
    features.remove(target)

    start = perf_counter()
    f_scores = _f_score(df, target, features)
    end_1 = perf_counter()
    print(f"Spent {end_1 - start:.2f}s to compute f score.")

    # Floor for a single absolute correlation: avoids dividing the relevance by
    # zero (same value as the clip used in mrmr_medium).
    min_abs_corr = 1e-5
    # Asking for more features than exist used to append duplicates.
    n_select = max(0, min(k, len(features)))

    selected_features: list[str] = []
    if n_select > 0:
        df_scaled = df.select(features).with_columns(
            (pl.col(f) - pl.col(f).mean())/pl.col(f).std() for f in features
        )

        # cumulating_sum[i] = sum of |corr(feature i, s)| over the selected s so far.
        cumulating_sum = np.zeros(len(features))
        # Boolean mask instead of scanning the list of selected names.
        is_selected = np.zeros(len(features), dtype=bool)

        last_idx = int(np.argmax(f_scores))
        selected_features.append(features[last_idx])
        is_selected[last_idx] = True

        for j in range(1, n_select):
            # Only the newest pick adds a new correlation term; fetch it once.
            last_col = df_scaled.get_column(features[last_idx])
            best_idx = -1
            best_score = -np.inf
            for i, f in enumerate(features):
                if is_selected[i]:
                    continue
                abs_corr = np.abs((last_col * df_scaled.get_column(f)).mean())
                # np.maximum propagates NaN, so a NaN correlation still yields
                # a NaN score that can never win the comparison below.
                cumulating_sum[i] += np.maximum(abs_corr, min_abs_corr)
                new_score = f_scores[i] / (cumulating_sum[i] / j)
                if new_score > best_score:
                    best_score = new_score
                    best_idx = i

            if best_idx < 0:
                # Every remaining score is NaN: stop instead of appending a
                # bogus (possibly duplicate) feature.
                break

            selected_features.append(features[best_idx])
            is_selected[best_idx] = True
            last_idx = best_idx

    end_2 = perf_counter()
    print(f"Took {end_2 - end_1:.2f}s to compute mrmr.")
    return selected_features
    

In [6]:
def mrmr_np_rewrite(df:pl.DataFrame, target:str, k:int) -> list[str]:
    """Select up to ``k`` features with a greedy MRMR loop running on NumPy arrays.

    Relevance still comes from ``_f_score(df, target, features)``; only the
    redundancy part is done in NumPy. The candidate columns are copied into one
    array and standardised with ``mean``/``std`` along axis 0. Each pick after
    the first maximises ``relevance / mean(|corr| with the features picked so
    far)``, the correlation being the mean of the product of two standardised
    columns.

    The per-feature Python loop is kept on purpose so that the timing stays
    comparable with the Polars version of the same algorithm.

    Parameters
    ----------
    df : pl.DataFrame
        Frame holding the target column and the candidates. Every column other
        than ``target`` is treated as a candidate.
    target : str
        Name of the target column. ``ValueError`` is raised (by ``list.remove``)
        when it is not a column of ``df``.
    k : int
        Number of features requested. Values larger than the number of
        candidates are capped; values <= 0 give an empty list.

    Returns
    -------
    list[str]
        Feature names in order of selection, without duplicates. The list can
        be shorter than ``k`` when no remaining candidate has a comparable
        (non-NaN) score.
    """
    features = list(df.columns)
    features.remove(target)

    start = perf_counter()
    f_scores = _f_score(df, target, features)
    end_1 = perf_counter()
    print(f"Spent {end_1 - start:.2f}s to compute f score.")

    # Floor for a single absolute correlation: avoids dividing the relevance by
    # zero (same value as the clip used in mrmr_medium).
    min_abs_corr = 1e-5
    # Asking for more features than exist used to append duplicates.
    n_select = max(0, min(k, len(features)))

    selected_features: list[str] = []
    if n_select > 0:
        x = df.select(features).to_numpy()
        x_scaled = (x - x.mean(axis=0)) / x.std(axis=0)

        # cumulating_sum[i] = sum of |corr(feature i, s)| over the selected s so far.
        cumulating_sum = np.zeros(len(features))
        # Boolean mask instead of scanning the list of selected names.
        is_selected = np.zeros(len(features), dtype=bool)

        last_selected_idx = int(np.argmax(f_scores))
        selected_features.append(features[last_selected_idx])
        is_selected[last_selected_idx] = True

        for j in range(1, n_select):
            # Only the newest pick adds a new correlation term; slice it once.
            last_col = x_scaled[:, last_selected_idx]
            best_idx = -1
            best_score = -np.inf
            for i in range(len(features)):
                if is_selected[i]:
                    continue
                abs_corr = np.abs(np.mean(last_col * x_scaled[:, i]))
                # np.maximum propagates NaN, so a NaN correlation still yields
                # a NaN score that can never win the comparison below.
                cumulating_sum[i] += np.maximum(abs_corr, min_abs_corr)
                new_score = f_scores[i] / (cumulating_sum[i] / j)
                if new_score > best_score:
                    best_score = new_score
                    best_idx = i

            if best_idx < 0:
                # Every remaining score is NaN: stop instead of appending a
                # bogus (possibly duplicate) feature.
                break

            selected_features.append(features[best_idx])
            is_selected[best_idx] = True
            last_selected_idx = best_idx

    end_2 = perf_counter()
    print(f"Took {end_2 - end_1:.2f}s to compute mrmr.")
    return selected_features

In [7]:
def mrmr_medium(df:pd.DataFrame, target:str, k:int) -> list[str]:
    """pandas / scikit-learn MRMR baseline (F-statistic over mean |correlation|).

    Relevance is the F-statistic returned by ``f_regression``; redundancy is
    the mean absolute correlation between a candidate and the features already
    selected, clipped from below so the division is always defined.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the target column and the candidates. Every column other
        than ``target`` is treated as a candidate.
    target : str
        Name of the target column. ``ValueError`` is raised (by ``list.remove``)
        when it is not a column of ``df``.
    k : int
        Number of features requested. Values larger than the number of
        candidates are capped (previously ``argmax`` was called on an empty
        Series once the candidates ran out).

    Returns
    -------
    list[str]
        Feature names in order of selection.
    """
    features = list(df.columns)
    features.remove(target)

    X = df[features]
    y = df[target]

    # compute F-statistics and initialize correlation matrix
    start = perf_counter()
    F = pd.Series(f_regression(X, y)[0], index = X.columns)
    end_1 = perf_counter()
    print(f"Spent {end_1 - start:.2f}s to compute f score.")

    # Lower bound for any correlation entry; also the placeholder for entries
    # that have not been computed yet.
    min_corr = .00001
    corr = pd.DataFrame(min_corr, index = X.columns, columns = X.columns)

    # initialize list of selected features and list of excluded features
    selected = []
    not_selected = X.columns.to_list()

    # repeat K times, but never more often than there are candidates
    for i in range(min(k, len(not_selected))):

        # compute (absolute) correlations between the last selected feature and all the (currently) excluded features
        if i > 0:
            last_selected = selected[-1]
            corr.loc[not_selected, last_selected] = X[not_selected].corrwith(X[last_selected]).abs().clip(min_corr)

        # compute FCQ score for all the (currently) excluded features (this is Formula 2)
        # On the first pass `selected` is empty, the mean is NaN and fillna
        # turns the denominator into the constant floor, so the pick is by F alone.
        score = F.loc[not_selected] / corr.loc[not_selected, selected].mean(axis = 1).fillna(min_corr)

        # find best feature, add it to selected and remove it from not_selected
        best = score.index[score.argmax()]
        selected.append(best)
        not_selected.remove(best)

    end_2 = perf_counter()
    print(f"Took {end_2 - end_1:.2f}s to compute mrmr.")
    return selected

In [8]:
# Benchmark: Polars rewrite, selecting 50 features for "target".
mrmr_my_rewrite(df_pl, "target", 50)

Spent 0.06s to compute f score.
Took 1.25s to compute mrmr.


['column_40',
 'column_49',
 'column_58',
 'column_75',
 'column_45',
 'column_71',
 'column_39',
 'column_36',
 'column_74',
 'column_14',
 'column_32',
 'column_6',
 'column_27',
 'column_10',
 'column_87',
 'column_96',
 'column_85',
 'column_55',
 'column_26',
 'column_67',
 'column_16',
 'column_19',
 'column_95',
 'column_29',
 'column_60',
 'column_66',
 'column_76',
 'column_65',
 'column_79',
 'column_72',
 'column_13',
 'column_0',
 'column_77',
 'column_46',
 'column_86',
 'column_8',
 'column_89',
 'column_69',
 'column_41',
 'column_23',
 'column_99',
 'column_2',
 'column_9',
 'column_59',
 'column_52',
 'column_61',
 'column_54',
 'column_68',
 'column_73',
 'column_38']

In [9]:
# Use my version of the f_score computation.
# Use pure NumPy for the rest of the computation.
# This shows that copying to NumPy does not pay off; staying in Polars is the better idea.
mrmr_np_rewrite(df_pl, "target", 50)

Spent 0.06s to compute f score.
Took 5.25s to compute mrmr.


['column_40',
 'column_49',
 'column_58',
 'column_75',
 'column_45',
 'column_71',
 'column_39',
 'column_36',
 'column_74',
 'column_14',
 'column_32',
 'column_6',
 'column_27',
 'column_10',
 'column_87',
 'column_96',
 'column_85',
 'column_55',
 'column_26',
 'column_67',
 'column_16',
 'column_19',
 'column_95',
 'column_29',
 'column_60',
 'column_66',
 'column_76',
 'column_65',
 'column_79',
 'column_72',
 'column_13',
 'column_0',
 'column_77',
 'column_46',
 'column_86',
 'column_8',
 'column_89',
 'column_69',
 'column_41',
 'column_23',
 'column_99',
 'column_2',
 'column_9',
 'column_59',
 'column_52',
 'column_61',
 'column_54',
 'column_68',
 'column_73',
 'column_38']

In [10]:
# Benchmark: pandas / scikit-learn implementation on the pandas copy of the same data.
mrmr_medium(df_pd, "target", 50)

Spent 0.14s to compute f score.
Took 21.93s to compute mrmr.


['column_40',
 'column_49',
 'column_58',
 'column_75',
 'column_45',
 'column_71',
 'column_39',
 'column_36',
 'column_74',
 'column_14',
 'column_32',
 'column_6',
 'column_27',
 'column_10',
 'column_87',
 'column_96',
 'column_85',
 'column_55',
 'column_26',
 'column_67',
 'column_16',
 'column_19',
 'column_95',
 'column_29',
 'column_60',
 'column_66',
 'column_76',
 'column_65',
 'column_79',
 'column_72',
 'column_13',
 'column_0',
 'column_77',
 'column_46',
 'column_86',
 'column_8',
 'column_89',
 'column_69',
 'column_41',
 'column_23',
 'column_99',
 'column_2',
 'column_9',
 'column_59',
 'column_52',
 'column_61',
 'column_54',
 'column_68',
 'column_73',
 'column_38']

In [11]:
# Same selection with mrmr from my yet-to-be-published package
# (imported from eda.eda_selection at the top of the notebook).
mrmr(df_pl, target="target", k = 50)

Running F-score to determine feature relevance...
Top 5 feature importance by MRMR_STRATEGY.F_SCORE is:
shape: (5, 2)
┌───────────┬───────────────────────┐
│ feature   ┆ MRMR_STRATEGY.F_SCORE │
│ ---       ┆ ---                   │
│ str       ┆ f64                   │
╞═══════════╪═══════════════════════╡
│ column_40 ┆ 84106.046915          │
│ column_39 ┆ 75069.811993          │
│ column_71 ┆ 69544.622042          │
│ column_74 ┆ 52024.513348          │
│ column_14 ┆ 50738.037926          │
└───────────┴───────────────────────┘
Found 100 total features to select from. Proceeding to select top 50 features.


100%|██████████| 50/50 [00:01<00:00, 44.06it/s]

Output is sorted in order of selection. (The 1st feature selected is most important, the 2nd the 2nd most important, etc.)





['column_40',
 'column_49',
 'column_58',
 'column_75',
 'column_45',
 'column_71',
 'column_39',
 'column_36',
 'column_74',
 'column_14',
 'column_32',
 'column_6',
 'column_27',
 'column_10',
 'column_87',
 'column_96',
 'column_85',
 'column_55',
 'column_26',
 'column_67',
 'column_16',
 'column_19',
 'column_95',
 'column_29',
 'column_60',
 'column_66',
 'column_76',
 'column_65',
 'column_79',
 'column_72',
 'column_13',
 'column_0',
 'column_77',
 'column_46',
 'column_86',
 'column_8',
 'column_89',
 'column_69',
 'column_41',
 'column_23',
 'column_99',
 'column_2',
 'column_9',
 'column_59',
 'column_52',
 'column_61',
 'column_54',
 'column_68',
 'column_73',
 'column_38']

In [12]:
# Low-memory mode tries to reduce memory usage during the selection process.
# Even in low-memory mode, this is faster than the other packages out there.
mrmr(df_pl, target="target", k = 50, low_memory=True)

Running F-score to determine feature relevance...
Top 5 feature importance by MRMR_STRATEGY.F_SCORE is:
shape: (5, 2)
┌───────────┬───────────────────────┐
│ feature   ┆ MRMR_STRATEGY.F_SCORE │
│ ---       ┆ ---                   │
│ str       ┆ f64                   │
╞═══════════╪═══════════════════════╡
│ column_40 ┆ 84106.046915          │
│ column_39 ┆ 75069.811993          │
│ column_71 ┆ 69544.622042          │
│ column_74 ┆ 52024.513348          │
│ column_14 ┆ 50738.037926          │
└───────────┴───────────────────────┘
Found 100 total features to select from. Proceeding to select top 50 features.


100%|██████████| 50/50 [00:08<00:00,  6.13it/s]

Output is sorted in order of selection. (The 1st feature selected is most important, the 2nd the 2nd most important, etc.)





['column_40',
 'column_49',
 'column_58',
 'column_75',
 'column_45',
 'column_71',
 'column_39',
 'column_36',
 'column_74',
 'column_14',
 'column_32',
 'column_6',
 'column_27',
 'column_10',
 'column_87',
 'column_96',
 'column_85',
 'column_55',
 'column_26',
 'column_67',
 'column_16',
 'column_19',
 'column_95',
 'column_29',
 'column_60',
 'column_66',
 'column_76',
 'column_65',
 'column_79',
 'column_72',
 'column_13',
 'column_0',
 'column_77',
 'column_46',
 'column_86',
 'column_8',
 'column_89',
 'column_69',
 'column_41',
 'column_23',
 'column_99',
 'column_2',
 'column_9',
 'column_59',
 'column_52',
 'column_61',
 'column_54',
 'column_68',
 'column_73',
 'column_38']

In [13]:
from mrmr import mrmr_classif

In [14]:
#
def mrmr_package(df:pd.DataFrame, target:str, k:int) -> list[str]:
    """Run ``mrmr_classif`` on ``df`` and report how long the call took.

    Every column except ``target`` is passed as a predictor and ``k`` is
    forwarded as ``K``. The elapsed time of the ``mrmr_classif`` call alone is
    printed; whatever ``mrmr_classif`` returns is handed back unchanged.
    """
    predictor_names = list(df.columns)
    predictor_names.remove(target)

    # Split into predictors and labels before starting the clock, so only the
    # selection itself is timed.
    predictors, labels = df[predictor_names], df[target]

    start = perf_counter()
    chosen = mrmr_classif(predictors, labels, K = k)
    end = perf_counter()

    print(f"Spent {end - start:.2f}s to compute mrmr.")
    return chosen

In [15]:
# Benchmark: mrmr_classif from the third-party mrmr package, same data and k.
mrmr_package(df_pd, "target", 50)

100%|██████████| 50/50 [00:29<00:00,  1.68it/s]

Spent 33.59s to compute mrmr.





['column_40',
 'column_49',
 'column_58',
 'column_75',
 'column_45',
 'column_71',
 'column_39',
 'column_36',
 'column_74',
 'column_14',
 'column_32',
 'column_6',
 'column_27',
 'column_10',
 'column_87',
 'column_96',
 'column_85',
 'column_55',
 'column_26',
 'column_67',
 'column_16',
 'column_19',
 'column_95',
 'column_29',
 'column_60',
 'column_66',
 'column_76',
 'column_65',
 'column_79',
 'column_72',
 'column_13',
 'column_0',
 'column_77',
 'column_46',
 'column_86',
 'column_8',
 'column_89',
 'column_69',
 'column_41',
 'column_23',
 'column_99',
 'column_2',
 'column_9',
 'column_59',
 'column_52',
 'column_61',
 'column_54',
 'column_68',
 'column_73',
 'column_38']

# Demo of MRMR from my Package.

In [16]:
# No num_cols is provided, so mrmr automatically detects and uses the numerical columns.
# Relevance comes from a random forest here (MRMR_STRATEGY.RF).
# NOTE(review): params is presumably forwarded to the forest — confirm in eda_selection.
mrmr(df_pl, target="target", k=20, strategy=MRMR_STRATEGY.RF, params={"n_estimators":10, "max_depth":5, "n_jobs":-1})

Running Random Forest to determine feature relevance...
Random forest is not deterministic by default. Results may vary.
Top 5 feature importance by MRMR_STRATEGY.RF is:
shape: (5, 2)
┌───────────┬──────────────────┐
│ feature   ┆ MRMR_STRATEGY.RF │
│ ---       ┆ ---              │
│ str       ┆ f64              │
╞═══════════╪══════════════════╡
│ column_40 ┆ 0.105611         │
│ column_39 ┆ 0.094682         │
│ column_32 ┆ 0.083981         │
│ column_71 ┆ 0.072361         │
│ column_96 ┆ 0.049674         │
└───────────┴──────────────────┘
Found 100 total features to select from. Proceeding to select top 20 features.


100%|██████████| 20/20 [00:00<00:00, 37.88it/s]

Output is sorted in order of selection. (The 1st feature selected is most important, the 2nd the 2nd most important, etc.)





['column_40',
 'column_49',
 'column_32',
 'column_39',
 'column_71',
 'column_89',
 'column_6',
 'column_96',
 'column_78',
 'column_77',
 'column_87',
 'column_11',
 'column_27',
 'column_5',
 'column_69',
 'column_73',
 'column_66',
 'column_29',
 'column_16',
 'column_55']

In [17]:
# Same selection with XGBoost-based relevance (MRMR_STRATEGY.XGB).
mrmr(df_pl, target="target", k=20, strategy=MRMR_STRATEGY.XGB, params={"n_estimators":10, "max_depth":5})

Running XGBoost to determine feature relevance...
XGB is not deterministic by default. Results may vary.
Top 5 feature importance by MRMR_STRATEGY.XGB is:
shape: (5, 2)
┌───────────┬───────────────────┐
│ feature   ┆ MRMR_STRATEGY.XGB │
│ ---       ┆ ---               │
│ str       ┆ f64               │
╞═══════════╪═══════════════════╡
│ column_40 ┆ 0.102606          │
│ column_27 ┆ 0.072671          │
│ column_39 ┆ 0.05256           │
│ column_79 ┆ 0.030445          │
│ column_64 ┆ 0.028942          │
└───────────┴───────────────────┘
Found 100 total features to select from. Proceeding to select top 20 features.


100%|██████████| 20/20 [00:00<00:00, 37.61it/s]

Output is sorted in order of selection. (The 1st feature selected is most important, the 2nd the 2nd most important, etc.)





['column_40',
 'column_49',
 'column_64',
 'column_50',
 'column_79',
 'column_27',
 'column_39',
 'column_32',
 'column_87',
 'column_45',
 'column_69',
 'column_19',
 'column_91',
 'column_77',
 'column_83',
 'column_11',
 'column_31',
 'column_71',
 'column_89',
 'column_13']

In [18]:
# Same selection with mutual-information-score relevance (MRMR_STRATEGY.MIS).
mrmr(df_pl, target="target", k=20, strategy=MRMR_STRATEGY.MIS)

Running Mutual Information Score to determine feature relevance...


100%|██████████| 100/100 [00:39<00:00,  2.53it/s]


Top 5 feature importance by MRMR_STRATEGY.MIS is:
shape: (5, 2)
┌───────────┬───────────────────┐
│ feature   ┆ MRMR_STRATEGY.MIS │
│ ---       ┆ ---               │
│ str       ┆ f64               │
╞═══════════╪═══════════════════╡
│ column_40 ┆ 0.092849          │
│ column_39 ┆ 0.076936          │
│ column_71 ┆ 0.07222           │
│ column_32 ┆ 0.068808          │
│ column_11 ┆ 0.053687          │
└───────────┴───────────────────┘
Found 100 total features to select from. Proceeding to select top 20 features.


100%|██████████| 20/20 [00:00<00:00, 39.67it/s]

Output is sorted in order of selection. (The 1st feature selected is most important, the 2nd the 2nd most important, etc.)





['column_40',
 'column_49',
 'column_58',
 'column_63',
 'column_75',
 'column_45',
 'column_11',
 'column_32',
 'column_39',
 'column_71',
 'column_36',
 'column_74',
 'column_6',
 'column_14',
 'column_90',
 'column_16',
 'column_96',
 'column_10',
 'column_27',
 'column_26']

In [19]:
# knock_out_mrmr is inspired by Featurewiz's SULOV; here it uses F-score relevance.
knock_out_mrmr(df_pl, target="target", k=20, strategy=MRMR_STRATEGY.F_SCORE)

Running F-score to determine feature relevance...
Top 5 feature importance by MRMR_STRATEGY.F_SCORE is:
shape: (5, 2)
┌───────────┬───────────────────────┐
│ feature   ┆ MRMR_STRATEGY.F_SCORE │
│ ---       ┆ ---                   │
│ str       ┆ f64                   │
╞═══════════╪═══════════════════════╡
│ column_40 ┆ 84106.046915          │
│ column_39 ┆ 75069.811993          │
│ column_71 ┆ 69544.622042          │
│ column_74 ┆ 52024.513348          │
│ column_14 ┆ 50738.037926          │
└───────────┴───────────────────────┘


100%|██████████| 20/20 [00:00<?, ?it/s]

Output is sorted in order of selection. (The 1st feature selected is most important, the 2nd the 2nd most important, etc.)





['column_40',
 'column_39',
 'column_71',
 'column_74',
 'column_14',
 'column_32',
 'column_75',
 'column_45',
 'column_10',
 'column_27',
 'column_49',
 'column_67',
 'column_96',
 'column_36',
 'column_65',
 'column_55',
 'column_16',
 'column_19',
 'column_6',
 'column_95']