In [2]:
import polars as pl 
import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import f_regression
import pandas as pd
from time import perf_counter
import sys
sys.path.append('../src')
from dsds.fs import _f_score, mrmr, knock_out_mrmr

# A Great Introductory Article on MRMR-Type Feature Selection Algorithms

https://towardsdatascience.com/mrmr-explained-exactly-how-you-wished-someone-explained-to-you-9cf4ed27458b

In [3]:
# Create a synthetic classification dataset: 100k rows, 50 features
# (20 informative + 30 redundant). The generator is seeded so that a
# Restart & Run All reproduces the same data and therefore the same rankings.
orig_x, orig_y = make_classification(
    n_samples = 100_000,
    n_features = 50,
    n_informative = 20,
    n_redundant = 30,
    random_state = 42,
)

In [4]:
# Polars frame: feature columns come from the NumPy matrix (they show up as
# "column_<i>" in the outputs below) and the label is inserted at index 0 as "target".
df_pl = pl.from_numpy(orig_x).insert_at_idx(0, pl.Series("target", orig_y))
# pandas copy of the same data, used by the pandas-based implementations.
df_pd = df_pl.to_pandas()

In [5]:
# The raw NumPy arrays are no longer needed once both frames exist; drop the names.
del orig_x, orig_y

In [6]:
def mrmr_my_rewrite(df:pl.DataFrame, target:str, k:int) -> list[str]:
    """Greedy MRMR feature selection that keeps all column arithmetic in Polars.

    The first pick is the feature with the highest relevance score. Every later
    pick maximises ``relevance / mean(|correlation with already selected features|)``.
    Correlations are accumulated incrementally, so each round only correlates the
    remaining features with the most recently selected one.

    Args:
        df: Frame holding the target column plus the candidate feature columns.
        target: Name of the target column; every other column is a candidate.
        k: Number of features to select. Values above the number of candidates
            are capped, and values <= 0 yield an empty list.

    Returns:
        Feature names in order of selection.

    Raises:
        ValueError: If `target` is not a column of `df` (raised by ``list.remove``).
    """
    features = df.columns
    features.remove(target)

    # Cap k: asking for more features than exist used to append duplicates
    # (argmax stayed at -1), and k <= 0 still returned one feature.
    n_select = max(0, min(k, len(features)))
    if n_select == 0:
        return []

    start = perf_counter()
    # Presumably one relevance score per feature, in `features` order -- it is
    # indexed positionally below; verify against dsds.fs._f_score.
    f_scores = _f_score(df, target, features)
    end_1 = perf_counter()
    print(f"Spent {end_1 - start:.2f}s to compute f score.")

    # Standardise every feature so the mean of a product of two columns is
    # proportional to their Pearson correlation.
    # NOTE(review): Polars' std() is presumably the sample std (ddof=1), which
    # only rescales all correlations by the same constant -- confirm.
    df_scaled = df.select(features).with_columns(
        (pl.col(f) - pl.col(f).mean())/pl.col(f).std() for f in features
    )

    # Same floor as mrmr_medium: keeps the denominator away from zero when a
    # feature is (numerically) uncorrelated with the selected ones.
    corr_floor = 0.00001

    cumulating_sum = np.zeros(len(features)) # For each feature at index i, we keep a cumulating sum
    top_idx = int(np.argmax(f_scores))
    selected_features = [features[top_idx]]
    # Indices still available, kept in ascending order so ties resolve to the
    # lowest index exactly as before.
    remaining = [i for i in range(len(features)) if i != top_idx]
    for j in range(1, n_select):
        last_col = df_scaled.get_column(selected_features[-1])
        # Fall back to the first remaining feature so that a round in which no
        # score compares greater (e.g. all NaN) can never re-select a feature.
        argmax = remaining[0]
        current_max = -np.inf
        for i in remaining:
            # abs correlation between the last selected feature and feature i
            abs_corr = np.abs((last_col * df_scaled.get_column(features[i])).mean())
            cumulating_sum[i] += max(abs_corr, corr_floor)
            denominator = cumulating_sum[i] / j
            new_score = f_scores[i] / denominator
            if new_score > current_max:
                current_max = new_score
                argmax = i

        selected_features.append(features[argmax])
        remaining.remove(argmax)

    end_2 = perf_counter()
    print(f"Took {end_2 - end_1:.2f}s to compute mrmr.")
    return selected_features
    

In [7]:
def mrmr_np_rewrite(df:pl.DataFrame, target:str, k:int) -> list[str]:
    """Greedy MRMR feature selection with the redundancy loop done in NumPy.

    Relevance scores come from the same helper as `mrmr_my_rewrite`; the feature
    matrix is then copied to a NumPy array and standardised there. Each round
    picks the remaining feature that maximises
    ``relevance / mean(|correlation with already selected features|)``.

    Args:
        df: Frame holding the target column plus the candidate feature columns.
        target: Name of the target column; every other column is a candidate.
        k: Number of features to select. Values above the number of candidates
            are capped, and values <= 0 yield an empty list.

    Returns:
        Feature names in order of selection.

    Raises:
        ValueError: If `target` is not a column of `df` (raised by ``list.remove``).
    """
    features = df.columns
    features.remove(target)

    # Cap k: asking for more features than exist used to append duplicates
    # (argmax stayed at -1), and k <= 0 still returned one feature.
    n_select = max(0, min(k, len(features)))
    if n_select == 0:
        return []

    start = perf_counter()
    # Presumably one relevance score per feature, in `features` order -- it is
    # indexed positionally below; verify against dsds.fs._f_score.
    f_scores = _f_score(df, target, features)
    end_1 = perf_counter()
    print(f"Spent {end_1 - start:.2f}s to compute f score.")

    # Column-wise standardisation; the mean of a product of two standardised
    # columns is then their Pearson correlation.
    x = df.select(features).to_numpy()
    x_scaled = (x - x.mean(axis=0)) / x.std(axis=0)

    # Same floor as mrmr_medium: keeps the denominator away from zero when a
    # feature is (numerically) uncorrelated with the selected ones.
    corr_floor = 0.00001

    cumulating_sum = np.zeros(len(features)) # For each feature at index i, we keep a cumulating sum
    top_idx = int(np.argmax(f_scores))
    selected_features = [features[top_idx]]
    # Indices still available, kept in ascending order so ties resolve to the
    # lowest index exactly as before.
    remaining = [i for i in range(len(features)) if i != top_idx]
    last_selected_idx = top_idx
    for j in range(1, n_select):
        last_col = x_scaled[:, last_selected_idx]
        # Fall back to the first remaining feature so that a round in which no
        # score compares greater (e.g. all NaN) can never re-select a feature.
        argmax = remaining[0]
        current_max = -np.inf
        for i in remaining:
            # abs correlation between the last selected feature and feature i
            abs_corr = np.abs(np.mean(last_col * x_scaled[:, i]))
            cumulating_sum[i] += max(abs_corr, corr_floor)
            denominator = cumulating_sum[i] / j
            new_score = f_scores[i] / denominator
            if new_score > current_max:
                current_max = new_score
                argmax = i

        selected_features.append(features[argmax])
        remaining.remove(argmax)
        last_selected_idx = argmax

    end_2 = perf_counter()
    print(f"Took {end_2 - end_1:.2f}s to compute mrmr.")
    return selected_features

In [8]:
def mrmr_medium(df:pd.DataFrame, target:str, k:int) -> list[str]:
    """pandas reference implementation of MRMR (F-statistic / mean absolute correlation).

    Each round scores every not-yet-selected feature as its F-statistic divided
    by the mean absolute correlation with the features selected so far, then
    moves the best one to the selected list.

    Args:
        df: Frame holding the target column plus the candidate feature columns.
        target: Name of the target column; every other column is a candidate.
        k: Number of features to select. Values above the number of candidates
            are capped (this used to raise once the candidates ran out), and
            values <= 0 yield an empty list.

    Returns:
        Feature names in order of selection.

    Raises:
        ValueError: If `target` is not a column of `df` (raised by ``list.remove``).
    """
    features = list(df.columns)
    features.remove(target)

    X = df[features]
    y = df[target]

    # compute F-statistics and initialize correlation matrix
    start = perf_counter()
    F = pd.Series(f_regression(X, y)[0], index = X.columns)
    end_1 = perf_counter()
    print(f"Spent {end_1 - start:.2f}s to compute f score.")

    # Floor for correlations so the score denominator never reaches zero.
    corr_floor = .00001
    corr = pd.DataFrame(corr_floor, index = X.columns, columns = X.columns)

    # initialize list of selected features and list of excluded features
    selected = []
    not_selected = X.columns.to_list()

    # Never run more rounds than there are candidates.
    n_select = max(0, min(k, len(not_selected)))
    for _ in range(n_select):

        if selected:
            # compute (absolute) correlations between the last selected feature
            # and all the (currently) excluded features
            last_selected = selected[-1]
            corr.loc[not_selected, last_selected] = X[not_selected].corrwith(X[last_selected]).abs().clip(corr_floor)
            redundancy = corr.loc[not_selected, selected].mean(axis = 1).fillna(corr_floor)
        else:
            # First round: nothing selected yet, so every candidate gets the
            # floor as denominator and the ranking is purely by F-statistic.
            redundancy = corr_floor

        # compute FCQ score for all the (currently) excluded features (this is Formula 2)
        score = F.loc[not_selected] / redundancy

        # find best feature, add it to selected and remove it from not_selected
        best = score.index[score.argmax()]
        selected.append(best)
        not_selected.remove(best)

    end_2 = perf_counter()
    print(f"Took {end_2 - end_1:.2f}s to compute mrmr.")
    return selected

In [9]:
# Polars rewrite. k = 50 equals the number of generated features, so this returns a full ranking.
mrmr_my_rewrite(df_pl, "target", 50)

Spent 0.16s to compute f score.
Took 0.11s to compute mrmr.


['column_10',
 'column_20',
 'column_36',
 'column_17',
 'column_3',
 'column_47',
 'column_49',
 'column_12',
 'column_43',
 'column_19',
 'column_22',
 'column_44',
 'column_7',
 'column_6',
 'column_46',
 'column_2',
 'column_14',
 'column_37',
 'column_4',
 'column_8',
 'column_26',
 'column_21',
 'column_13',
 'column_33',
 'column_48',
 'column_0',
 'column_5',
 'column_18',
 'column_45',
 'column_32',
 'column_1',
 'column_29',
 'column_31',
 'column_9',
 'column_16',
 'column_24',
 'column_39',
 'column_25',
 'column_42',
 'column_23',
 'column_34',
 'column_27',
 'column_28',
 'column_40',
 'column_15',
 'column_30',
 'column_35',
 'column_11',
 'column_41',
 'column_38']

In [10]:
# Uses my version of the F-score computation,
# then pure NumPy for the rest of the computation.
# In the recorded run, copying to NumPy did not pay off: the selection loop was
# no faster than the Polars version, so staying in Polars is the better idea.
mrmr_np_rewrite(df_pl, "target", 50)

Spent 0.02s to compute f score.
Took 0.16s to compute mrmr.


['column_10',
 'column_20',
 'column_36',
 'column_17',
 'column_3',
 'column_47',
 'column_49',
 'column_12',
 'column_43',
 'column_19',
 'column_22',
 'column_44',
 'column_7',
 'column_6',
 'column_46',
 'column_2',
 'column_14',
 'column_37',
 'column_4',
 'column_8',
 'column_26',
 'column_21',
 'column_13',
 'column_33',
 'column_48',
 'column_0',
 'column_5',
 'column_18',
 'column_45',
 'column_32',
 'column_1',
 'column_29',
 'column_31',
 'column_9',
 'column_16',
 'column_24',
 'column_39',
 'column_25',
 'column_42',
 'column_23',
 'column_34',
 'column_27',
 'column_28',
 'column_40',
 'column_15',
 'column_30',
 'column_35',
 'column_11',
 'column_41',
 'column_38']

In [11]:
# pandas reference implementation, run on the pandas copy of the same data.
mrmr_medium(df_pd, "target", 50)

Spent 0.01s to compute f score.
Took 1.65s to compute mrmr.


['column_10',
 'column_20',
 'column_36',
 'column_17',
 'column_3',
 'column_47',
 'column_49',
 'column_12',
 'column_43',
 'column_19',
 'column_22',
 'column_44',
 'column_7',
 'column_6',
 'column_46',
 'column_2',
 'column_14',
 'column_37',
 'column_4',
 'column_8',
 'column_26',
 'column_21',
 'column_13',
 'column_33',
 'column_48',
 'column_0',
 'column_5',
 'column_18',
 'column_45',
 'column_32',
 'column_1',
 'column_29',
 'column_31',
 'column_9',
 'column_16',
 'column_24',
 'column_39',
 'column_25',
 'column_42',
 'column_23',
 'column_34',
 'column_27',
 'column_28',
 'column_40',
 'column_15',
 'column_30',
 'column_35',
 'column_11',
 'column_41',
 'column_38']

In [12]:
# MRMR from my yet-to-be-published package (imported from dsds.fs above).
mrmr(df_pl, target="target", k = 50)

Running fscore to determine feature relevance...
Found 50 total features to select from. Proceeding to select top 50 features.


MRMR, fscore: 100%|██████████| 50/50 [00:00<00:00, 462.73it/s]

Output is sorted in order of selection (max relevance min redundancy).





['column_10',
 'column_20',
 'column_36',
 'column_17',
 'column_3',
 'column_47',
 'column_49',
 'column_12',
 'column_43',
 'column_19',
 'column_22',
 'column_44',
 'column_7',
 'column_6',
 'column_46',
 'column_2',
 'column_14',
 'column_37',
 'column_4',
 'column_8',
 'column_26',
 'column_21',
 'column_13',
 'column_33',
 'column_48',
 'column_0',
 'column_5',
 'column_18',
 'column_45',
 'column_32',
 'column_1',
 'column_29',
 'column_31',
 'column_9',
 'column_16',
 'column_24',
 'column_39',
 'column_25',
 'column_42',
 'column_23',
 'column_34',
 'column_27',
 'column_28',
 'column_40',
 'column_15',
 'column_30',
 'column_35',
 'column_11',
 'column_41',
 'column_38']

In [13]:
# Low memory mode tries to reduce memory usage in the selection process.
# It is slower than the default mode (see the progress-bar rates), but even in
# low memory mode this is faster than the packages out there.
mrmr(df_pl, target="target", k = 50, low_memory=True)

Running fscore to determine feature relevance...
Found 50 total features to select from. Proceeding to select top 50 features.


MRMR, fscore: 100%|██████████| 50/50 [00:00<00:00, 71.78it/s]

Output is sorted in order of selection (max relevance min redundancy).





['column_10',
 'column_20',
 'column_36',
 'column_17',
 'column_3',
 'column_47',
 'column_49',
 'column_12',
 'column_43',
 'column_19',
 'column_22',
 'column_44',
 'column_7',
 'column_6',
 'column_46',
 'column_2',
 'column_14',
 'column_37',
 'column_4',
 'column_8',
 'column_26',
 'column_21',
 'column_13',
 'column_33',
 'column_48',
 'column_0',
 'column_5',
 'column_18',
 'column_45',
 'column_32',
 'column_1',
 'column_29',
 'column_31',
 'column_9',
 'column_16',
 'column_24',
 'column_39',
 'column_25',
 'column_42',
 'column_23',
 'column_34',
 'column_27',
 'column_28',
 'column_40',
 'column_15',
 'column_30',
 'column_35',
 'column_11',
 'column_41',
 'column_38']

In [14]:
from mrmr import mrmr_classif

In [15]:
#
def mrmr_package(df:pd.DataFrame, target:str, k:int) -> list[str]:
    """Time the third-party `mrmr_classif` selector on a pandas frame.

    Every column except `target` is passed as a predictor. The elapsed wall time
    of the `mrmr_classif` call is printed, and its result is returned unchanged.
    """
    predictor_names = list(df.columns)
    predictor_names.remove(target)

    predictors, labels = df[predictor_names], df[target]

    # Only the selector call itself is timed, not the column slicing above.
    start = perf_counter()
    ranked = mrmr_classif(predictors, labels, K = k)
    end = perf_counter()

    print(f"Spent {end - start:.2f}s to compute mrmr.")
    return ranked

In [16]:
# Baseline: the third-party `mrmr` package (mrmr_classif) on the pandas data.
mrmr_package(df_pd, "target", 50)

100%|██████████| 50/50 [00:09<00:00,  5.26it/s]

Spent 11.14s to compute mrmr.





['column_10',
 'column_20',
 'column_36',
 'column_17',
 'column_3',
 'column_47',
 'column_49',
 'column_12',
 'column_43',
 'column_19',
 'column_22',
 'column_44',
 'column_7',
 'column_6',
 'column_46',
 'column_2',
 'column_14',
 'column_37',
 'column_4',
 'column_8',
 'column_26',
 'column_21',
 'column_13',
 'column_33',
 'column_48',
 'column_0',
 'column_5',
 'column_18',
 'column_45',
 'column_32',
 'column_1',
 'column_29',
 'column_31',
 'column_9',
 'column_16',
 'column_24',
 'column_39',
 'column_25',
 'column_42',
 'column_23',
 'column_34',
 'column_27',
 'column_28',
 'column_40',
 'column_15',
 'column_30',
 'column_35',
 'column_11',
 'column_41',
 'column_38']

# Demo of MRMR from my Package.

In [17]:
# Relevance from a random forest (strategy="rf").
# No num_cols is provided, so numerical columns are detected and used automatically.
# NOTE(review): `params` is presumably forwarded to the underlying model -- confirm in dsds.fs.
mrmr(df_pl, target="target", k=20, strategy="rf"
            , params={"n_estimators":10, "max_depth":5, "n_jobs":-1})

Running rf to determine feature relevance...
Random forest is not deterministic by default. Results may vary.
Found 50 total features to select from. Proceeding to select top 20 features.


MRMR, rf: 100%|██████████| 20/20 [00:00<00:00, 379.87it/s]

Output is sorted in order of selection (max relevance min redundancy).





['column_10',
 'column_21',
 'column_12',
 'column_47',
 'column_3',
 'column_17',
 'column_43',
 'column_14',
 'column_33',
 'column_18',
 'column_49',
 'column_26',
 'column_6',
 'column_44',
 'column_4',
 'column_45',
 'column_25',
 'column_36',
 'column_13',
 'column_27']

In [18]:
# Same selection with LightGBM-based relevance (strategy="lgbm").
mrmr(df_pl, target="target", k=20, strategy="lgbm"
            , params={"n_estimators":10, "max_depth":5})

Running lgbm to determine feature relevance...
LightGBM is not deterministic by default. Results may vary.
Found 50 total features to select from. Proceeding to select top 20 features.


MRMR, lgbm: 100%|██████████| 20/20 [00:00<00:00, 353.16it/s]

Output is sorted in order of selection (max relevance min redundancy).





['column_21',
 'column_44',
 'column_43',
 'column_30',
 'column_14',
 'column_17',
 'column_47',
 'column_49',
 'column_20',
 'column_7',
 'column_2',
 'column_6',
 'column_12',
 'column_33',
 'column_8',
 'column_19',
 'column_13',
 'column_36',
 'column_35',
 'column_26']

In [19]:
# Mutual-information relevance (strategy="mis"); in the recorded run the scoring
# step dominates the runtime (about 5s for 50 features).
mrmr(df_pl, target="target", k=20, strategy="mis")

Running mis to determine feature relevance...


Mutual Info: 100%|██████████| 50/50 [00:05<00:00,  8.94it/s]


Found 50 total features to select from. Proceeding to select top 20 features.


MRMR, mis: 100%|██████████| 20/20 [00:00<00:00, 392.38it/s]

Output is sorted in order of selection (max relevance min redundancy).





['column_10',
 'column_29',
 'column_33',
 'column_3',
 'column_49',
 'column_47',
 'column_12',
 'column_43',
 'column_14',
 'column_26',
 'column_21',
 'column_22',
 'column_23',
 'column_6',
 'column_4',
 'column_45',
 'column_42',
 'column_1',
 'column_48',
 'column_20']

In [20]:
# Knock-out variant, inspired by Featurewiz's SULOV; relevance from the F-score (strategy="f").
knock_out_mrmr(df_pl, target="target", k=20, strategy="f")

Running f to determine feature relevance...


100%|██████████| 20/20 [00:00<00:00, 121750.48it/s]

Output is sorted in order of selection (max relevance min redundancy).





['column_10',
 'column_12',
 'column_47',
 'column_3',
 'column_22',
 'column_49',
 'column_14',
 'column_26',
 'column_33',
 'column_20',
 'column_4',
 'column_37',
 'column_6',
 'column_43',
 'column_5',
 'column_36',
 'column_48',
 'column_2',
 'column_44',
 'column_7']