In [1]:
import polars as pl 
import numpy as np
from eda_utils import _f_score, mrmr, MRMR_STRATEGY
from sklearn.datasets import make_classification
from sklearn.feature_selection import f_regression
import pandas as pd
from time import perf_counter
import os 

In [2]:
# create some data
# A fixed seed makes the synthetic dataset identical on every
# "Restart Kernel -> Run All", so the implementations below are always
# compared on the same data.
SEED = 42
orig_x, orig_y = make_classification(
    n_samples = 1_000_000,
    n_features = 50,
    n_informative = 20,
    n_redundant = 30,
    random_state = SEED,
)

In [3]:
# Polars frame: the feature matrix becomes the columns (auto-named, shown as
# "column_N" in the outputs below) and the labels are inserted at position 0
# as a column called "target".
df_pl = pl.from_numpy(orig_x).insert_at_idx(0, pl.Series("target", orig_y))
# pandas copy of the same data, used by the pandas-based implementations.
df_pd = df_pl.to_pandas()

In [4]:
def mrmr_my_rewrite(df:pl.DataFrame, target:str, k:int) -> list[str]:
    """Greedy MRMR feature selection (relevance / mean absolute correlation).

    The first pick is the feature with the highest score returned by
    ``_f_score``. Every later pick maximises
    ``score / mean(|corr(feature, already selected features)|)``. The mean is
    kept incrementally: each round only adds the correlation with the most
    recently selected feature to a running per-feature sum.

    Parameters
    ----------
    df : pl.DataFrame
        Frame containing the target column and the candidate features.
        Assumes every non-target column is numeric -- TODO confirm for callers.
    target : str
        Name of the target column; it is removed from the candidate list.
    k : int
        Number of features requested. It is clamped to the number of
        candidates, and values <= 0 yield an empty list (same as
        ``mrmr_medium``).

    Returns
    -------
    list[str]
        Selected feature names in selection order. May be shorter than ``k``
        if no remaining candidate has a comparable (non-NaN) score.

    Raises
    ------
    ValueError
        If ``target`` is not a column of ``df``.
    """
    features = df.columns
    features.remove(target)

    start = perf_counter()
    # NOTE(review): assumes _f_score returns one score per entry of `features`,
    # in the same order, indexable by position -- confirm in eda_utils.
    f_scores = _f_score(df, target, features)
    end_1 = perf_counter()
    print(f"Spent {end_1 - start:.2f}s to compute f score.")

    # Same floor mrmr_medium applies to each absolute correlation; it keeps the
    # denominator strictly positive so a near-zero correlation cannot produce
    # an infinite score.
    min_corr = .00001

    # Never try to pick more features than exist: previously the inner loop
    # found no candidate and features[-1] was appended again as a duplicate.
    n_select = min(k, len(features))
    selected_features = []

    if n_select > 0:
        # Standardise every feature so that the mean of the element-wise
        # product of two columns serves as their correlation.
        df_scaled = df.select(features).with_columns(
            (pl.col(f) - pl.col(f).mean())/pl.col(f).std() for f in features
        )

        # For each feature at index i, running sum of |corr| with the features
        # selected so far.
        cumulating_sum = np.zeros(len(features))
        top_idx = int(np.argmax(f_scores))
        selected_features.append(features[top_idx])
        picked = {top_idx}  # indices already selected (O(1) membership test)

        for j in range(1, n_select):
            # Loop-invariant for the inner loop: fetch it once per round.
            last_col = df_scaled.get_column(selected_features[-1])
            argmax = -1
            current_max = -np.inf
            for i, f in enumerate(features):
                if i in picked:
                    continue
                # abs correlation between the last selected feature and f
                abs_corr = np.abs((last_col*df_scaled.get_column(f)).mean())
                # max() leaves a NaN correlation as NaN, so such a feature is
                # never selected (NaN comparisons are False), as before.
                cumulating_sum[i] += max(abs_corr, min_corr)
                # j features are selected at this point, so sum / j is the mean.
                new_score = f_scores[i] / (cumulating_sum[i] / j)
                if new_score > current_max:
                    current_max = new_score
                    argmax = i

            if argmax < 0:
                # No candidate had a comparable score; stop instead of
                # appending a bogus entry.
                break
            selected_features.append(features[argmax])
            picked.add(argmax)

    end_2 = perf_counter()
    print(f"Spent {end_2 - end_1:.2f}s to compute mrmr.")
    return selected_features
    

In [5]:
def mrmr_medium(df:pd.DataFrame, target:str, k:int) -> list[str]:
    """Reference pandas implementation of greedy MRMR (FCQ scoring).

    Relevance is the F-statistic from ``f_regression``; redundancy is the mean
    absolute correlation with the features chosen so far, floored at 1e-5.
    Timing for both phases is printed.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the target column and the candidate features.
    target : str
        Name of the target column.
    k : int
        Number of selection rounds to run.

    Returns
    -------
    list[str]
        Chosen feature names in selection order.
    """
    feature_names = list(df.columns)
    feature_names.remove(target)

    X, y = df[feature_names], df[target]

    # Relevance: one F-statistic per feature, labelled by column name.
    start = perf_counter()
    f_stats = f_regression(X, y)[0]
    relevance = pd.Series(f_stats, index = X.columns)
    end_1 = perf_counter()
    print(f"Spent {end_1 - start:.2f}s to compute f score.")

    # Feature-by-feature table of absolute correlations, pre-filled with the
    # floor value; one column is filled in per selected feature.
    redundancy = pd.DataFrame(.00001, index = X.columns, columns = X.columns)

    chosen = []
    remaining = X.columns.to_list()

    for _ in range(k):

        # From the second round on, record |corr| between the newest pick and
        # every feature that is still available.
        if chosen:
            newest = chosen[-1]
            abs_corr = X[remaining].corrwith(X[newest]).abs()
            redundancy.loc[remaining, newest] = abs_corr.clip(.00001)

        # FCQ score = relevance / mean redundancy. In the first round there are
        # no chosen columns, so the mean is NaN and falls back to the floor.
        mean_redundancy = redundancy.loc[remaining, chosen].mean(axis = 1).fillna(.00001)
        fcq = relevance.loc[remaining] / mean_redundancy

        # Move the best-scoring feature from `remaining` to `chosen`.
        winner = fcq.index[fcq.argmax()]
        chosen.append(winner)
        remaining.remove(winner)

    end_2 = perf_counter()
    print(f"Spent {end_2 - end_1:.2f}s to compute mrmr.")
    return chosen

In [6]:
# Polars implementation: rank the top 20 features.
mrmr_my_rewrite(df=df_pl, target="target", k=20)

Spent 0.06s to compute f score.
Spent 0.88s to compute mrmr.


['column_29',
 'column_37',
 'column_21',
 'column_44',
 'column_23',
 'column_41',
 'column_33',
 'column_2',
 'column_26',
 'column_22',
 'column_27',
 'column_8',
 'column_28',
 'column_48',
 'column_5',
 'column_25',
 'column_20',
 'column_1',
 'column_31',
 'column_42']

In [7]:
# pandas reference implementation on the same data, same k.
mrmr_medium(df=df_pd, target="target", k=20)

Spent 0.14s to compute f score.
Spent 11.63s to compute mrmr.


['column_29',
 'column_37',
 'column_21',
 'column_44',
 'column_23',
 'column_41',
 'column_33',
 'column_2',
 'column_26',
 'column_22',
 'column_27',
 'column_8',
 'column_28',
 'column_48',
 'column_5',
 'column_25',
 'column_20',
 'column_1',
 'column_31',
 'column_42']

In [8]:
from mrmr import mrmr_classif

In [9]:
def mrmr_package(df:pd.DataFrame, target:str, k:int) -> list[str]:
    """Run the ``mrmr`` package's ``mrmr_classif`` on ``df``.

    Every column except ``target`` is passed as the feature matrix and the
    ``target`` column as the labels; ``k`` is forwarded as ``K``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the target column and the candidate features.
    target : str
        Name of the target column.
    k : int
        Number of features to select.

    Returns
    -------
    list[str]
        Whatever ``mrmr_classif`` returns (a list of column names in the run
        shown in this notebook).
    """
    predictors = list(df.columns)
    predictors.remove(target)

    # use mrmr classification
    return mrmr_classif(df[predictors], df[target], K = k)

In [10]:
# Third-party `mrmr` package on the same pandas frame, same k.
mrmr_package(df=df_pd, target="target", k=20)

100%|██████████| 20/20 [00:12<00:00,  1.61it/s]


['column_29',
 'column_37',
 'column_21',
 'column_44',
 'column_23',
 'column_41',
 'column_33',
 'column_2',
 'column_26',
 'column_22',
 'column_27',
 'column_8',
 'column_28',
 'column_48',
 'column_5',
 'column_25',
 'column_20',
 'column_1',
 'column_31',
 'column_42']

In [11]:
# MRMR from my repo (eda_utils.mrmr), called without an explicit strategy.
# The output below reports the strategy used as MRMR_STRATEGY.F_SCORE.

mrmr(df_pl, target="target", k=20)

Top 5 feature importance is (by MRMR_STRATEGY.F_SCORE):
shape: (5, 2)
┌───────────┬───────────────────────┐
│ feature   ┆ MRMR_STRATEGY.F_SCORE │
│ ---       ┆ ---                   │
│ str       ┆ f64                   │
╞═══════════╪═══════════════════════╡
│ column_29 ┆ 234355.115447         │
│ column_44 ┆ 198995.289158         │
│ column_23 ┆ 162807.990072         │
│ column_41 ┆ 142515.000045         │
│ column_33 ┆ 132316.891188         │
└───────────┴───────────────────────┘
Found 1st feature by MRMR: column_29. 1/20
Found 2th feature by MRMR: column_37. 2/20
Found 3th feature by MRMR: column_21. 3/20
Found 4th feature by MRMR: column_44. 4/20
Found 5th feature by MRMR: column_23. 5/20
Found 6th feature by MRMR: column_41. 6/20
Found 7th feature by MRMR: column_33. 7/20
Found 8th feature by MRMR: column_2. 8/20
Found 9th feature by MRMR: column_26. 9/20
Found 10th feature by MRMR: column_22. 10/20
Found 11th feature by MRMR: column_27. 11/20
Found 12th feature by MRMR: column_8

mrmr_rank,feature
i64,str
1,"""column_29"""
2,"""column_37"""
3,"""column_21"""
4,"""column_44"""
5,"""column_23"""
6,"""column_41"""
7,"""column_33"""
8,"""column_2"""
9,"""column_26"""
10,"""column_22"""


In [12]:
# Same call with MRMR_STRATEGY.RF as the scoring strategy.
# NOTE(review): `params` is presumably forwarded to the random-forest estimator -- confirm in eda_utils.
# The run itself warns that random forest is not deterministic, so this ranking may change between runs.
mrmr(df_pl, target="target", k=20, strategy=MRMR_STRATEGY.RF, params={"n_estimators":20, "max_depth":5, "n_jobs":-1})

Random forest is not deterministic by default. Results may vary.
Top 5 feature importance is (by MRMR_STRATEGY.RF):
shape: (5, 2)
┌───────────┬──────────────────┐
│ feature   ┆ MRMR_STRATEGY.RF │
│ ---       ┆ ---              │
│ str       ┆ f64              │
╞═══════════╪══════════════════╡
│ column_29 ┆ 0.1335           │
│ column_44 ┆ 0.116226         │
│ column_33 ┆ 0.09626          │
│ column_23 ┆ 0.080251         │
│ column_41 ┆ 0.058192         │
└───────────┴──────────────────┘
Found 1st feature by MRMR: column_29. 1/20
Found 2th feature by MRMR: column_31. 2/20
Found 3th feature by MRMR: column_33. 3/20
Found 4th feature by MRMR: column_23. 4/20
Found 5th feature by MRMR: column_44. 5/20
Found 6th feature by MRMR: column_41. 6/20
Found 7th feature by MRMR: column_32. 7/20
Found 8th feature by MRMR: column_2. 8/20
Found 9th feature by MRMR: column_48. 9/20
Found 10th feature by MRMR: column_22. 10/20
Found 11th feature by MRMR: column_21. 11/20
Found 12th feature by MRMR: col

mrmr_rank,feature
i64,str
1,"""column_29"""
2,"""column_31"""
3,"""column_33"""
4,"""column_23"""
5,"""column_44"""
6,"""column_41"""
7,"""column_32"""
8,"""column_2"""
9,"""column_48"""
10,"""column_22"""


In [13]:
# Same call with MRMR_STRATEGY.XGB as the scoring strategy.
# NOTE(review): `params` is presumably forwarded to the XGBoost estimator -- confirm in eda_utils.
# The run itself warns that XGB is not deterministic, so this ranking may change between runs.
mrmr(df_pl, target="target", k=20, strategy=MRMR_STRATEGY.XGB, params={"n_estimators":20, "max_depth":5})

XGB is not deterministic by default. Results may vary.
Top 5 feature importance is (by MRMR_STRATEGY.XGB):
shape: (5, 2)
┌───────────┬───────────────────┐
│ feature   ┆ MRMR_STRATEGY.XGB │
│ ---       ┆ ---               │
│ str       ┆ f64               │
╞═══════════╪═══════════════════╡
│ column_44 ┆ 0.222649          │
│ column_23 ┆ 0.052148          │
│ column_1  ┆ 0.04954           │
│ column_2  ┆ 0.046469          │
│ column_29 ┆ 0.042617          │
└───────────┴───────────────────┘
Found 1st feature by MRMR: column_44. 1/20
Found 2th feature by MRMR: column_13. 2/20
Found 3th feature by MRMR: column_2. 3/20
Found 4th feature by MRMR: column_23. 4/20
Found 5th feature by MRMR: column_25. 5/20
Found 6th feature by MRMR: column_1. 6/20
Found 7th feature by MRMR: column_33. 7/20
Found 8th feature by MRMR: column_48. 8/20
Found 9th feature by MRMR: column_45. 9/20
Found 10th feature by MRMR: column_41. 10/20
Found 11th feature by MRMR: column_29. 11/20
Found 12th feature by MRMR: co

mrmr_rank,feature
i64,str
1,"""column_44"""
2,"""column_13"""
3,"""column_2"""
4,"""column_23"""
5,"""column_25"""
6,"""column_1"""
7,"""column_33"""
8,"""column_48"""
9,"""column_45"""
10,"""column_41"""
