In [1]:
import polars

# Enable Polars' global string cache up front, before any data is loaded.
# NOTE(review): presumably needed so categorical values produced by the
# autofeat transforms in later cells share one encoding — confirm.
polars.enable_string_cache()

In [2]:
from autofeat import source

# Kaggle identifier of the data to load, kept in one place so it is easy to swap.
KAGGLE_NAME = "santander-customer-transaction-prediction"

# Build the autofeat dataset from Kaggle; later cells look up "train.csv" in it.
dataset = source.from_kaggle(name=KAGGLE_NAME)


In [3]:
# Number of leading rows taken for both the filter keys and the target, so the
# two selections are always cut to the same length.
SAMPLE_ROWS = 1000


def _train_column_head(column_name):
    """Return the first SAMPLE_ROWS rows of one column of train.csv.

    Looks the column up on ``dataset`` and applies ``.head(SAMPLE_ROWS)`` to
    its ``.data``.
    """
    return (
        dataset
        .table("train.csv")
        .column(column_name)
        .data
        .head(SAMPLE_ROWS)
    )


# Identifier column, passed to ``.features(...)`` when the features are built.
filters = _train_column_head("ID_code")

# Label column, used as ``y`` when the feature selector is fitted.
target = _train_column_head("target")

In [4]:
from autofeat.transform import Cast, Encode, Require, Select

# Predicate step that matches only the table named "train.csv".
only_train = Require(lambda tbl: tbl.name == "train.csv")

# Leave the label out of the feature columns; it is supplied separately as y.
without_target = Select(exclude={"target"})

# Compose the steps in this order; Cast and Encode are built with no arguments.
transform = only_train.then(without_target).then(Cast()).then(Encode())

In [5]:
# Apply the transform pipeline to the dataset, then request the features for
# `filters` and collect the result.
transformed = dataset.apply(transform)
features = transformed.features(filters).collect()

In [6]:
import sklearn.feature_selection
import xgboost

# Model whose feature importances drive the selection; +inf is the value it
# treats as "missing".
estimator = xgboost.XGBRegressor(missing=float("inf"))

# Keep at most ten features.
selector = sklearn.feature_selection.SelectFromModel(estimator, max_features=10)

feature_matrix = features.to_numpy()
labels = target.collect().to_series().to_numpy()
selector.fit(X=feature_matrix, y=labels)

# get_support() yields one boolean per feature column, in column order, so it
# can be paired directly with the column names.
selection = [
    column_name
    for column_name, keep in zip(features.columns, selector.get_support())
    if keep
]

# Bare last expression so the notebook displays the chosen feature names.
selection

['var_63 from train.csv',
 'var_86 from train.csv',
 'var_117 from train.csv',
 'var_122 from train.csv',
 'var_139 from train.csv',
 'var_142 from train.csv',
 'var_160 from train.csv',
 'var_166 from train.csv',
 'var_171 from train.csv',
 'var_192 from train.csv']