In [8]:
import polars

# Turn on Polars' global string cache before any data is loaded.
# NOTE(review): presumably required so categorical columns produced by the
# autofeat transforms share one encoding — confirm against autofeat.
polars.enable_string_cache()

In [9]:
from autofeat import source

# Obtain the dataset handle from autofeat's Kaggle source, identified by the
# competition name below.
dataset = source.from_kaggle(name="santander-customer-transaction-prediction")


In [10]:
from autofeat.transform import Cast, Encode, Require, Select

# Transform pipeline, built stage by stage and chained with `.then`.
# NOTE(review): the stage semantics below are inferred from their names and
# arguments — confirm against the autofeat.transform documentation.

# Predicate matches only the table named "train.csv".
only_train = Require(lambda table: table.name == "train.csv")

# Exclude the label column so it is not offered as a feature.
without_target = Select(exclude=["target"])

# Cast and Encode are used with their default configuration.
transform = only_train.then(without_target).then(Cast()).then(Encode())

In [11]:
# Number of leading rows of train.csv used in this notebook. Both `filters`
# and `target` are cut with this one constant so the feature rows and the
# label rows always refer to the same sample.
N_ROWS = 100

# Look the training table up once and reuse it for both columns.
train_table = dataset.table("train.csv")

# ID_code values of the sampled rows; passed as `filters` when features are
# requested from the dataset.
filters = train_table.column("ID_code").data.head(N_ROWS)

# Label values for the same leading rows; collected later when the
# selector is fitted.
target = train_table.column("target").data.head(N_ROWS)

In [12]:
# Request features from the dataset, passing the sampled ID_code values as
# the filter and the transform pipeline to apply.
features = dataset.features(filters=filters, transform=transform)

In [13]:
# Materialise the requested features into `df`.
# NOTE(review): presumably this executes a lazy Polars query and may be the
# expensive step of the notebook — confirm.
df = features.collect()

In [15]:
# Show (rows, columns) of the collected feature frame.
# NOTE(review): this cell's execution count (15) is higher than that of the
# following cell (14), so the notebook was not run top to bottom — re-run
# with Restart & Run All before sharing.
df.shape

(100, 200)

In [14]:
import sklearn.feature_selection
import xgboost

# Model-based feature selection: fit an XGBoost regressor on the collected
# features and keep at most ten columns.
# `missing=float("inf")` makes XGBoost treat infinite values as missing.
# NOTE(review): `max_features` is only an upper bound; the selector's default
# importance threshold still applies, which is presumably why the recorded
# output lists four columns — confirm this is intended.
estimator = xgboost.XGBRegressor(missing=float("inf"))
selector = sklearn.feature_selection.SelectFromModel(estimator, max_features=10)

feature_matrix = df.to_numpy()
target_vector = target.collect().to_series().to_numpy()
selector.fit(X=feature_matrix, y=target_vector)

# get_support() returns one boolean per input column, in column order, so it
# can be paired directly with the column names.
selection = [
    column
    for column, keep in zip(df.columns, selector.get_support())
    if keep
]

selection

['var_1 from train.csv',
 'var_4 from train.csv',
 'var_141 from train.csv',
 'var_188 from train.csv']