In [1]:
from autofeat import source

# Load the Kaggle competition dataset by its competition slug; every later
# cell reads its tables through `dataset`.
dataset = source.from_kaggle(name="santander-customer-transaction-prediction")


In [4]:
# Materialize the "ID_code" column of train.csv; it is passed as `where=`
# when the features are extracted further down.
where = dataset.table("train.csv").data.select("ID_code").collect()

# Materialize the "target" column of train.csv and convert the resulting
# single-column frame to a series; it is used as `y` when fitting the selector.
target = dataset.table("train.csv").data.select("target").collect().to_series()

In [2]:
from autofeat.transform import Cast, Encode, Identity, Require, Select

# Assemble the transform pipeline one stage at a time; each `.then(...)`
# appends the next stage to the pipeline built so far.

# Stage 1: only tables whose name is "train.csv" satisfy the predicate.
transform = Require(lambda table: table.name == "train.csv")
# Stage 2: exclude the "target" column from the selected columns
# (presumably so the label cannot leak into the features - confirm).
transform = transform.then(Select(exclude={"target"}))
# Stage 3: Cast with its default arguments.
# NOTE(review): presumably infers/normalizes column dtypes - confirm.
transform = transform.then(Cast())
# Stage 4: two transforms handed to a single `.then(...)` call.
# NOTE(review): presumably both are applied and their outputs kept side by
# side (raw columns plus encoded ones) - confirm against the autofeat docs.
transform = transform.then(Identity(), Encode())

# Apply the assembled pipeline to the loaded dataset.
train = dataset.apply(transform)

In [5]:
# Extract the feature table from the transformed dataset, passing the
# "ID_code" frame collected earlier as `where`.
# NOTE(review): presumably `where` selects/aligns the rows to compute features
# for, so rows line up with `target` - confirm against the autofeat docs.
features = train.features(where=where)

In [6]:
import sklearn.feature_selection
import xgboost

# Model-based feature selection: fit an XGBoost model on every extracted
# feature and let SelectFromModel keep the most important ones.
#   * missing=float("inf") - XGBoost treats +inf entries as missing values.
#   * max_features=10      - at most ten features are selected.
# NOTE(review): a regressor is used to rank the features; if `target` is a
# class label, a classifier may be the more natural estimator - confirm.
selector = sklearn.feature_selection.SelectFromModel(
    xgboost.XGBRegressor(missing=float("inf")), max_features=10
)

selector.fit(features.to_numpy(), target.to_numpy())

# get_support() yields one boolean per input column, in column order, so it
# can be paired directly with the column names.
selection = [
    column
    for column, keep in zip(features.columns, selector.get_support())
    if keep
]

selection

['var_53 from train.csv',
 'var_81 from train.csv',
 'var_6 from train.csv',
 'var_109 from train.csv',
 'var_174 from train.csv',
 'var_146 from train.csv',
 'var_80 from train.csv',
 'var_12 from train.csv',
 'var_110 from train.csv',
 'var_139 from train.csv']