In [1]:
import polars

# Enable Polars' global string cache up front, before any frames are built.
# NOTE(review): presumably required so Categorical columns produced by the
# later autofeat steps share one encoding — confirm against autofeat.
polars.enable_string_cache()

In [2]:
from autofeat import source

# Source of every table used below: the Kaggle house-prices competition.
dataset = source.from_kaggle(name="house-prices-advanced-regression-techniques")


In [3]:
# Key frame: the "Id" column of train.csv, collected eagerly.
given = dataset.table("train.csv").data.select("Id").collect()

# Prediction target: the "SalePrice" column of the same table, as a Series.
target = (
    dataset.table("train.csv").data.select("SalePrice").collect().to_series()
)

In [4]:
from autofeat.transform import Cast, Combine, Drop, Encode, Identity, Require

# Feature-engineering pipeline; the steps are chained in order with `.then`.
transform = (
    # Predicate matches only the table named "train.csv".
    # NOTE(review): presumably tables failing it are excluded — confirm.
    Require(lambda table: table.name == "train.csv")
    # Drop the target column so it cannot appear among the features.
    .then(Drop(columns={"SalePrice"}))
    # NOTE(review): presumably converts columns to inferred dtypes — confirm.
    .then(Cast())
    # Three transforms handed to a single `then`. The resulting feature names
    # come from all three: plain columns ("... from cast(...)"), equality
    # indicators ("BsmtQual == Ex from encode(...)") and pairwise arithmetic
    # ("TotalBsmtSF + GrLivArea from combine(...)").
    .then(Identity(), Encode(), Combine())
)

# Apply the pipeline to the dataset; features are extracted from the result.
features = dataset.apply(transform)

In [5]:
# Build the feature frame for the rows identified by `given`.
# NOTE(review): later cells fit `df.to_numpy()` against `target.to_numpy()`, so
# row order presumably follows `given` — confirm in autofeat.
df = features.extract(given=given)

In [7]:
import sklearn.feature_selection
import xgboost

# Rank features with a gradient-boosted regressor and keep at most ten.
# NOTE(review): `missing=float("inf")` tells XGBoost to treat infinities as
# missing values — presumably because the engineered features can contain
# inf; confirm.
# NOTE(review): `max_features` is an upper bound; with the default threshold
# the selector may return fewer than ten features — confirm this is intended.
selector = sklearn.feature_selection.SelectFromModel(
    estimator=xgboost.XGBRegressor(missing=float("inf")),
    max_features=10,
)

selector.fit(df.to_numpy(), target.to_numpy())

# Map the selected column positions back to their feature names.
selection = [
    df.columns[column_index]
    for column_index in selector.get_support(indices=True)
]

selection

['OverallQual from cast(train.csv)',
 'GarageCars from cast(train.csv)',
 'BsmtQual == Ex from encode(cast(train.csv))',
 'CentralAir == N from encode(cast(train.csv))',
 'KitchenQual == TA from encode(cast(train.csv))',
 'BsmtFinSF1 * GrLivArea from combine(cast(train.csv))',
 'TotalBsmtSF + GrLivArea from combine(cast(train.csv))',
 'TotalBsmtSF * GrLivArea from combine(cast(train.csv))',
 '1stFlrSF * GrLivArea from combine(cast(train.csv))',
 'GrLivArea + GarageArea from combine(cast(train.csv))']