In [1]:
import polars

# Turn on polars' global string cache before any data is loaded.
# NOTE(review): presumably required so that categorical columns created by the
# later cells share a single encoding — confirm against the polars docs.
polars.enable_string_cache()

In [2]:
from autofeat import source

# Name of the Kaggle competition whose tables back this notebook; the cells
# below read its "train.csv" table.
KAGGLE_COMPETITION = "house-prices-advanced-regression-techniques"

dataset = source.from_kaggle(name=KAGGLE_COMPETITION)

In [3]:
from autofeat.transform import Cast, Encode, Require, Select

# Feature pipeline, composed left to right with `.then`:
#   1. Require - predicate on the table name; presumably limits the pipeline
#      to "train.csv" (confirm against autofeat docs).
#   2. Select  - excludes "SalePrice", the column used as the target later on,
#      so it cannot appear among the candidate features.
#   3. Cast, Encode - run with default arguments.
#      NOTE(review): presumably dtype inference and categorical encoding - confirm.
only_train_table = Require(lambda table: table.name == "train.csv")
without_target = Select(exclude=["SalePrice"])

transform = only_train_table.then(without_target).then(Cast()).then(Encode())

In [4]:
# `filters`: the "Id" column of train.csv, handed to `dataset.features(...)`
# as its `filters` argument.
filters = dataset.table("train.csv").column("Id")

# `target`: the "SalePrice" column of train.csv, used as `y` when the
# feature selector is fitted.
target = dataset.table("train.csv").column("SalePrice")

In [5]:
# Describe the feature set: the `Id` filter column plus the transform pipeline
# defined above. Nothing is materialized here; `.collect()` is called on the
# result in the next cell.
features = dataset.features(filters=filters, transform=transform)

In [6]:
# Materialize the features into a concrete frame. The recorded output of the
# following cell shows the result has 1460 rows and 37 columns.
df = features.collect()

In [7]:
# Sanity check: (rows, columns) of the collected feature frame.
df.shape

(1460, 37)

In [9]:
import sklearn.feature_selection
import xgboost

# Model whose fitted feature importances drive the selection.
# NOTE(review): `missing=float("inf")` makes XGBoost treat +inf as the
# missing-value marker - confirm this matches how the encoded frame
# represents missing entries.
importance_model = xgboost.XGBRegressor(missing=float("inf"))

# `max_features=None` places no cap on how many features may be kept; no
# explicit threshold is passed, so scikit-learn's default threshold applies.
selector = sklearn.feature_selection.SelectFromModel(
    importance_model,
    max_features=None,
)

# Fit on the collected feature matrix against the SalePrice target.
# NOTE(review): assumes the rows of `df` and of the collected target are in
# the same order - verify against autofeat's guarantees.
feature_matrix = df.to_numpy()
target_values = target.data.collect().to_series().to_numpy()
selector.fit(X=feature_matrix, y=target_values)

# Map the boolean support mask back onto column names.
selection = [
    column_name
    for column_name, is_selected in zip(df.columns, selector.get_support())
    if is_selected
]

selection

['OverallQual from train.csv',
 'GrLivArea from train.csv',
 'FullBath from train.csv',
 'KitchenAbvGr from train.csv',
 'GarageCars from train.csv']