In [7]:
import polars

# Turn on polars' global string cache; this runs ahead of the cells below that
# load and transform the dataset.
# NOTE(review): presumably required so that categorical columns created by
# separate tables/transforms share one encoding — confirm against autofeat.
polars.enable_string_cache()

In [8]:
from autofeat.dataset import KaggleDataset
from autofeat.transform import Cast, Combine, Encode, Identity, Require, Select

# Table and column names used throughout this cell.  They are defined once so
# that the column excluded from the features and the column used as the label
# are guaranteed to be the same (otherwise the target could leak into the
# features), and so the table name is not repeated as a bare literal.
TRAIN_TABLE = "train.csv"
TARGET_COLUMN = "SalePrice"
ROW_ID_COLUMN = "Id"

dataset = KaggleDataset(
    name="house-prices-advanced-regression-techniques",
)

# Feature pipeline, applied to the dataset in a later cell:
#   1. keep only the table whose name is TRAIN_TABLE,
#   2. exclude the target column from the candidate features,
#   3. Cast / Encode the remaining columns,
#   4. pass the result through both Identity and Combine.
# NOTE(review): the selected feature names printed at the end of the notebook
# (e.g. "MSZoning == RM from train.csv") suggest Encode expands categorical
# values into per-value indicator columns — confirm in autofeat's docs.
# NOTE(review): presumably `.then(Identity(), Combine())` keeps the original
# columns alongside the combined ones — confirm.
transform = (
    Require(lambda table: table.name == TRAIN_TABLE)
    .then(Select(exclude=[TARGET_COLUMN]))
    .then(Cast())
    .then(Encode())
    .then(Identity(), Combine())
)

# Column handed to `extract_features` as its second argument in a later cell.
filters = dataset.table(TRAIN_TABLE).column(ROW_ID_COLUMN)

# Regression target used when fitting the feature selector in a later cell.
labels = dataset.table(TRAIN_TABLE).column(TARGET_COLUMN)

In [9]:
from autofeat.analysis import extract_features

# Run the transform pipeline over the dataset first, then extract features
# from the transformed dataset using the `filters` column defined earlier.
transformed_dataset = dataset.apply(transform)
features = extract_features(transformed_dataset, filters)

In [10]:
# Materialise the extracted features by calling `.collect()`; the resulting
# `df` is what the feature-selection cell converts with `.to_numpy()` and
# whose `.columns` supply the feature names.
# NOTE(review): presumably `features` is a lazy (polars-style) frame and this
# is where the transform pipeline actually executes — confirm.
df = features.collect()

In [17]:
import sklearn.feature_selection
import xgboost

# Model-based feature selection: fit a gradient-boosted regressor on every
# extracted feature and keep at most 10 of them.
# NOTE(review): `missing=float("inf")` makes +inf (instead of XGBoost's default
# NaN) the missing-value marker — confirm this is intentional for this data.
estimator = xgboost.XGBRegressor(missing=float("inf"))
selector = sklearn.feature_selection.SelectFromModel(estimator, max_features=10)

# Dense inputs for scikit-learn: feature matrix from the collected frame and
# the label column as a 1-D array.
# NOTE(review): assumes the rows of `df` and `labels` are in the same order —
# verify against how `extract_features` orders its output.
feature_matrix = df.to_numpy()
target = labels.data.collect().to_series().to_numpy()
selector.fit(X=feature_matrix, y=target)

# Pair each column name with its entry in the boolean support mask and keep
# the names the selector retained.
selection = [
    column_name
    for column_name, is_kept in zip(df.columns, selector.get_support())
    if is_kept
]

selection

['OverallQual from train.csv',
 'TotalBsmtSF from train.csv',
 '2ndFlrSF from train.csv',
 'GrLivArea from train.csv',
 'KitchenAbvGr from train.csv',
 'GarageCars from train.csv',
 'MSZoning == RM from train.csv',
 'BsmtQual == Ex from train.csv',
 'CentralAir == Y from train.csv',
 'GarageType == Detchd from train.csv']