In [1]:
import polars

# Turn on polars' global string cache up front, before any tables are loaded below.
# NOTE(review): presumably needed so Categorical columns created later share one
# encoding — confirm against the polars docs.
polars.enable_string_cache()

In [2]:
from autofeat import source

# Load the Kaggle dataset identified by this name; all later cells read from `dataset`.
dataset = source.from_kaggle(name="house-prices-advanced-regression-techniques")


In [3]:
from autofeat.transform import Cast, Encode, Require, Select

# Chain the preprocessing steps in order:
#   1. Require - keep only the table named "train.csv"
#   2. Select  - exclude the "SalePrice" column (it is used as the target later)
#   3. Cast    - default-configured cast step
#   4. Encode  - default-configured encode step
train_only_pipeline = (
    Require(lambda table: table.name == "train.csv")
    .then(Select(exclude=["SalePrice"]))
    .then(Cast())
    .then(Encode())
)

# Run the whole chain against the raw dataset in a single apply() call.
training_dataset = dataset.apply(train_only_pipeline)

In [4]:
# Both columns are read from the raw `dataset` (not the transformed training_dataset).

# "Id" column of train.csv; passed as `filters=` when the features are requested.
filters = dataset.table("train.csv").column("Id")

# "SalePrice" column of train.csv; used as `y` when fitting the feature selector.
target = dataset.table("train.csv").column("SalePrice")

In [5]:
# Request the features of the transformed training dataset, passing the train.csv
# "Id" column as the filter.
# NOTE(review): presumably this yields one feature row per Id — confirm in autofeat.
features = training_dataset.features(filters=filters)

In [6]:
# Collect the features into `df`, which later cells use via .shape, .columns and
# .to_numpy().
# NOTE(review): presumably a polars DataFrame materialised from a lazy query — confirm.
df = features.collect()

In [7]:
# Show the dimensions of the collected feature table as the cell output.
df.shape

(1460, 149)

In [8]:
import sklearn.feature_selection
import xgboost

# Model-based feature selection driven by an XGBoost regressor.
# - `missing=float("inf")` sets the value XGBoost treats as "missing" to +inf.
# - No `threshold` is passed, so scikit-learn's default cutoff applies, and
#   `max_features=None` places no cap on how many features may be kept.
selector = sklearn.feature_selection.SelectFromModel(
    estimator=xgboost.XGBRegressor(missing=float("inf")),
    max_features=None,
)

# Fit on the collected feature matrix (X) against the SalePrice column (y).
# NOTE(review): assumes the rows of `df` and `target` are in the same order — confirm.
selector.fit(df.to_numpy(), target.data.collect().to_series().to_numpy())

# get_support() yields one boolean per input column; keep the names flagged True.
selection = [
    column_name
    for column_name, keep in zip(df.columns, selector.get_support())
    if keep
]

selection

['OverallQual from train.csv',
 'TotalBsmtSF from train.csv',
 '2ndFlrSF from train.csv',
 'GrLivArea from train.csv',
 'FullBath from train.csv',
 'KitchenAbvGr from train.csv',
 'GarageCars from train.csv',
 'MSZoning == RL from train.csv',
 'MSZoning == RM from train.csv',
 'LandContour == Bnk from train.csv',
 'ExterQual == TA from train.csv',
 'BsmtQual == Ex from train.csv',
 'CentralAir == Y from train.csv',
 'KitchenQual == Gd from train.csv',
 'KitchenQual == TA from train.csv',
 'Functional == Typ from train.csv',
 'GarageType == Detchd from train.csv',
 'SaleType == WD from train.csv']