In [1]:
from autofeat.dataset import *

# HR employee attrition/performance data, referenced by its Kaggle dataset id.
source_dataset = KaggleDataset(
    id="mahmoudemadabdallah/hr-analytics-employee-attrition-and-performance",
)

In [2]:
from autofeat.transform import *

# Feature-derivation pipeline built by chaining autofeat transforms with `.then(...)`.
# NOTE(review): the behaviour of each stage is defined in autofeat.transform and is
# not visible in this notebook; the notes below are read off the names only and
# should be confirmed against the library.
transform = (
    Cast()
    .then(Encode())
    # Aggregation keyed on the "EmployeeID" column.
    .then(Aggregate(by={"EmployeeID"}))
    # Two transforms are handed to a single `.then` here and on the next line --
    # presumably Identity() passes the existing tables through alongside the
    # joined/combined ones; TODO confirm.
    .then(Identity(), Join(on={"EmployeeID"}))
    .then(Identity(), Combine())
)

In [3]:
# Run the transform pipeline over the source dataset; the result feeds feature
# extraction in the next cell.
derived_dataset = source_dataset.derive(transform)

In [4]:
import numpy
import sklearn.feature_selection
import xgboost

from autofeat.analysis import extract_features

# Labelled examples as (filter, label) pairs: each filter matches a single
# EmployeeID and the boolean is the classification target for that employee.
# NOTE(review): only two examples are supplied, so the model fitted below has
# very little signal to rank features with.
examples = [
    (Filter(eq={"EmployeeID": employee_id}), label)
    for employee_id, label in (
        ("51A4-EA6E", True),
        ("A4A4-F49C", False),
    )
]

# Extract features from the derived dataset for the filter half of every
# (filter, label) example pair.
features = extract_features(
    derived_dataset,
    [example_filter for example_filter, _ in examples],
)

# The collected result is used below as a table (`.columns`, `.to_numpy()`).
df = features.collect()

# Model-based feature selection: fit an XGBoost classifier on the extracted
# features and keep at most 10 of them. `fit` returns the selector itself, so
# construction and fitting are chained into a single expression.
selector = sklearn.feature_selection.SelectFromModel(
    # +inf is configured as the value XGBoost treats as "missing".
    xgboost.XGBClassifier(missing=float("inf")),
    max_features=10,
).fit(
    X=df.to_numpy(),
    # Labels are the second element of each (filter, label) example pair.
    y=numpy.array([target for _, target in examples]),
)

# Names of the columns the fitted selector kept: pair each column name with its
# entry in the selector's boolean support mask and keep the flagged ones.
selection = [
    column
    for column, keep in zip(df.columns, selector.get_support())
    if keep
]

In [5]:
# Show the selected feature names as this cell's output.
selection


['count(*) from RatingLevel.csv',
 'max(RatingID) from RatingLevel.csv',
 'mean(RatingID) from RatingLevel.csv',
 'median(RatingID) from RatingLevel.csv',
 'min(RatingID) from RatingLevel.csv',
 'std(RatingID) from RatingLevel.csv',
 'sum(RatingID) from RatingLevel.csv',
 'var(RatingID) from RatingLevel.csv',
 'count(*) from PerformanceRating.csv',
 'max(EnvironmentSatisfaction) from PerformanceRating.csv']