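The datasets can be downloaded [here](https://drive.google.com/file/d/1af_Gdt97v-a4KdEhs2NjamWA_kqIODTH/view?usp=sharing). Place the CSV files in a `datasets/` directory relative to the notebook's working directory, because `load_dataset` reads them from there.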

In [None]:
from enum import Enum
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import warnings
from fedot.core.repository.tasks import Task, TaskTypesEnum
from lightgbm import LGBMClassifier, LGBMRegressor
from fedot import Fedot
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score, f1_score

# Silences every Python warning for the rest of the kernel session (not just noisy
# library ones), so deprecation/convergence warnings will not show up in cell outputs.
warnings.filterwarnings("ignore")

class Dataset(Enum):
    """Identifiers of the benchmark datasets used in this notebook.

    Each member is a key of ``dataset_meta``, which stores the task type,
    target column and CSV file name for that dataset.
    """
    AIRLINES = "airlines_train_regression_10M"
    COVERTYPE = "covtype-normalized"
    KDDCUP = "kddcup"
    POLICE_INCIDENTS = "sf-police-incidents"
    SUSY = "SUSY"
    YEAR_PREDICTION_MSD = "year_prediction_msd"


# Per-dataset settings, keyed by Dataset member. Each entry holds:
#   "task_type"   - TaskTypesEnum member describing the learning problem,
#   "target_name" - target column label, or an int for a positional column index,
#   "file_name"   - CSV file name that load_dataset reads.
dataset_meta = {
    member: {"task_type": task_type, "target_name": target, "file_name": csv_name}
    for member, task_type, target, csv_name in (
        (Dataset.AIRLINES, TaskTypesEnum.regression, "DepDelay", "airlines_train_regression_10M.csv"),
        (Dataset.COVERTYPE, TaskTypesEnum.classification, "class", "covtype-normalized.csv"),
        (Dataset.KDDCUP, TaskTypesEnum.classification, "label", "kddcup.csv"),
        (Dataset.POLICE_INCIDENTS, TaskTypesEnum.classification, "ViolentCrime", "sf-police-incidents.csv"),
        # SUSY's target is addressed by position (column 0) rather than by name.
        (Dataset.SUSY, TaskTypesEnum.classification, 0, "SUSY.csv"),
        (Dataset.YEAR_PREDICTION_MSD, TaskTypesEnum.regression, "year", "year_prediction_msd.csv"),
    )
}

In [None]:
def load_dataset(dataset: Dataset, test_size=0.2, data_dir="datasets", random_state=42):
    """Read one benchmark CSV, label-encode it and split it into train/test parts.

    Parameters
    ----------
    dataset : Dataset
        Key into ``dataset_meta``; selects the CSV file name and the target column.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    data_dir : str or path-like, default "datasets"
        Directory containing the CSV files. The default reproduces the previously
        hard-coded location.
    random_state : int, default 42
        Seed forwarded to ``train_test_split``. The default reproduces the
        previously hard-coded seed.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)`` exactly as produced by
        ``train_test_split``.

    Raises
    ------
    KeyError
        If ``dataset`` has no entry in ``dataset_meta``.
    """
    meta = dataset_meta[dataset]
    df = pd.read_csv(f"{data_dir}/{meta['file_name']}")

    # Every object/category column is replaced by integer codes. Values are cast to
    # str first, so missing values become their own category instead of failing.
    # This runs before the target is split off, so a textual target is encoded too.
    categorical_features = df.select_dtypes(include=["object", "category"]).columns
    for col in categorical_features:
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))

    target_name = meta['target_name']
    if isinstance(target_name, int):
        # An int target is a positional column index, not a column label.
        y = df.iloc[:, target_name]
        x = df.drop(df.columns[target_name], axis=1)
    else:
        y = df[target_name]
        x = df.drop(target_name, axis=1)
    if dataset == Dataset.COVERTYPE:
        # Cast the labels to int and shift them down by one
        # (presumably 1-based class ids -> 0-based; confirm against the CSV).
        y = y.astype(float).astype(int) - 1
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )
    return x_train, x_test, y_train, y_test

In [None]:
def print_metrics(task: TaskTypesEnum, model: Fedot, x_test, y_test):
    """Predict on the test features and print the metrics that match the task.

    Regression tasks report RMSE, MAE and R2; any other task reports accuracy
    plus weighted and macro F1.

    Parameters
    ----------
    task : TaskTypesEnum
        Compared against ``TaskTypesEnum.regression`` to choose the metric set.
    model
        Any fitted object exposing ``predict(x_test)``. Despite the ``Fedot``
        annotation, ``get_baseline`` also passes LightGBM estimators here.
    x_test, y_test
        Held-out features and the matching true targets.
    """
    predictions = model.predict(x_test)

    # Each row is (label, thunk). Thunks are evaluated one at a time inside the loop
    # so that every metric is printed as soon as it has been computed.
    if task == TaskTypesEnum.regression:
        report = (
            ("Test RMSE:", lambda: np.sqrt(mean_squared_error(y_test, predictions))),
            ("Test MAE:", lambda: mean_absolute_error(y_test, predictions)),
            ("Test R2:", lambda: r2_score(y_test, predictions)),
        )
    else:
        report = (
            ("Test Accuracy:", lambda: accuracy_score(y_test, predictions)),
            ("Test F1 weighted:", lambda: f1_score(y_test, predictions, average='weighted')),
            ("Test F1 macro:", lambda: f1_score(y_test, predictions, average='macro')),
        )

    for label, compute in report:
        print(label, compute())

In [None]:
def train_fedot(dataset: Dataset, timeout=60, test_size=0.2):
    """Fit a FEDOT AutoML model on one benchmark dataset and print its test metrics.

    Parameters
    ----------
    dataset : Dataset
        Dataset to load via ``load_dataset``; its task type comes from ``dataset_meta``.
    timeout : int or float, default 60
        Passed unchanged to ``Fedot(timeout=...)``
        (NOTE(review): FEDOT appears to interpret this in minutes - confirm).
    test_size : float, default 0.2
        Fraction held out for testing; forwarded to ``load_dataset``. Large values
        therefore mean a small training subsample.

    Returns
    -------
    None
        Metrics are printed by ``print_metrics``.
    """
    x_train, x_test, y_train, y_test = load_dataset(dataset, test_size=test_size)
    task_type = dataset_meta[dataset]['task_type']

    model = Fedot(
        problem=task_type.value,
        timeout=timeout,
        n_jobs=-1,
        logging_level=20,  # numeric value of logging.INFO
        # Optimise F1 for classification tasks and RMSE for everything else.
        metric='f1' if task_type == TaskTypesEnum.classification else 'rmse',
    )
    # The value returned by fit() was previously bound to an unused local; the fitted
    # `model` object itself is what gets evaluated below.
    model.fit(features=x_train, target=y_train)
    print_metrics(task_type, model, x_test, y_test)

In [None]:
def get_baseline(x_train, x_test, y_train, y_test, task):
    """Fit a fixed-hyperparameter LightGBM model and print its test metrics.

    Parameters
    ----------
    x_train, y_train
        Training features and targets passed to the estimator's ``fit``.
    x_test, y_test
        Held-out features and targets passed on to ``print_metrics``.
    task
        Object whose ``.value`` is ``"classification"`` or ``"regression"``
        (the notebook passes ``TaskTypesEnum`` members); it selects
        ``LGBMClassifier`` or ``LGBMRegressor`` respectively.

    Raises
    ------
    ValueError
        If ``task.value`` is neither ``"classification"`` nor ``"regression"``.
        Previously this case fell through and failed later on an unbound ``model``.
    """
    # Both estimators share the same hyperparameters.
    lgbm_params = dict(
        n_estimators=200,
        learning_rate=0.05,
        random_state=42,
        n_jobs=-1,
    )

    if task.value == "classification":
        model = LGBMClassifier(**lgbm_params)
    elif task.value == "regression":
        model = LGBMRegressor(**lgbm_params)
    else:
        raise ValueError(f"Unsupported task type: {task.value!r}")

    model.fit(x_train, y_train)
    print_metrics(task, model, x_test, y_test)

# LGBM baselines

In [None]:
# LightGBM baseline on AIRLINES (regression, target "DepDelay") with the default 80/20 split.
dataset = Dataset.AIRLINES
x_train, x_test, y_train, y_test = load_dataset(dataset)
get_baseline(x_train, x_test, y_train, y_test, dataset_meta[dataset]['task_type'])

Test RMSE: 28.747375173926837
Test MAE: 13.868159190071447
Test R2: 0.0439195745612182


In [None]:
# LightGBM baseline on KDDCUP (classification, target "label") with the default 80/20 split.
dataset = Dataset.KDDCUP
x_train, x_test, y_train, y_test = load_dataset(dataset)
get_baseline(x_train, x_test, y_train, y_test, dataset_meta[dataset]['task_type'])

Test Accuracy: 0.988065576046227
Test F1 weighted: 0.9852725724250796
Test F1 macro: 0.16123454275795931


In [None]:
# LightGBM baseline on SUSY (classification, target = first column) with the default 80/20 split.
dataset = Dataset.SUSY
x_train, x_test, y_train, y_test = load_dataset(dataset)
get_baseline(x_train, x_test, y_train, y_test, dataset_meta[dataset]['task_type'])

Test Accuracy: 0.801902
Test F1 weighted: 0.8001104593044638
Test F1 macro: 0.7976378229414733


In [None]:
# LightGBM baseline on COVERTYPE (classification, target "class") with the default 80/20 split.
dataset = Dataset.COVERTYPE
x_train, x_test, y_train, y_test = load_dataset(dataset)
get_baseline(x_train, x_test, y_train, y_test, dataset_meta[dataset]['task_type'])

Test Accuracy: 0.8585750798172164
Test F1 weighted: 0.8578788224921586
Test F1 macro: 0.8465977709563939


In [None]:
# LightGBM baseline on POLICE_INCIDENTS (classification, target "ViolentCrime") with the default 80/20 split.
dataset = Dataset.POLICE_INCIDENTS
x_train, x_test, y_train, y_test = load_dataset(dataset)
get_baseline(x_train, x_test, y_train, y_test, dataset_meta[dataset]['task_type'])

Test Accuracy: 0.8783354589677317
Test F1 weighted: 0.8214434485563414
Test F1 macro: 0.467613734689561


In [None]:
# LightGBM baseline on YEAR_PREDICTION_MSD (regression, target "year") with the default 80/20 split.
dataset = Dataset.YEAR_PREDICTION_MSD
x_train, x_test, y_train, y_test = load_dataset(dataset)
get_baseline(x_train, x_test, y_train, y_test, dataset_meta[dataset]['task_type'])

Test RMSE: 9.007852850561541
Test MAE: 6.2906331003428315
Test R2: 0.3182254394976818


# FEDOT baseline on a small random training subsample

In [None]:
# test_size=0.7 holds out 70% of COVERTYPE, so FEDOT trains on a random 30% subsample.
train_fedot(Dataset.COVERTYPE, timeout=18, test_size=0.7)

2025-11-07 18:05:19,878 - TableTypesCorrector - Preprocessing was unable to define the categorical columns
2025-11-07 18:05:33,597 - AssumptionsHandler - Initial pipeline fitting started
2025-11-07 18:06:21,980 - AssumptionsHandler - Initial pipeline was fitted successfully
2025-11-07 18:06:21,982 - AssumptionsHandler - Memory consumption for fitting of the initial pipeline in main session: current 438.0 MiB, max: 691.5 MiB
2025-11-07 18:06:22,031 - ApiComposer - Initial pipeline was fitted in 51.8 sec.
2025-11-07 18:06:22,033 - ApiComposer - Taking into account n_folds=5, estimated fit time for initial assumption is 258.8 sec.
2025-11-07 18:06:22,035 - AssumptionsHandler - Preset was changed to fast_train due to fit time estimation for initial model.
2025-11-07 18:06:22,055 - ApiComposer - AutoML configured. Parameters tuning: True. Time limit: 15 min. Set of candidate models: ['knn', 'logit', 'normalization', 'pca', 'rf', 'scaling'].
2025-11-07 18:06:22,181 - ApiComposer - Timeout is

In [None]:
# test_size=0.97 holds out 97% of AIRLINES, so FEDOT trains on a random 3% subsample.
train_fedot(Dataset.AIRLINES, timeout=12, test_size=0.97)

2025-11-07 18:23:48,462 - TableTypesCorrector - Preprocessing defines the following columns as categorical: [0 2]
2025-11-07 18:23:50,892 - AssumptionsHandler - Initial pipeline fitting started
2025-11-07 18:26:02,322 - fedot.core.caching.base_cache - Nodes can not be saved
Traceback (most recent call last):
  File "/home/artem/PycharmProjects/FEDOT_fork/fedot/core/caching/operations_cache.py", line 36, in save_nodes
    self._db.add_operations(mapped)
  File "/home/artem/PycharmProjects/FEDOT_fork/fedot/core/caching/operations_cache_db.py", line 97, in add_operations
    cur.executemany(f'INSERT OR IGNORE INTO {self._main_table} VALUES (?, ?);', pickled)
sqlite3.InterfaceError: Error binding parameter 1 - probably unsupported type.

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/artem/PycharmProjects/FEDOT_fork/.venv/lib/python3.10/site-packages/golem/core/log.py", line 196, in log_or_raise
    raise exc from rece