In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

In [2]:
data_path = "airlines_train_regression_10M.csv"
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,DepDelay,Month,DayofMonth,DayOfWeek,CRSDepTime,CRSArrTime,UniqueCarrier,Origin,Dest,Distance
0,2.0,10.0,30.0,5.0,900.0,1152.0,AA,JFK,LAX,2475.0
1,12.0,10.0,4.0,7.0,1300.0,1535.0,AA,LAX,HNL,2556.0
2,8.0,10.0,11.0,7.0,1300.0,1535.0,AA,LAX,HNL,2556.0
3,-1.0,10.0,29.0,4.0,2310.0,613.0,AA,HNL,LAX,2556.0
4,0.0,10.0,3.0,6.0,2035.0,2110.0,AA,OGG,HNL,100.0


In [3]:
y = df["DepDelay"]
X = df.drop("DepDelay", axis=1)
categorical_features = X.select_dtypes(include=["object", "category"]).columns
for col in categorical_features:
    X[col] = LabelEncoder().fit_transform(X[col].astype(str))
X.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,CRSDepTime,CRSArrTime,UniqueCarrier,Origin,Dest,Distance
0,10.0,30.0,5.0,900.0,1152.0,1,184,194,2475.0
1,10.0,4.0,7.0,1300.0,1535.0,1,194,156,2556.0
2,10.0,11.0,7.0,1300.0,1535.0,1,194,156,2556.0
3,10.0,29.0,4.0,2310.0,613.0,1,156,194,2556.0
4,10.0,3.0,6.0,2035.0,2110.0,1,253,156,100.0


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_mean = np.full_like(y_test, fill_value=y_train.mean(), dtype=float)
rmse_with_dummy_model = np.sqrt(mean_squared_error(y_test, y_mean))
print(f"RMSE with dummy model: {rmse_with_dummy_model:.2f}")

RMSE with dummy model: 29.40


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(
    n_estimators=10,
    random_state=42,
    n_jobs=-1,
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse:.2f}")

RMSE: 31.28


In [21]:
X_sub, _, y_sub, _ = train_test_split(X_train, y_train, test_size=0.85, random_state=42)
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
)
model.fit(X_sub, y_sub)
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse:.2f}")

RMSE: 29.87


### Итоги:

Dummy модель, которая выдает всегда среднее значение target переменной, показывает лучший результат из 3 моделей выше. Датасет слишком большой, чтобы на нем полностью тренироваться с нормальным значением n_estimators, а также чтобы использовать на нем OneHotEncoding. Если тренировать RandomForestRegressor с n_estimators=100 на небольшой части тренировочных данных, то результат получается лучше, чем если тренировать его с n_estimators=10, но на всем тренировочном датасете.

# FEDOT launches

In [5]:
import warnings
from fedot.core.pipelines.node import PipelineNode
from fedot.core.pipelines.pipeline import Pipeline
from fedot.core.data.data import InputData
from fedot.core.repository.tasks import Task, TaskTypesEnum
from fedot.core.repository.dataset_types import DataTypesEnum

warnings.filterwarnings("ignore")

In [8]:
train_input = InputData(
    features=X_train.to_numpy(dtype=np.float32),
    target=y_train.to_numpy(dtype=np.float32),
    idx=np.arange(len(X_train)),
    task=Task(TaskTypesEnum.regression),
    data_type=DataTypesEnum.table
)
node_rfr = PipelineNode('rfr')
node_rfr.parameters = {'n_estimators': 10, 'n_jobs': -1, 'random_state': 42}
pipeline = Pipeline(node_rfr, use_input_preprocessing=False)
pipeline.fit(train_input)

OutputData(idx=array([      0,       1,       2, ..., 7999997, 7999998, 7999999]), task=Task(task_type=<TaskTypesEnum.regression: 'regression'>, task_params=None), data_type=<DataTypesEnum.table: 'feature_table'>, features=array([[1.000e+00, 2.400e+01, 6.000e+00, ..., 2.130e+02, 1.020e+02,
        9.800e+01],
       [6.000e+00, 2.300e+01, 4.000e+00, ..., 7.100e+01, 3.330e+02,
        4.870e+02],
       [2.000e+00, 1.400e+01, 4.000e+00, ..., 3.240e+02, 9.200e+01,
        3.810e+02],
       ...,
       [3.000e+00, 2.000e+01, 1.000e+00, ..., 1.660e+02, 2.560e+02,
        5.880e+02],
       [9.000e+00, 1.100e+01, 7.000e+00, ..., 1.920e+02, 2.260e+02,
        2.175e+03],
       [6.000e+00, 8.000e+00, 3.000e+00, ..., 4.400e+01, 1.990e+02,
        1.850e+02]]), categorical_features=None, categorical_idx=None, numerical_idx=None, encoded_idx=None, features_names=None, target=array([-2., -1., 19., ..., -4., -2.,  1.]), supplementary_data=SupplementaryData(is_main_target=True, data_flow_length=0

In [9]:
test_input = InputData(
    features=X_test.to_numpy(dtype=np.float32),
    target=y_test.to_numpy(dtype=np.float32),
    idx=np.arange(len(X_train)),
    task=Task(TaskTypesEnum.regression),
    data_type=DataTypesEnum.table
)
prediction = pipeline.predict(input_data=test_input)
rmse_fedot = np.sqrt(mean_squared_error(test_input.target, prediction.predict))
print(f"RMSE FEDOT: {rmse_fedot:.2f}")

RMSE FEDOT: 31.24


В итоге получили примерно те же результаты за то же время, что и без FEDOT обертки. Если поставить n_estimators=100, то в FEDOT обертке примерно через час работы также возникнет ошибка выделения памяти.