In [1]:
from auto_ml_regression.automl_pipeline import AutoMLPipeline
import pandas as pd

# Load the insurance dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('../../datasets/insurance_dataset/insurance.csv')
target_col = "charges"  # column the regressors are trained to predict
N_SEGMENTS = 2          # number of segments handed to the pipeline

# Select numeric columns positively (include='number') instead of "everything
# that is not object": with exclude='object', category / bool / datetime /
# pandas-string columns would be silently treated as numeric features.
numeric_cols = data.select_dtypes(include='number').columns.to_list()
if target_col not in numeric_cols:
    # The original list.remove() also raised ValueError in this situation;
    # keep the exception type but make the message actionable.
    raise ValueError(
        f"target column {target_col!r} must be a numeric column of the dataset"
    )

# Feature lists keep the column order of the DataFrame; the target is excluded
# without mutating a list in place.
num_features = [col for col in numeric_cols if col != target_col]
cat_features = [col for col in data.columns if col not in numeric_cols]

In [2]:
# Initialization: build the pipeline object from the target name, the two
# feature lists and the segment count defined in the first cell.
pipeline = AutoMLPipeline(
    target_col=target_col,
    num_features=num_features,
    cat_features=cat_features,
    n_segments=N_SEGMENTS,  # any number of segments can be specified here
)

In [3]:
# Start of the full pipeline run. This cell only hands the DataFrame loaded
# from the CSV to the preprocessing stage; the remaining stages are invoked
# one per cell below.
pipeline.preprocess_data(data)

In [4]:
# Segmentation stage. With use_optuna=True the recorded log shows an Optuna
# study tuning depth / learning_rate / l2_leaf_reg; n_trials is set to 30 and
# plotting is switched off.
pipeline.segment_data(
    n_trials=30,
    use_optuna=True,
    plot=False,
)


  from .autonotebook import tqdm as notebook_tqdm
[I 2025-05-17 17:09:44,068] A new study created in memory with name: no-name-c4dcd7bc-9462-4661-8105-9836b2eae383
[I 2025-05-17 17:09:46,517] Trial 0 finished with value: -11345.134452908855 and parameters: {'depth': 4, 'learning_rate': 0.09375768624750647, 'l2_leaf_reg': 5.916847090516876}. Best is trial 0 with value: -11345.134452908855.
[I 2025-05-17 17:09:47,124] Trial 1 finished with value: -10128.89097909646 and parameters: {'depth': 4, 'learning_rate': 0.27228782223260956, 'l2_leaf_reg': 6.722108719139823}. Best is trial 1 with value: -10128.89097909646.
[I 2025-05-17 17:09:47,767] Trial 2 finished with value: -11156.685922827615 and parameters: {'depth': 2, 'learning_rate': 0.1345505567823697, 'l2_leaf_reg': 8.441486401382132}. Best is trial 1 with value: -10128.89097909646.
[I 2025-05-17 17:09:47,797] Trial 3 finished with value: -10493.935851298662 and parameters: {'depth': 2, 'learning_rate': 0.2273679037617584, 'l2_leaf_reg'

In [5]:
# Classifier stage, searched with 10 Optuna trials (see the study in the log).
# Presumably the classifier predicts the segment label — the recorded report
# has two classes, matching N_SEGMENTS = 2; confirm against the library.
# NOTE(review): the recorded output shows ROC AUC = 1.0 on both train and
# test. Verify this is expected and not a sign of label leakage.
pipeline.train_classifier(n_trials=10)

[I 2025-05-17 17:09:48,414] A new study created in memory with name: no-name-6b6aa6b8-ab0f-4167-8e4c-d2369c8ad531
[I 2025-05-17 17:09:49,006] Trial 0 finished with value: 1.0 and parameters: {'iterations': 826, 'learning_rate': 0.06334905445668514, 'l2_leaf_reg': 0.2945242351729893, 'random_strength': 1.0341108441998181, 'bagging_temperature': 9.088792252493082, 'border_count': 78, 'min_data_in_leaf': 58, 'leaf_estimation_iterations': 4, 'grow_policy': 'SymmetricTree', 'od_type': 'IncToDec', 'od_wait': 31, 'one_hot_max_size': 3}. Best is trial 0 with value: 1.0.
[I 2025-05-17 17:09:58,639] Trial 1 finished with value: 1.0 and parameters: {'iterations': 1489, 'learning_rate': 0.015644245329886102, 'l2_leaf_reg': 0.0025371680270307663, 'random_strength': 0.3630166963407439, 'bagging_temperature': 1.1460520629508353, 'border_count': 173, 'min_data_in_leaf': 1, 'leaf_estimation_iterations': 13, 'grow_policy': 'Depthwise', 'od_type': 'IncToDec', 'od_wait': 32, 'one_hot_max_size': 16, 'max_d

Classification Report: TRAIN
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       450
           1       1.00      1.00      1.00       446

    accuracy                           1.00       896
   macro avg       1.00      1.00      1.00       896
weighted avg       1.00      1.00      1.00       896

ROC AUC = 1.0
TEST ROC AUC = 1.0000


In [6]:
# Regressor stage: 'huber' model type, 50 Optuna trials, cv=4.
# The recorded log shows epsilon / alpha / fit_intercept being tuned.
pipeline.train_regressors(
    model_type='huber',
    cv=4,
    n_trials=50,
)

[I 2025-05-17 17:10:01,391] A new study created in memory with name: no-name-23deca2b-edfb-43d1-ace2-fd8f213805d0
[I 2025-05-17 17:10:01,412] Trial 0 finished with value: -11448.39785436738 and parameters: {'epsilon': 5.675563982436878, 'alpha': 0.22690236165035968, 'fit_intercept': False}. Best is trial 0 with value: -11448.39785436738.
[I 2025-05-17 17:10:01,433] Trial 1 finished with value: -11023.60413122239 and parameters: {'epsilon': 8.172316251267924, 'alpha': 0.14860258876518653, 'fit_intercept': False}. Best is trial 1 with value: -11023.60413122239.
[I 2025-05-17 17:10:01,453] Trial 2 finished with value: -6982.680656449957 and parameters: {'epsilon': 6.175055964741765, 'alpha': 0.04684874424276271, 'fit_intercept': True}. Best is trial 2 with value: -6982.680656449957.
[I 2025-05-17 17:10:01,473] Trial 3 finished with value: -7355.939639724247 and parameters: {'epsilon': 8.434015360719204, 'alpha': 0.12864010783810992, 'fit_intercept': True}. Best is trial 2 with value: -698

In [7]:
# Show the fitted regressor coefficients; the recorded output has one column
# per segment and one row per feature plus an intercept_ row.
# NOTE(review): intercept_ is 0.0 for both segments in the recorded output —
# presumably fit_intercept=False was selected by the search; confirm.
pipeline.regressor.get_model_coefs()

Unnamed: 0,model_coef_segment_0,model_coef_segment_1
intercept_,0.0,0.0
age,12142.247153,12747.927445
bmi,8351.40332,3551.157631
children,1769.444153,4254.480173
sex,-45.945171,-525.683936
smoker,14715.677965,32628.040362
region,-719.451842,-66.207703


In [8]:
# Predict the target on the segmented test split. Features are passed in the
# same num_features + cat_features order used throughout the notebook, and
# the 'segment_preds' column is passed as the segment of each row.
test_target_preds = pipeline.regressor.predict(
    pipeline.segmented_test[num_features + cat_features],
    segments=pipeline.segmented_test['segment_preds'],
)

# Ground truth is looked up through target_col instead of the hard-coded
# `.charges` attribute, so this cell keeps working if the target is changed
# in the configuration cell.
test_target_true = pipeline.segmented_test[target_col]

In [9]:
from sklearn import metrics
# Score the test predictions with four regression metrics. The scorers run
# in the listed order, which matches the order of the unpacking targets.
r2, mse, mae, mape = (
    scorer(test_target_true, test_target_preds)
    for scorer in (
        metrics.r2_score,
        metrics.mean_squared_error,
        metrics.mean_absolute_error,
        metrics.mean_absolute_percentage_error,
    )
)

# One line per metric; RMSE is derived from MSE rather than computed separately.
print(
    f"R2 = {r2}",
    f"MSE = {mse}",
    f"RMSE = {mse**0.5}",
    f"MAE = {mae}",
    f"MAPE = {mape}",
    sep="\n",
)

R2 = 0.8513539575621136
MSE = 22248942.00827885
RMSE = 4716.87841779697
MAE = 2596.4377537090395
MAPE = 0.246662173987762


In [10]:
# Quality evaluation using the pipeline's own test-set evaluation.
# The recorded output (r2, mse, rmse, mae, mape) matches the metrics computed
# manually in the previous cell.
metrics_df = pipeline.evaluate_test()
metrics_df

Unnamed: 0,r2,mse,rmse,mae,mape
0,0.851354,22248940.0,4716.878418,2596.437754,0.246662
