In [12]:
import pandas as pd
from fedot.api.main import Fedot
from fedot.core.pipelines.pipeline_builder import PipelineBuilder

In [13]:
path = "../datasets-offline/playground-series-s3e23"

train = pd.read_csv(f'{path}/train.csv')
test = pd.read_csv(f'{path}/test.csv')
sub = pd.read_csv(f'{path}/sample_submission.csv')

In [14]:
train.drop(columns=["id"], inplace=True)
test.drop(columns=["id"], inplace=True)

In [15]:
from sklearn.model_selection import train_test_split

train, valid = train_test_split(train, train_size=0.85, random_state=42)

In [16]:
auto_model = Fedot(
    problem="classification",
    metric=["precision", "accuracy", "roc_auc"],
    preset="best_quality",
    with_tuning=True,
    timeout=5,
    cv_folds=10,
    seed=42,
    n_jobs=4,
    logging_level=10,
    initial_assumption=PipelineBuilder()
    .add_node(
        "xgboost",
    )
    .build(),
    use_pipelines_cache=False,
    use_auto_preprocessing=False
)

In [17]:
auto_model.fit(features=train, target="defects")

2024-06-24 12:39:15,124 - AssumptionsHandler - Initial pipeline fitting started
2024-06-24 12:39:15,136 - PipelineNode - Trying to fit pipeline node with operation: xgboost
2024-06-24 12:39:16,956 - PipelineNode - Obtain prediction in pipeline node by operation: xgboost
2024-06-24 12:39:17,003 - AssumptionsHandler - Initial pipeline was fitted successfully
2024-06-24 12:39:17,006 - AssumptionsHandler - Memory consumption for fitting of the initial pipeline in main session: current 43.4 MiB, max: 111.7 MiB
2024-06-24 12:39:17,008 - ApiComposer - Initial pipeline was fitted in 3.8 sec.
2024-06-24 12:39:17,019 - ApiComposer - AutoML configured. Parameters tuning: True. Time limit: 5 min. Set of candidate models: ['bernb', 'catboost', 'dt', 'fast_ica', 'isolation_forest_class', 'knn', 'lgbm', 'logit', 'mlp', 'normalization', 'pca', 'poly_features', 'qda', 'resample', 'rf', 'scaling'].
2024-06-24 12:39:17,043 - type - Random State: random.getstate() follows...
(3, (2147483648, 3564348608, 1

Generations:   0%|          | 0/10000 [00:00<?, ?gen/s]

2024-06-24 12:39:17,070 - MultiprocessingDispatcher - Number of used CPU's: 4
2024-06-24 12:39:33,426 - Unknown integration target: 
2024-06-24 12:39:36,499 - Unknown integration target: 
2024-06-24 12:39:39,630 - Unknown integration target: 
2024-06-24 12:39:42,698 - Unknown integration target: 
2024-06-24 12:39:45,913 - Unknown integration target: 
2024-06-24 12:39:52,744 - Unknown integration target: 
2024-06-24 12:39:59,607 - Unknown integration target: 
2024-06-24 12:40:06,659 - Unknown integration target: 
2024-06-24 12:40:07,294 - Unknown integration target: 
2024-06-24 12:40:07,294 - Unknown integration target: 
2024-06-24 12:40:29,001 - MultiprocessingDispatcher - 1 individuals out of 1 in previous population were evaluated successfully.
2024-06-24 12:40:29,011 - EvoGraphOptimizer - Generation num: 1 size: 1
2024-06-24 12:40:29,014 - EvoGraphOptimizer - Best individuals: ParetoFront archive fitness (1): ['<precision=-0.638 accuracy=-0.811 roc_auc=-0.783>']
2024-06-24 12:40:29,

Generations:   0%|          | 0/10000 [03:06<?, ?gen/s]

2024-06-24 12:42:23,219 - OptimisationTimer - Composition time: 3.103 min
2024-06-24 12:42:23,222 - OptimisationTimer - Algorithm was terminated due to processing time limit
2024-06-24 12:42:23,235 - EvoGraphOptimizer - Generation num: 3 size: 4
2024-06-24 12:42:23,239 - EvoGraphOptimizer - Best individuals: ParetoFront archive fitness (4): ['<precision=-0.725 accuracy=-0.803 roc_auc=-0.788>', '<precision=-0.703 accuracy=-0.809 roc_auc=-0.792>', '<precision=-0.656 accuracy=-0.814 roc_auc=-0.790>', '<precision=-0.649 accuracy=-0.813 roc_auc=-0.790>']
2024-06-24 12:42:23,240 - EvoGraphOptimizer - no improvements for 1 iterations
2024-06-24 12:42:23,243 - EvoGraphOptimizer - spent time: 3.1 min
2024-06-24 12:42:23,250 - GPComposer - GP composition finished
2024-06-24 12:42:23,258 - DataSourceSplitter - K-folds cross validation is applied.
2024-06-24 12:42:23,261 - ApiComposer - Hyperparameters tuning started with 2 min. timeout
2024-06-24 12:42:23,265 - SimultaneousTuner - Hyperparameters




2024-06-24 12:42:25,243 - PipelineNode - Obtain prediction in pipeline node by operation: xgboost
2024-06-24 12:42:25,302 - PipelineNode - Trying to fit pipeline node with operation: xgboost
2024-06-24 12:42:27,235 - PipelineNode - Obtain prediction in pipeline node by operation: xgboost
2024-06-24 12:42:27,295 - PipelineNode - Trying to fit pipeline node with operation: xgboost
2024-06-24 12:42:29,218 - PipelineNode - Obtain prediction in pipeline node by operation: xgboost
2024-06-24 12:42:29,265 - PipelineNode - Trying to fit pipeline node with operation: xgboost
2024-06-24 12:42:31,175 - PipelineNode - Obtain prediction in pipeline node by operation: xgboost
2024-06-24 12:42:31,222 - PipelineNode - Trying to fit pipeline node with operation: xgboost
2024-06-24 12:42:33,128 - PipelineNode - Obtain prediction in pipeline node by operation: xgboost
2024-06-24 12:42:33,174 - PipelineNode - Trying to fit pipeline node with operation: xgboost
2024-06-24 12:42:35,093 - PipelineNode - Obta

{'depth': 1, 'length': 1, 'nodes': [xgboost]}

In [18]:
prediction = auto_model.predict_proba(features=test)

2024-06-24 12:44:17,078 - PipelineNode - Obtain prediction in pipeline node by operation: xgboost


In [19]:
sub.Target = prediction.ravel()
sub.to_csv("submission.csv", index=False)

In [20]:
prediction_valid = auto_model.predict_proba(features=valid)

2024-06-24 12:44:17,343 - PipelineNode - Obtain prediction in pipeline node by operation: xgboost


In [22]:
from sklearn.metrics import roc_auc_score

roc_auc_score(valid.defects, prediction_valid)

0.7846421211975648