## Description of the task and dataset

In [1]:
import warnings

import pandas as pd

# Installed before FEDOT is imported so that warnings raised during its import are silenced too.
warnings.filterwarnings("ignore")

from fedot.api.main import Fedot

# Input data: locations of the train and test CSV samples, given as paths relative to
# the directory the notebook is run from.
train_data_path = '../cases/data/scoring/scoring_train.csv'
test_data_path = '../cases/data/scoring/scoring_test.csv'
# Load the training sample with pandas and display its first ten rows as a quick preview.
# NOTE(review): the preview shows an 'Unnamed: 0' column whose values mirror 'ID' — this
# looks like a saved DataFrame index; confirm that neither column is meant to be used as
# a model feature.
df = pd.read_csv(train_data_path)
df.head(10)

Unnamed: 0,ID,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30.59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60.89DaysPastDueNotWorse,NumberOfDependents,target
0,0,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0,1
1,1,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0,0
2,2,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0,0
3,3,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0,0
4,4,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0,0
5,5,0.213179,74,0,0.375607,3500.0,3,0,1,0,1.0,0
6,6,0.305682,57,0,5710.0,,8,0,3,0,0.0,0
7,7,0.754464,39,0,0.20994,3500.0,8,0,0,0,0.0,0
8,8,0.116951,27,0,46.0,,2,0,0,0,,0
9,9,0.189169,57,0,0.606291,23684.0,9,0,4,0,2.0,0


In [2]:
# Baseline model: a single XGBoost node fitted without any pipeline optimisation.
# (`Fedot` is imported in the import cell at the top of the notebook.)

# Task selection and initialisation of the framework
baseline_model = Fedot(problem='classification')

# Fit without optimisation - predefined_model pins the pipeline to a single XGBoost node
baseline_model.fit(features=train_data_path, target='target', predefined_model='xgboost')

# Predict class probabilities for the test sample. The result is kept in a named
# variable, mirroring how the AutoML prediction is stored later in the notebook.
# NOTE(review): get_metrics() below is called right after this prediction and presumably
# scores the prediction stored by it - keep the two calls in this order.
baseline_prediction = baseline_model.predict_proba(features=test_data_path)

# Quality metrics for the test sample (the recorded output shows 'roc_auc' and 'f1')
baseline_metrics = baseline_model.get_metrics()
print(baseline_metrics)

Fit chain from scratch
{'roc_auc': 0.823, 'f1': 0.23837209302325582}


## FEDOT AutoML for classification

In [3]:
# Separate Fedot instance that will act as the AutoML tool.
# The random seed is fixed to 42 and verbose_level=4 is requested for the run.
auto_model = Fedot(
    problem='classification',
    seed=42,
    verbose_level=4,
)

In [4]:
# Launch the AutoML-based model generation on the training sample;
# whatever fit() returns is kept under the name `pipeline`.
pipeline = auto_model.fit(
    features=train_data_path,
    target='target',
)

Composition started. Parameters tuning: True. Set of candidate models: ['logit', 'lda', 'qda', 'dt', 'rf', 'knn', 'xgboost', 'bernb', 'scaling', 'normalization', 'simple_imputation', 'pca', 'kernel_pca', 'poly_features', 'one_hot_encoding', 'rfe_lin_class', 'rfe_non_lin_class']. Composing time limit: 0:02:00
Model composition started
Chain (/n_scaling_default_params;)/n_xgboost_default_params with metrics: [-0.806]
Chain (/n_scaling_default_params;)/n_xgboost_default_params with metrics: [-0.806]
Chain (/n_scaling_default_params;)/n_xgboost_default_params with metrics: [-0.806]
Chain (/n_scaling_default_params;)/n_xgboost_default_params with metrics: [-0.806]
Chain (/n_scaling_default_params;)/n_xgboost_default_params with metrics: [-0.806]
Chain (/n_scaling_default_params;)/n_xgboost_default_params with metrics: [-0.806]
Chain (/n_scaling_default_params;)/n_xgboost_default_params with metrics: [-0.806]
Chain (/n_scaling_default_params;)/n_xgboost_default_params with metrics: [-0.806]


In [5]:
# Predict class probabilities for the test sample with the AutoML-fitted model.
prediction = auto_model.predict_proba(features=test_data_path)
# Collect the quality metrics of the AutoML model and print them.
# NOTE(review): get_metrics() is called right after predict_proba(); presumably it scores
# the prediction stored by that call, so the order of these two statements matters —
# confirm against the FEDOT API docs.
auto_metrics = auto_model.get_metrics()
print(auto_metrics)

{'roc_auc': 0.855, 'f1': 0.16883116883116883}


In [6]:
# Side-by-side ROC AUC of the single-model baseline and the AutoML solution,
# each rounded to three decimals.
# NOTE(review): only 'roc_auc' is compared here; in the recorded run the 'f1' values
# printed earlier move in the opposite direction, so consider reporting both.
for model_label, model_metrics in (
    ('Baseline', baseline_metrics),
    ('AutoML solution', auto_metrics),
):
    print(model_label, round(model_metrics['roc_auc'], 3))

Baseline 0.823
AutoML solution 0.855
