In [1]:
import statistics
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold

import warnings
# Silence library warnings so the cell outputs stay readable.
# NOTE(review): the second call installs a blanket 'ignore' filter for every
# warning category, which makes the FutureWarning-specific filter redundant and
# also hides other warnings (e.g. convergence or deprecation notices) --
# consider narrowing it.
warnings.simplefilter(action = 'ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

In [2]:
# Load the raw wine table; the path is relative to the notebook's directory.
df = pd.read_csv(filepath_or_buffer="../data/wine.csv")

In [3]:
# Replace the labels in the `type` column with the integer codes produced by
# a freshly fitted LabelEncoder.
encoded_type = LabelEncoder().fit_transform(df['type'])
df['type'] = encoded_type

In [4]:
# Separate the target column from the features WITHOUT mutating `df`.
# The previous `df.pop('type')` removed the column in place, so running this
# cell a second time raised KeyError; selecting + dropping keeps the cell
# idempotent under re-execution.
y = df['type']
X = df.drop(columns='type')

Define some utility functions, plus the default cross-validation splitter and scoring metrics used throughout the notebook:

In [5]:
from collections import defaultdict

def print_scores(results, sorted_by=None, show_timings=False):
    """Print the mean of each cross-validation metric for every model.

    Parameters
    ----------
    results : iterable of (model_name, scores) pairs, where ``scores`` maps a
        metric name to a sequence of per-fold values.
    sorted_by : optional metric name; when given, models are listed in
        descending order of that metric's mean.
    show_timings : when False (default) only metrics whose name starts with
        ``"test_"`` are printed; when True every entry is printed.
    """
    if sorted_by:
        def mean_of_sort_metric(entry):
            return statistics.mean(entry[1][sorted_by])

        results = sorted(results, key=mean_of_sort_metric, reverse=True)

    for name, metrics in results:
        print(f"{name}:")
        for key, folds in metrics.items():
            if show_timings or key.startswith("test_"):
                print(f"{key}: {statistics.mean(folds):.6f}")

def scores_to_csv(results, round_to = 5):
    """Return a CSV string with one row per model and one column per test metric.

    Parameters
    ----------
    results : iterable of (model_name, scores) pairs, where ``scores`` maps a
        metric name to a sequence of per-fold values.
    round_to : number of decimal places the per-metric means are rounded to.

    Only metrics whose name starts with ``"test_"`` are exported; each cell
    holds the rounded mean over the folds. The index is not written.
    """
    columns = defaultdict(list)
    for name, metrics in results:
        columns["model"].append(name)
        test_metrics = {
            key: folds for key, folds in metrics.items() if key.startswith("test_")
        }
        for key, folds in test_metrics.items():
            columns[key].append(round(statistics.mean(folds), round_to))

    frame = pd.DataFrame(data=columns)
    return frame.to_csv(index=False)

In [6]:
# Metrics reported for every model in the cells that follow.
scoring = [
    'f1_weighted',
    'roc_auc_ovr_weighted',
    'balanced_accuracy',
    'matthews_corrcoef',
]

# Shared splitter: stratified 10-fold, repeated 3 times, with a fixed seed so
# every model is scored on the same folds.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

Basic decision tree classifier:

In [7]:
from sklearn.tree import DecisionTreeClassifier

# Score a default decision tree with the shared splitter and metrics.
model = DecisionTreeClassifier(random_state=0)
scores = cross_validate(estimator=model, X=X, y=y, scoring=scoring, cv=cv, n_jobs=-1)

print_scores([(type(model).__name__, scores)])

DecisionTreeClassifier:
test_f1_weighted: 0.986412
test_roc_auc_ovr_weighted: 0.982428
test_balanced_accuracy: 0.982137
test_matthews_corrcoef: 0.963454


In [24]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier

# NOTE(review): the saved output of this cell is a TypeError about
# GradientBoostingClassifier / l2_regularization, an argument this code never
# passes -- the output looks stale and the cell should be re-run.

# Candidate models, evaluated with the shared `cv` splitter and `scoring` list.
classifiers = [
    LogisticRegression(random_state=0, multi_class='ovr'),
    DecisionTreeClassifier(random_state=0),
    GaussianNB(),
    KNeighborsClassifier(),
    MLPClassifier(random_state=0),
    LinearDiscriminantAnalysis(),
    SVC(random_state=0, kernel='linear',probability=True),
    RandomForestClassifier(random_state=0),
    ExtraTreesClassifier(random_state=0),
]

# Collect [class name, cross_validate scores] for each candidate.
results = []
for model in classifiers:
    scores = cross_validate(model, X, y, scoring=scoring, n_jobs=-1, cv=cv)
    model_name = type(model).__name__
    results.append([model_name, scores])

# For a listing ranked by weighted F1 instead of CSV, call
# print_scores(results, 'test_f1_weighted').
csv = scores_to_csv(results)
print(csv)

TypeError: GradientBoostingClassifier.__init__() got an unexpected keyword argument 'l2_regularization'

In [25]:
%%time
#from autosklearn.experimental.askl2 import AutoSklearn2Classifier
import autosklearn.classification
from autosklearn.metrics import f1_weighted, log_loss
from sklearn.metrics import roc_auc_score, matthews_corrcoef

def roc_auc_ovr_weighted(y_true, y_score):
    return roc_auc_score(y_true, y_score, average="weighted", multi_class="ovr")

rocauc = autosklearn.metrics.make_scorer('roc_auc_ovr_weighted', roc_auc_ovr_weighted)                                                                                            

mcc = autosklearn.metrics.make_scorer('matthews_corrcoef', matthews_corrcoef)

#automl = autosklearn.classification.AutoSklearnClassifier(
#    metric=mcc,
#    time_left_for_this_task=240,
#    per_run_time_limit=30,
#    n_jobs=-1,
    #n_jobs=16,
    #memory_limit=2048,
#    tmp_folder="/tmp/autosklearn_resampling_example_tmp",
#    disable_evaluator_output=False,
#    resampling_strategy=cv,
    #scoring_functions=[f1_weighted, rocauc, log_loss]
#)

import autosklearn.classification
import sklearn.metrics

#scorer = autosklearn.metrics.make_scorer('f1_score', sklearn.metrics.f1_score, pos_label=0)

automl = autosklearn.classification.AutoSklearnClassifier(
    metric=mcc,
    time_left_for_this_task=240,
    per_run_time_limit=30,
    n_jobs=-1,
    #memory_limit=2048,
    tmp_folder="/tmp/autosklearn_resampling_example_tmp",
    disable_evaluator_output=False,
    resampling_strategy="cv",
    resampling_strategy_arguments={"folds": 10},
)

#automl = AutoSklearn2Classifier(resampling_strategy=cv)

automl.fit(X, y, dataset_name="wine")

print(automl.sprint_statistics())
print(automl.leaderboard())


auto-sklearn results:
  Dataset name: wine
  Metric: matthews_corrcoef
  Best validation score: 0.991713
  Number of target algorithm runs: 152
  Number of successful target algorithm runs: 123
  Number of crashed target algorithm runs: 11
  Number of target algorithms that exceeded the time limit: 18
  Number of target algorithms that exceeded the memory limit: 0

          rank  ensemble_weight               type      cost   duration
model_id                                                               
144          1             0.20  gradient_boosting  0.008287   6.761546
133          2             0.06                lda  0.008313  25.652464
132          3             0.02  gradient_boosting  0.009539  10.561324
149          4             0.02  gradient_boosting  0.010365   8.346216
129          5             0.04  gradient_boosting  0.010374   8.027264
20           6             0.02                lda  0.010388   6.140369
51           7             0.02           adaboost  0.01

In [13]:
# Inspect one entry of the fitted ensemble.
# NOTE(review): the saved output for this cell reports 'model_id': 24, so
# confirm what the key 2 refers to for the current `automl` run.
ensemble_models = automl.show_models()
ensemble_models[2]

{'model_id': 24,
 'rank': 2,
 'cost': 0.0033861782361089723,
 'ensemble_weight': 0.04,
 'voting_model': VotingClassifier(estimators=None, voting='soft'),
 'estimators': [{'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice at 0x7f12181219f0>,
   'balancing': Balancing(random_state=1, strategy='weighting'),
   'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice at 0x7f118f8f3910>,
   'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice at 0x7f118f8f27d0>,
   'sklearn_classifier': ExtraTreesClassifier(criterion='entropy', max_features=10, min_samples_leaf=2,
                        min_samples_split=6, n_estimators=512, n_jobs=1,
                        random_state=1, warm_start=True)},
  {'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice at 0x7f11b06bb670>,
   'balancing': Balancing(random_state=1, strategy='weighting')

In [15]:
%%time
from pprint import pprint
from sklearn.metrics import recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier, RandomForestClassifier, HistGradientBoostingClassifier

#v = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

#coring = ['f1','precision_macro', 'recall_macro', 'balanced_accuracy']

classifiers = [
    HistGradientBoostingClassifier(early_stopping=True,
                                  l2_regularization=0.0002845813444661551,
                                  learning_rate=0.5763871990263213, max_iter=32,
                                  max_leaf_nodes=313, min_samples_leaf=200,
                                  random_state=1,
                                  validation_fraction=0.0984819561748849,
                                  warm_start=True),
    
    HistGradientBoostingClassifier(early_stopping=True, l2_regularization=1e-10,
                                  learning_rate=0.061183590203803505, loss='auto',
                                  max_iter=512, max_leaf_nodes=14,
                                  min_samples_leaf=31, n_iter_no_change=9,
                                  random_state=1, validation_fraction=None,
                                  warm_start=True),
    
    HistGradientBoostingClassifier(early_stopping=True,
                                  l2_regularization=0.0024034282201116797,
                                  learning_rate=0.49328674977232706, max_iter=128,
                                  max_leaf_nodes=9, min_samples_leaf=95,
                                  n_iter_no_change=2, random_state=1,
                                  validation_fraction=None, warm_start=True),
    
    AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=2),
                      learning_rate=0.949552173721954, n_estimators=256,
                      random_state=1),
    
    ExtraTreesClassifier(criterion='entropy', max_features=10, min_samples_leaf=2,
                        min_samples_split=6, n_estimators=512, n_jobs=1,
                        random_state=1, warm_start=True)
]

results = []
for model in classifiers:
    model_name = model.__class__.__name__
    scores = cross_validate(model, X, y, scoring=scoring, n_jobs=-1, cv=cv)
    results.append([model_name, scores])

print_scores(results, 'test_f1_weighted')
csv = scores_to_csv(results)
print(csv)



HistGradientBoostingClassifier:
test_f1_weighted: 0.996353
test_roc_auc_ovr_weighted: 0.998909
test_balanced_accuracy: 0.994075
test_matthews_corrcoef: 0.990185
HistGradientBoostingClassifier:
test_f1_weighted: 0.996042
test_roc_auc_ovr_weighted: 0.998866
test_balanced_accuracy: 0.993309
test_matthews_corrcoef: 0.989353
HistGradientBoostingClassifier:
test_f1_weighted: 0.995943
test_roc_auc_ovr_weighted: 0.998842
test_balanced_accuracy: 0.993801
test_matthews_corrcoef: 0.989083
AdaBoostClassifier:
test_f1_weighted: 0.995888
test_roc_auc_ovr_weighted: 0.998644
test_balanced_accuracy: 0.993348
test_matthews_corrcoef: 0.988937
ExtraTreesClassifier:
test_f1_weighted: 0.995784
test_roc_auc_ovr_weighted: 0.999195
test_balanced_accuracy: 0.992717
test_matthews_corrcoef: 0.988651
model,test_f1_weighted,test_roc_auc_ovr_weighted,test_balanced_accuracy,test_matthews_corrcoef
HistGradientBoostingClassifier,0.99594,0.99884,0.9938,0.98908
HistGradientBoostingClassifier,0.99604,0.99887,0.99331,0.989

In [26]:
# Dump the auto-sklearn search history to disk for offline inspection.
cv_results_frame = pd.DataFrame(automl.cv_results_)
cv_results_frame.to_csv("automl2.csv")

In [17]:
# Pretty-print one pipeline configuration. The keys look like auto-sklearn
# hyperparameter names -- presumably copied from the search results; confirm
# provenance. pprint sorts the keys, so the order written here does not affect
# the printed output.
gb_pipeline_config = {
    'balancing:strategy': 'none',
    'classifier:__choice__': 'gradient_boosting',
    'data_preprocessor:__choice__': 'feature_type',
    'feature_preprocessor:__choice__': 'polynomial',
    'classifier:gradient_boosting:early_stop': 'train',
    'classifier:gradient_boosting:l2_regularization': 2.51153447372028e-10,
    'classifier:gradient_boosting:learning_rate': 0.08568989054310866,
    'classifier:gradient_boosting:loss': 'auto',
    'classifier:gradient_boosting:max_bins': 255,
    'classifier:gradient_boosting:max_depth': 'None',
    'classifier:gradient_boosting:max_leaf_nodes': 21,
    'classifier:gradient_boosting:min_samples_leaf': 77,
    'classifier:gradient_boosting:scoring': 'loss',
    'classifier:gradient_boosting:tol': 1e-07,
    'data_preprocessor:feature_type:numerical_transformer:imputation:strategy': 'median',
    'data_preprocessor:feature_type:numerical_transformer:rescaling:__choice__': 'minmax',
    'feature_preprocessor:polynomial:degree': 2,
    'feature_preprocessor:polynomial:include_bias': 'False',
    'feature_preprocessor:polynomial:interaction_only': 'True',
    'classifier:gradient_boosting:n_iter_no_change': 1,
}
pprint(gb_pipeline_config)


{'balancing:strategy': 'none',
 'classifier:__choice__': 'gradient_boosting',
 'classifier:gradient_boosting:early_stop': 'train',
 'classifier:gradient_boosting:l2_regularization': 2.51153447372028e-10,
 'classifier:gradient_boosting:learning_rate': 0.08568989054310866,
 'classifier:gradient_boosting:loss': 'auto',
 'classifier:gradient_boosting:max_bins': 255,
 'classifier:gradient_boosting:max_depth': 'None',
 'classifier:gradient_boosting:max_leaf_nodes': 21,
 'classifier:gradient_boosting:min_samples_leaf': 77,
 'classifier:gradient_boosting:n_iter_no_change': 1,
 'classifier:gradient_boosting:scoring': 'loss',
 'classifier:gradient_boosting:tol': 1e-07,
 'data_preprocessor:__choice__': 'feature_type',
 'data_preprocessor:feature_type:numerical_transformer:imputation:strategy': 'median',
 'data_preprocessor:feature_type:numerical_transformer:rescaling:__choice__': 'minmax',
 'feature_preprocessor:__choice__': 'polynomial',
 'feature_preprocessor:polynomial:degree': 2,
 'feature_pr