In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier,
                              RandomForestClassifier, ExtraTreesClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.datasets import load_digits

from tqdm import tqdm

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats.distributions import randint

In [2]:
# Load the digits dataset and hold out 20% of the samples as a fixed test set.
dataset = load_digits()
X = dataset['data']
y = dataset['target']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [3]:
def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    """Build class-probability meta-features for a single classifier.

    For every split produced by ``cv`` a fresh clone of ``clf`` is fitted on
    the training part of the split and its ``predict_proba`` output is stored
    for the held-out part, so each training row is described by a model that
    was not fitted on it. A final clone fitted on the whole training set
    produces the meta-features for ``X_test``.

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``predict_proba``; it is only cloned,
        never fitted itself.
    X_train, y_train : training data, indexable with integer index arrays.
    X_test : data for which test meta-features are computed.
    cv : splitter whose ``split(X, y)`` yields ``(train_idx, predict_idx)``.

    Returns
    -------
    (X_meta_train, X_meta_test) : ``X_meta_train`` is a float32 array of shape
        ``(len(y_train), n_classes)``; ``X_meta_test`` is whatever
        ``predict_proba(X_test)`` returns.
    """
    classes = np.unique(y_train)
    X_meta_train = np.zeros((len(y_train), len(classes)), dtype=np.float32)

    # y_train is forwarded so that stratified splitters work as well; the
    # notebook's plain KFold is called with the same data and simply gets y too.
    for train_fold_index, predict_fold_index in cv.split(X_train, y_train):
        folded_clf = clone(clf)
        folded_clf.fit(X_train[train_fold_index], y_train[train_fold_index])

        fold_proba = folded_clf.predict_proba(X_train[predict_fold_index])

        # The training part of a split may lack some class, in which case
        # predict_proba has fewer columns. Place each column under the class
        # it belongs to and leave the missing classes at probability 0.
        fold_classes = getattr(folded_clf, "classes_", classes)
        columns = np.searchsorted(classes, fold_classes)
        X_meta_train[np.ix_(predict_fold_index, columns)] = fold_proba

    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)

    X_meta_test = meta_clf.predict_proba(X_test)

    return X_meta_train, X_meta_test

In [4]:
def generate_meta_features(classifiers, X_train, X_test, y_train, cv):
    """Compute meta-features for several classifiers and join them column-wise.

    ``compute_meta_feature`` is called once per classifier (with a tqdm
    progress bar); the resulting train blocks are stacked horizontally in
    classifier order, and likewise the test blocks.

    Returns
    -------
    (stacked_features_train, stacked_features_test)
    """
    train_blocks = []
    test_blocks = []

    for estimator in tqdm(classifiers):
        train_block, test_block = compute_meta_feature(
            estimator, X_train, X_test, y_train, cv
        )
        train_blocks.append(train_block)
        test_blocks.append(test_block)

    stacked_features_train = np.hstack(train_blocks)
    stacked_features_test = np.hstack(test_blocks)
    return stacked_features_train, stacked_features_test

In [5]:
# Shuffled 10-fold splitter used for the out-of-fold meta-features below.
cv = KFold(n_splits=10, shuffle=True, random_state=42)


def compute_metric(clf, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
    """Fit ``clf`` and return its macro-averaged F1 score, rounded to 6 decimals.

    The estimator is fitted in place on ``(X_train, y_train)``, so callers do
    not need to fit it beforehand. Its predictions for ``X_test`` are scored
    against ``y_test``.

    All data arguments default to the notebook-level split. Python evaluates
    the defaults once, when this cell runs, so re-run the cell after changing
    the split.

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``predict``.
    X_train, y_train : data used for fitting.
    X_test : data to predict on.
    y_test : true labels for ``X_test``. Previously this was always read from
        the global namespace; it is now an explicit, optional argument.
    """
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    return np.round(f1_score(y_test, y_test_pred, average='macro'), 6)

In [14]:
# Seed NumPy's global RNG before building the meta-features.
np.random.seed(42)

# Base models: two saga logistic regressions (L1/one-vs-rest and L2/multinomial),
# a random forest and gradient boosting.
stacked_features_train, stacked_features_test = generate_meta_features(
    classifiers=[
        LogisticRegression(
            C=0.001, penalty='l1', solver='saga', max_iter=2000,
            multi_class='ovr', random_state=42,
        ),
        LogisticRegression(
            C=0.001, penalty='l2', solver='saga', max_iter=2000,
            multi_class='multinomial', random_state=42,
        ),
        RandomForestClassifier(n_estimators=300, random_state=42),
        GradientBoostingClassifier(n_estimators=200, random_state=42),
    ],
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    cv=cv,
)

100%|██████████| 4/4 [02:35<00:00, 38.99s/it]


In [15]:
# Meta-model: unpenalised logistic regression on the stacked class probabilities.
# compute_metric() fits the estimator itself, so the separate clf.fit(...) call was
# redundant: it trained the same model twice on the same data.
# NOTE(review): newer scikit-learn releases may expect penalty=None instead of
# the string "none" - confirm against the installed version.
clf = LogisticRegression(multi_class='auto', solver='lbfgs', penalty="none")
compute_metric(clf, X_train=stacked_features_train, y_train=y_train, X_test=stacked_features_test)

0.978797

In [16]:
# Base models: random forest and extra trees.
stacked_features_train, stacked_features_test = generate_meta_features(
    classifiers=[
        RandomForestClassifier(n_estimators=300, random_state=42),
        ExtraTreesClassifier(n_estimators=200, random_state=42),
    ],
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    cv=cv,
)

100%|██████████| 2/2 [00:14<00:00,  7.23s/it]


In [17]:
# Meta-model on the current stacked features. compute_metric() performs the fit,
# so the explicit clf.fit(...) that used to precede it was a redundant second training.
clf = LogisticRegression(multi_class='auto', solver='lbfgs', penalty="none")
compute_metric(clf, X_train=stacked_features_train, y_train=y_train, X_test=stacked_features_test)

0.982421

In [18]:
# Base models: k-nearest neighbours (default settings) and extra trees.
stacked_features_train, stacked_features_test = generate_meta_features(
    [
        KNeighborsClassifier(),
        ExtraTreesClassifier(n_estimators=300, n_jobs=-1, random_state=42),
    ],
    X_train,
    X_test,
    y_train,
    cv,
)


100%|██████████| 2/2 [00:05<00:00,  2.76s/it]


In [19]:
# Meta-model on the current stacked features. The fit happens inside
# compute_metric(); the former extra clf.fit(...) trained the model twice and
# therefore also emitted the lbfgs convergence warning twice.
# NOTE(review): lbfgs stops at its iteration limit here - consider raising
# max_iter or adding a penalty, and re-check the score if you do.
clf = LogisticRegression(multi_class='auto', solver='lbfgs', penalty="none")
compute_metric(clf, X_train=stacked_features_train, y_train=y_train, X_test=stacked_features_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.989904

In [20]:
# Base models: L1 one-vs-rest logistic regression, k-nearest neighbours,
# extra trees and AdaBoost.
stacked_features_train, stacked_features_test = generate_meta_features(
    [
        LogisticRegression(
            C=0.001, penalty='l1', solver='saga', max_iter=2000,
            multi_class='ovr', random_state=42,
        ),
        KNeighborsClassifier(),
        ExtraTreesClassifier(n_estimators=300, n_jobs=-1, random_state=42),
        AdaBoostClassifier(random_state=42),
    ],
    X_train,
    X_test,
    y_train,
    cv,
)

100%|██████████| 4/4 [00:40<00:00, 10.21s/it]


In [21]:
# Meta-model on the current stacked features; compute_metric() does the fitting,
# so no separate (duplicate) clf.fit(...) call is made.
clf = LogisticRegression(multi_class='auto', solver='lbfgs', penalty="none")
compute_metric(clf, X_train=stacked_features_train, y_train=y_train, X_test=stacked_features_test)

0.987404

In [30]:
def compute_meta_feature_strat(clf, X_train, X_test, y_train, cv):
    """Out-of-fold probability meta-features using a label-aware splitter.

    ``cv.split`` receives both ``X_train`` and ``y_train``, so splitters that
    need the labels (for example ``StratifiedKFold``) can be used. For each
    split a clone of ``clf`` is fitted on the training indices and its
    ``predict_proba`` output is stored for the held-out indices. A further
    clone fitted on the full training set yields the test meta-features.

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``predict_proba``; only clones of it
        are fitted.
    X_train, y_train : training data, indexable with integer index arrays.
    X_test : data for which test meta-features are computed.
    cv : splitter whose ``split(X, y)`` yields ``(train_idx, predict_idx)``.

    Returns
    -------
    (X_meta_train, X_meta_test) : ``X_meta_train`` is float32 with one column
        per distinct label in ``y_train``; ``X_meta_test`` is the raw
        ``predict_proba(X_test)`` output of the full-data clone.
    """
    all_classes = np.unique(y_train)
    X_meta_train = np.zeros((len(y_train), len(all_classes)), dtype=np.float32)

    for fit_idx, holdout_idx in cv.split(X_train, y_train):
        fold_model = clone(clf)
        fold_model.fit(X_train[fit_idx], y_train[fit_idx])
        holdout_proba = fold_model.predict_proba(X_train[holdout_idx])

        # A class with very few samples can be missing from a split's training
        # part, and predict_proba then returns fewer columns. Map each returned
        # column to the position of its class; missing classes stay at 0.
        seen_classes = getattr(fold_model, "classes_", all_classes)
        target_cols = np.searchsorted(all_classes, seen_classes)
        X_meta_train[np.ix_(holdout_idx, target_cols)] = holdout_proba

    full_model = clone(clf)
    full_model.fit(X_train, y_train)
    X_meta_test = full_model.predict_proba(X_test)

    return X_meta_train, X_meta_test

In [32]:
def generate_meta_features_strat(classifiers, X_train, X_test, y_train, cv):
    """Stack the ``compute_meta_feature_strat`` output of several classifiers.

    Each classifier contributes one block of train meta-features and one block
    of test meta-features; blocks are joined horizontally in the order of
    ``classifiers``. Progress is reported through tqdm.

    Returns
    -------
    (stacked_features_train, stacked_features_test)
    """
    per_model = [
        compute_meta_feature_strat(model, X_train, X_test, y_train, cv)
        for model in tqdm(classifiers)
    ]

    train_parts = [train_part for train_part, _ in per_model]
    test_parts = [test_part for _, test_part in per_model]

    return np.hstack(train_parts), np.hstack(test_parts)

In [35]:
# Switch to a shuffled, stratified 10-fold splitter for the meta-features.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Base models: random forest and extra trees.
stacked_features_train, stacked_features_test = generate_meta_features_strat(
    [
        RandomForestClassifier(n_estimators=300, random_state=42),
        ExtraTreesClassifier(n_estimators=300, n_jobs=-1, random_state=42),
    ],
    X_train,
    X_test,
    y_train,
    cv,
)

100%|██████████| 2/2 [00:14<00:00,  7.28s/it]


In [36]:
# Meta-model on the stratified meta-features. compute_metric() already fits
# the estimator, so the duplicate clf.fit(...) call has been dropped.
clf = LogisticRegression(multi_class='auto', solver='lbfgs', penalty="none")
compute_metric(clf, X_train=stacked_features_train, y_train=y_train, X_test=stacked_features_test)

0.983918

In [37]:
# Same two base models, but with 20 stratified folds.
cv = StratifiedKFold(n_splits=20, shuffle=True, random_state=42)

stacked_features_train, stacked_features_test = generate_meta_features_strat([
    RandomForestClassifier(n_estimators=300, random_state=42),
    ExtraTreesClassifier(n_estimators=300, n_jobs=-1, random_state=42),
], X_train, X_test, y_train, cv=cv)

# compute_metric() fits the meta-model, so it is no longer fitted a second time
# beforehand (the double fit also printed the lbfgs convergence warning twice).
# NOTE(review): lbfgs stops at its iteration limit here - consider raising
# max_iter or adding a penalty, and re-check the score if you do.
clf = LogisticRegression(multi_class='auto', solver='lbfgs', penalty="none")
compute_metric(clf, X_train=stacked_features_train, y_train=y_train, X_test=stacked_features_test)

100%|██████████| 2/2 [00:30<00:00, 15.47s/it]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.984228

In [38]:
# Five stratified folds; random forest + extra trees as base models.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

stacked_features_train, stacked_features_test = generate_meta_features_strat([
    RandomForestClassifier(n_estimators=300, random_state=42),
    ExtraTreesClassifier(n_estimators=300, n_jobs=-1, random_state=42),
], X_train, X_test, y_train, cv=cv)

# Meta-model: random forest. compute_metric() fits it, so the former extra
# clf.fit(...) (a second identical training run) has been removed.
clf = RandomForestClassifier(random_state=42)
compute_metric(clf, X_train=stacked_features_train, y_train=y_train, X_test=stacked_features_test)

100%|██████████| 2/2 [00:07<00:00,  3.99s/it]


0.981661

In [39]:
# Five stratified folds; random forest + extra trees as base models.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

stacked_features_train, stacked_features_test = generate_meta_features_strat([
    RandomForestClassifier(n_estimators=300, random_state=42),
    ExtraTreesClassifier(n_estimators=300, n_jobs=-1, random_state=42),
], X_train, X_test, y_train, cv=cv)

# Meta-model: k-nearest neighbours. The fit is done inside compute_metric(),
# so the redundant clf.fit(...) before it has been removed.
clf = KNeighborsClassifier()
compute_metric(clf, X_train=stacked_features_train, y_train=y_train, X_test=stacked_features_test)

100%|██████████| 2/2 [00:08<00:00,  4.08s/it]


0.98417

In [40]:
# Five stratified folds; random forest + extra trees as base models.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

stacked_features_train, stacked_features_test = generate_meta_features_strat([
    RandomForestClassifier(n_estimators=300, random_state=42),
    ExtraTreesClassifier(n_estimators=300, n_jobs=-1, random_state=42),
], X_train, X_test, y_train, cv=cv)

# Meta-model: gradient boosting. compute_metric() fits the estimator, so
# fitting it once more beforehand only doubled the training time.
clf = GradientBoostingClassifier(random_state=42)
compute_metric(clf, X_train=stacked_features_train, y_train=y_train, X_test=stacked_features_test)

100%|██████████| 2/2 [00:07<00:00,  3.82s/it]


0.984925

In [41]:
# Three stratified folds; depth-limited random forest + extra trees as base models.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

stacked_features_train, stacked_features_test = generate_meta_features_strat([
    RandomForestClassifier(n_estimators=300, random_state=42, max_depth=24, criterion="gini"),
    ExtraTreesClassifier(n_estimators=300, n_jobs=-1, random_state=42),
], X_train, X_test, y_train, cv=cv)

# Meta-model: extra trees. compute_metric() handles the fit, so the duplicate
# clf.fit(...) call has been dropped.
clf = ExtraTreesClassifier(random_state=42, n_estimators=100, n_jobs=-1)
compute_metric(clf, X_train=stacked_features_train, y_train=y_train, X_test=stacked_features_test)

100%|██████████| 2/2 [00:05<00:00,  2.59s/it]


0.986498

In [42]:
def compute_predicts(classifiers, X_train, y_train, X_test):
    """Fit every classifier and collect its class probabilities for ``X_test``.

    The classifiers are fitted in place, one after another, on
    ``(X_train, y_train)``.

    Returns
    -------
    list with one ``predict_proba(X_test)`` result per classifier, in the
    order the classifiers were given.
    """
    def _fit_and_score(model):
        model.fit(X_train, y_train)
        return model.predict_proba(X_test)

    return [_fit_and_score(model) for model in classifiers]

# Soft-voting baseline: fit each model on the original features and add up the
# predicted class probabilities.
voting_classifiers = [
    RandomForestClassifier(n_estimators=300, criterion='gini', max_depth=24, n_jobs=-1, random_state=42),
    ExtraTreesClassifier(n_estimators=300, n_jobs=-1, random_state=42),
    # NOTE(review): with the default iteration limit this model emits a
    # convergence warning - consider raising max_iter or scaling the inputs.
    LogisticRegression(random_state=42),
]
predicts = compute_predicts(voting_classifiers, X_train, y_train, X_test)

# argmax gives a column index, not a label. compute_predicts() fits the
# estimators in place, so the index can be translated through classes_.
# All models are fitted on the same y_train, so the first one is used.
summed_proba = np.sum(predicts, axis=0)
y_test_pred = voting_classifiers[0].classes_[summed_proba.argmax(axis=1)]
print(np.round(f1_score(y_test, y_test_pred, average='macro'), 6))


0.976259


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
