# 6.2 Стекинг

In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.base import clone

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from matplotlib import pyplot as plt

%matplotlib inline

In [38]:
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz', sep=',', header=None)[:10000]

In [39]:
df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45,46,47,48,49,50,51,52,53,54
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2


In [40]:
df.shape

(10000, 55)

In [41]:
features = list(range(0, 54))
target = 54

df = df[(df[target] == 1) | (df[target] == 2)]

In [42]:
cover_train, cover_test = train_test_split(df, test_size=0.5)

cover_X_train, cover_y_train = cover_train[features], cover_train[target]
cover_X_test, cover_y_test = cover_test[features], cover_test[target]

In [43]:
scaler = StandardScaler()
cover_X_train = scaler.fit_transform(cover_X_train)
cover_X_test = scaler.transform(cover_X_test)
X_train=cover_X_train
X_test=cover_X_test

In [44]:
X_train=cover_X_train
X_test=cover_X_test
y_train=cover_y_train
y_test=cover_y_test

Stacking — еще один способ объединить несколько алгоритмов в один, который часто используется как в решении реальных задач из промышленной сферы, так и в конкурсах на платформах вроде Kaggle.  
Подход использует понятие _базовых классификаторов_, каждый из которых независимо обучается на некотором (возможно одном и том же) множестве признаков, а также _мета-классификатора_, использующего предсказания базовых классификаторов как признаки.

Для избежания переобучения будем разбивать обучающую выборку на фолды.  
Например, фолды при разбиении на три части:  
``==*``  
``=*=``  
``*==``  

Это требуется для того, чтобы получить новые признаки (ответы алгоритмов на первом уровне) на всей обучающей выборке, т.е. ответы алгоритма на тех объектах, которые не были использованы во время обучения. В примере выше мы будем использовать ответы алгоритма, полученные на объектах звездочках. _Важно_: на каждом фолде мы обучаем алгоритм заново.

In [45]:
def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    """
    Computes meta-features using the classifier.
    
    :arg clf: scikit-learn classifier
    :args X_train, y_train: training set
    :arg X_test: testing set
    :arg cv: cross-validation folding
    """
    X_meta_train = np.zeros_like(y_train, dtype=np.float32)
    for train_fold_index, predict_fold_index in cv.split(X_train):
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)[:, 1]
    
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    
    X_meta_test = meta_clf.predict_proba(X_test)[:, 1]
    
    return X_meta_train, X_meta_test

In [46]:
def generate_metafeatures(classifiers, X_train, X_test, y_train, cv):
    """
    Generates metafeatures using a list of classifiers.
    
    :arg classifiers: list of scikit-learn classifiers
    :args X_train, y_train: training set
    :arg X_test: testing set
    :arg cv: cross-validation folding
    """
    features = [
        compute_meta_feature(clf, X_train, X_test, y_train, cv)
        for clf in tqdm(classifiers)
    ]
    
    stacked_features_train = np.vstack([
        features_train for features_train, features_test in features
    ]).T

    stacked_features_test = np.vstack([
        features_test for features_train, features_test in features
    ]).T
    
    return stacked_features_train, stacked_features_test

In [47]:
np.random.seed(42)

In [48]:
clf = GradientBoostingClassifier(n_estimators=300)
clf.fit(cover_X_train, cover_y_train)

accuracy_score(clf.predict(cover_X_test), cover_y_test)

0.7970401691331924

In [49]:
cv = KFold(n_splits=10, shuffle=True)

stacked_features_train, stacked_features_test = generate_metafeatures([
    LogisticRegression(C=0.001, penalty='l1', solver='liblinear', max_iter=5000),
    LogisticRegression(C=0.001, penalty='l2', solver='liblinear', max_iter=5000),  
    RandomForestClassifier(n_estimators=300, n_jobs=-1),
    GradientBoostingClassifier(n_estimators=300)
], cover_X_train, cover_X_test, cover_y_train.values, cv)


  0%|          | 0/4 [00:00<?, ?it/s][A
 50%|█████     | 2/4 [00:00<00:00, 10.07it/s][A
 75%|███████▌  | 3/4 [00:16<00:05,  5.08s/it][A
100%|██████████| 4/4 [00:42<00:00, 10.63s/it][A


In [50]:
total_features_train = np.hstack([cover_X_train, stacked_features_train])
total_features_test = np.hstack([cover_X_test, stacked_features_test])

In [51]:
np.random.seed(42)
clf = LogisticRegression(penalty='none', solver='lbfgs')
clf.fit(stacked_features_train, cover_y_train)
accuracy_score(clf.predict(stacked_features_test), cover_y_test)

0.8047921071176886

In [52]:
import pandas as pd
import numpy as np

from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier,
                              RandomForestClassifier, ExtraTreesClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.datasets import load_digits

from tqdm import tqdm

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats.distributions import randint

In [53]:
def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    """    Эта функция подсчитывает признаки для мета-классификатора.     Они являются вероятностями классов при решении задачи многоклассовой классификации.    :arg clf: классификатор    :args X_train, y_train: обучающая выборка    :arg X_test: признаки тестовой выборки    :arg cv: класс, генерирующий фолды (KFold)    :returns X_meta_train, X_meta_test: новые признаки для обучающей и тестовой выборок    """
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(X_train), n_classes), dtype=np.float32)
    for train_fold_index, predict_fold_index in cv.split(X_train):
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]

        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)

        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)

    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)

    X_meta_test = meta_clf.predict_proba(X_test)

    return X_meta_train, X_meta_test

In [54]:
def compute_meta_feature_mean(clf, X_train, X_test, y_train, cv):
    """
    Эта функция подсчитывает признаки для мета-классификатора. 
    Они являются вероятностями классов при решении задачи многоклассовой классификации.

    :arg clf: классификатор
    :args X_train, y_train: обучающая выборка
    :arg X_test: признаки тестовой выборки
    :arg cv: класс, генерирующий фолды (KFold)

    :returns X_meta_train, X_meta_test: новые признаки для обучающей и тестовой выборок
    """
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(X_train), n_classes), dtype=np.float32)
    X_meta_tests_array = []
    splits = 0
    for train_fold_index, predict_fold_index in cv.split(X_train):
        n_classes = len(np.unique(y_test))
        X_meta_test = np.zeros((len(X_test), n_classes), dtype=np.float32)
        splits += 1
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)
        X_meta_tests_array.append(folded_clf.predict_proba(X_test))
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    X_meta_test = sum(X_meta_tests_array) / splits
    return X_meta_train, X_meta_test

In [55]:
def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(y_train), n_classes), dtype=np.float32)

    splits = cv.split(X_train)
    for train_fold_index, predict_fold_index in splits:
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)
    
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    
    X_meta_test = meta_clf.predict_proba(X_test)
    
    return X_meta_train, X_meta_test

def generate_meta_features(classifiers, X_train, X_test, y_train, cv):
   
    features = [
        compute_meta_feature(clf, X_train, X_test, y_train, cv)
        for clf in tqdm(classifiers)
    ]
    
    stacked_features_train = np.hstack([
        features_train for features_train, features_test in features
    ])

    stacked_features_test = np.hstack([
        features_test for features_train, features_test in features
    ])
    
    return stacked_features_train, stacked_features_test


cv = KFold(n_splits=10, shuffle=True, random_state=42)



In [56]:
stacked_features_train, stacked_features_test = generate_metafeatures([
    LogisticRegression(C=0.001, penalty='l1', solver='saga', max_iter=2000),
    LogisticRegression(C=0.001, penalty='l2', solver='saga', max_iter=2000),  
    RandomForestClassifier(n_estimators=300),
    GradientBoostingClassifier(n_estimators=200)
], cover_X_train, cover_X_test, cover_y_train.values, cv)

np.random.seed(42)


def compute_metric(clf, X_train=X_train, y_train=y_train, X_test=X_test):
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    return np.round(f1_score(y_test, y_test_pred, average='macro'), 6)

compute_metric(LogisticRegression(multi_class='auto',solver='lbfgs'),stacked_features_train,stacked_features_test,)


  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:00<00:00,  9.10it/s][A
 50%|█████     | 2/4 [00:00<00:00,  5.20it/s][A
 75%|███████▌  | 3/4 [00:19<00:05,  5.94s/it][A
100%|██████████| 4/4 [00:32<00:00,  8.19s/it][A


ValueError: y should be a 1d array, got an array of shape (2, 5676) instead.

In [None]:
stacked_features_train.shape

In [None]:
stacked_features_test.shape

In [62]:
def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(y_train), n_classes), dtype=np.float32)

    splits = cv.split(X_train)
    for train_fold_index, predict_fold_index in splits:
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)
    
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    
    X_meta_test = meta_clf.predict_proba(X_test)
    
    return X_meta_train, X_meta_test

def generate_meta_features(classifiers, X_train, X_test, y_train, cv):
   
    features = [
        compute_meta_feature(clf, X_train, X_test, y_train, cv)
        for clf in tqdm(classifiers)
    ]
    
    stacked_features_train = np.hstack([
        features_train for features_train, features_test in features
    ])

    stacked_features_test = np.hstack([
        features_test for features_train, features_test in features
    ])
    
    return stacked_features_train, stacked_features_test

cv = KFold(n_splits=10, shuffle=True, random_state=42)

def compute_metric(clf, X_train=X_train, y_train=y_train, X_test=X_test):
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    return np.round(f1_score(y_test, y_test_pred, average='macro'), 6)

dataset = load_digits()
X, y = dataset['data'], dataset['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

stacked_features_train, stacked_features_test = generate_metafeatures([
    LogisticRegression(C=0.001, penalty='l1', solver='saga', max_iter=2000),
    LogisticRegression(C=0.001, penalty='l2', solver='saga', max_iter=2000),  
    RandomForestClassifier(n_estimators=300),
    GradientBoostingClassifier(n_estimators=200)
], X_train, X_test, y_train, cv)

#total_features_train = np.hstack([X_train, stacked_features_train])
#total_features_test = np.hstack([X_test, stacked_features_test])

np.random.seed(42)
clf = LogisticRegression(penalty='none', multi_class='auto', solver='lbfgs')
clf.fit(stacked_features_train, y_train)
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)

100%|██████████| 4/4 [04:00<00:00, 60.18s/it]


ValueError: Found input variables with inconsistent numbers of samples: [10, 1437]

In [64]:
#6.6.2

dataset = load_digits()
X, y = dataset['data'], dataset['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(y_train), n_classes), dtype=np.float32)
    splits = cv.split(X_train, y_train)
    for train_fold_index, predict_fold_index in splits:
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    X_meta_test = meta_clf.predict_proba(X_test)
    return X_meta_train, X_meta_test
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

def compute_metric(clf, X_train=X_train, y_train=y_train, X_test=X_test):
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    return np.round(f1_score(y_test, y_test_pred, average='macro'), 6)

stacked_features_train, stacked_features_test = generate_meta_features([
    LogisticRegression(C=0.001, penalty='l1', solver='saga', max_iter=2000),
    LogisticRegression(C=0.001, penalty='l2', solver='saga', max_iter=2000),  
    RandomForestClassifier(n_estimators=300),
    GradientBoostingClassifier(n_estimators=200)
], X_train, X_test, y_train, cv)



total_features_train = np.hstack([X_train, stacked_features_train])
total_features_test = np.hstack([X_test, stacked_features_test])

np.random.seed(42)
clf = LogisticRegression(penalty='none', multi_class='auto', solver='lbfgs')
clf.fit(stacked_features_train, y_train)
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)

100%|██████████| 4/4 [03:51<00:00, 57.91s/it]


0.986494

In [65]:
#6.6.3
dataset = load_digits()
X, y = dataset['data'], dataset['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(y_train), n_classes), dtype=np.float32)
    splits = cv.split(X_train, y_train)
    for train_fold_index, predict_fold_index in splits:
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    X_meta_test = meta_clf.predict_proba(X_test)
    return X_meta_train, X_meta_test
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

def compute_metric(clf, X_train=X_train, y_train=y_train, X_test=X_test):
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    return np.round(f1_score(y_test, y_test_pred, average='macro'), 6)

stacked_features_train, stacked_features_test = generate_meta_features([  
    RandomForestClassifier(n_estimators=300),
    ExtraTreesClassifier(n_estimators=200)
], X_train, X_test, y_train, cv)



total_features_train = np.hstack([X_train, stacked_features_train])
total_features_test = np.hstack([X_test, stacked_features_test])

np.random.seed(42)
clf = LogisticRegression(penalty='none', multi_class='auto', solver='lbfgs')
clf.fit(stacked_features_train, y_train)
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)

100%|██████████| 2/2 [00:21<00:00, 10.83s/it]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.987357

In [66]:
#6.6.4
dataset = load_digits()
X, y = dataset['data'], dataset['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(y_train), n_classes), dtype=np.float32)
    splits = cv.split(X_train, y_train)
    for train_fold_index, predict_fold_index in splits:
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    X_meta_test = meta_clf.predict_proba(X_test)
    return X_meta_train, X_meta_test
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

def compute_metric(clf, X_train=X_train, y_train=y_train, X_test=X_test):
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    return np.round(f1_score(y_test, y_test_pred, average='macro'), 6)

stacked_features_train, stacked_features_test = generate_meta_features([  
    KNeighborsClassifier(),
    ExtraTreesClassifier(n_estimators=200)
], X_train, X_test, y_train, cv)



total_features_train = np.hstack([X_train, stacked_features_train])
total_features_test = np.hstack([X_test, stacked_features_test])

np.random.seed(42)
clf = LogisticRegression(penalty='none', multi_class='auto', solver='lbfgs')
clf.fit(stacked_features_train, y_train)
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)

100%|██████████| 2/2 [00:08<00:00,  4.06s/it]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.990099

In [70]:
# 6.6.5
dataset = load_digits()
X, y = dataset['data'], dataset['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2)


def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(y_train), n_classes), dtype=np.float32)
    splits = cv.split(X_train, y_train)
    for train_fold_index, predict_fold_index in splits:
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(
            X_fold_predict)
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    X_meta_test = meta_clf.predict_proba(X_test)
    return X_meta_train, X_meta_test


cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)


def compute_metric(clf, X_train=X_train, y_train=y_train, X_test=X_test):
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    return np.round(f1_score(y_test, y_test_pred, average='macro'), 6)


stacked_features_train, stacked_features_test = generate_meta_features([
    LogisticRegression(C=0.001, penalty='l1', solver='saga',
                       multi_class='ovr',max_iter=2000),
    KNeighborsClassifier(),
    ExtraTreesClassifier(n_estimators=300),
    AdaBoostClassifier()
], X_train, X_test, y_train, cv)


total_features_train = np.hstack([X_train, stacked_features_train])
total_features_test = np.hstack([X_test, stacked_features_test])

np.random.seed(42)
clf = LogisticRegression(penalty='none', multi_class='auto', solver='lbfgs')
clf.fit(stacked_features_train, y_train)
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)




  0%|          | 0/4 [00:00<?, ?it/s][A[A[A


 25%|██▌       | 1/4 [01:12<03:38, 72.74s/it][A[A[A


 50%|█████     | 2/4 [01:13<01:42, 51.08s/it][A[A[A


 75%|███████▌  | 3/4 [01:24<00:39, 39.04s/it][A[A[A


100%|██████████| 4/4 [01:27<00:00, 21.79s/it][A[A[A


0.989507

In [71]:
# 6.6.6
dataset = load_digits()
X, y = dataset['data'], dataset['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2)


def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(y_train), n_classes), dtype=np.float32)
    splits = cv.split(X_train, y_train)
    for train_fold_index, predict_fold_index in splits:
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(
            X_fold_predict)
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    X_meta_test = meta_clf.predict_proba(X_test)
    return X_meta_train, X_meta_test


cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)


def compute_metric(clf, X_train=X_train, y_train=y_train, X_test=X_test):
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    return np.round(f1_score(y_test, y_test_pred, average='macro'), 6)


stacked_features_train, stacked_features_test = generate_meta_features([
    RandomForestClassifier(n_estimators=300),
    ExtraTreesClassifier(n_estimators=300)
], X_train, X_test, y_train, cv)


total_features_train = np.hstack([X_train, stacked_features_train])
total_features_test = np.hstack([X_test, stacked_features_test])

np.random.seed(42)
clf = LogisticRegression(penalty='none', multi_class='auto', solver='lbfgs')
clf.fit(stacked_features_train, y_train)
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)




  0%|          | 0/2 [00:00<?, ?it/s][A[A[A


 50%|█████     | 1/2 [00:13<00:13, 13.20s/it][A[A[A


100%|██████████| 2/2 [00:23<00:00, 12.00s/it][A[A[A


0.981637

In [72]:
# 6.6.7
dataset = load_digits()
X, y = dataset['data'], dataset['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2)


def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(y_train), n_classes), dtype=np.float32)
    splits = cv.split(X_train, y_train)
    for train_fold_index, predict_fold_index in splits:
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(
            X_fold_predict)
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    X_meta_test = meta_clf.predict_proba(X_test)
    return X_meta_train, X_meta_test


cv = StratifiedKFold(n_splits=20, shuffle=True, random_state=42)


def compute_metric(clf, X_train=X_train, y_train=y_train, X_test=X_test):
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    return np.round(f1_score(y_test, y_test_pred, average='macro'), 6)


stacked_features_train, stacked_features_test = generate_meta_features([
    RandomForestClassifier(n_estimators=300),
    ExtraTreesClassifier(n_estimators=300)
], X_train, X_test, y_train, cv)


total_features_train = np.hstack([X_train, stacked_features_train])
total_features_test = np.hstack([X_test, stacked_features_test])

np.random.seed(42)
clf = LogisticRegression(penalty='none', multi_class='auto', solver='lbfgs')
clf.fit(stacked_features_train, y_train)
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)




  0%|          | 0/4 [04:51<?, ?it/s][A[A[A
  0%|          | 0/4 [04:34<?, ?it/s]



 50%|█████     | 1/2 [00:26<00:26, 26.49s/it][A[A[A


100%|██████████| 2/2 [00:47<00:00, 23.93s/it][A[A[A


0.983371

In [73]:
# 6.6.8
dataset = load_digits()
X, y = dataset['data'], dataset['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2)


def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(y_train), n_classes), dtype=np.float32)
    splits = cv.split(X_train, y_train)
    for train_fold_index, predict_fold_index in splits:
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(
            X_fold_predict)
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    X_meta_test = meta_clf.predict_proba(X_test)
    return X_meta_train, X_meta_test


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


def compute_metric(clf, X_train=X_train, y_train=y_train, X_test=X_test):
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    return np.round(f1_score(y_test, y_test_pred, average='macro'), 6)


stacked_features_train, stacked_features_test = generate_meta_features([
    RandomForestClassifier(n_estimators=300),
    ExtraTreesClassifier(n_estimators=300)
], X_train, X_test, y_train, cv)


total_features_train = np.hstack([X_train, stacked_features_train])
total_features_test = np.hstack([X_test, stacked_features_test])

np.random.seed(42)
clf = RandomForestClassifier()
clf.fit(stacked_features_train, y_train)
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)

100%|██████████| 2/2 [00:13<00:00,  6.61s/it]


0.982221

In [74]:
# 6.6.9
dataset = load_digits()
X, y = dataset['data'], dataset['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2)


def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(y_train), n_classes), dtype=np.float32)
    splits = cv.split(X_train, y_train)
    for train_fold_index, predict_fold_index in splits:
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(
            X_fold_predict)
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    X_meta_test = meta_clf.predict_proba(X_test)
    return X_meta_train, X_meta_test


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


def compute_metric(clf, X_train=X_train, y_train=y_train, X_test=X_test):
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    return np.round(f1_score(y_test, y_test_pred, average='macro'), 6)


stacked_features_train, stacked_features_test = generate_meta_features([
    RandomForestClassifier(n_estimators=300),
    ExtraTreesClassifier(n_estimators=300)
], X_train, X_test, y_train, cv)


total_features_train = np.hstack([X_train, stacked_features_train])
total_features_test = np.hstack([X_test, stacked_features_test])

np.random.seed(42)
clf = KNeighborsClassifier()
clf.fit(stacked_features_train, y_train)
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)

100%|██████████| 2/2 [00:12<00:00,  6.44s/it]


0.98417

In [75]:
# 6.6.10
dataset = load_digits()
X, y = dataset['data'], dataset['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2)


def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(y_train), n_classes), dtype=np.float32)
    splits = cv.split(X_train, y_train)
    for train_fold_index, predict_fold_index in splits:
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(
            X_fold_predict)
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    X_meta_test = meta_clf.predict_proba(X_test)
    return X_meta_train, X_meta_test


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


def compute_metric(clf, X_train=X_train, y_train=y_train, X_test=X_test):
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    return np.round(f1_score(y_test, y_test_pred, average='macro'), 6)


stacked_features_train, stacked_features_test = generate_meta_features([
    RandomForestClassifier(n_estimators=300),
    ExtraTreesClassifier(n_estimators=300)
], X_train, X_test, y_train, cv)


total_features_train = np.hstack([X_train, stacked_features_train])
total_features_test = np.hstack([X_test, stacked_features_test])

np.random.seed(42)
clf = GradientBoostingClassifier()
clf.fit(stacked_features_train, y_train)
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)

100%|██████████| 2/2 [00:12<00:00,  6.49s/it]


0.986659

In [76]:
# 6.6.11
dataset = load_digits()
X, y = dataset['data'], dataset['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2)


def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(y_train), n_classes), dtype=np.float32)
    splits = cv.split(X_train, y_train)
    for train_fold_index, predict_fold_index in splits:
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(
            X_fold_predict)
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    X_meta_test = meta_clf.predict_proba(X_test)
    return X_meta_train, X_meta_test


cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)


def compute_metric(clf, X_train=X_train, y_train=y_train, X_test=X_test):
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    return np.round(f1_score(y_test, y_test_pred, average='macro'), 6)


stacked_features_train, stacked_features_test = generate_meta_features([
    RandomForestClassifier(n_estimators=300,criterion='gini',max_depth=24),
    ExtraTreesClassifier(n_estimators=300)
], X_train, X_test, y_train, cv)


total_features_train = np.hstack([X_train, stacked_features_train])
total_features_test = np.hstack([X_test, stacked_features_test])

np.random.seed(42)
clf = ExtraTreesClassifier(n_estimators=100)
clf.fit(stacked_features_train, y_train)
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)

100%|██████████| 2/2 [00:07<00:00,  3.87s/it]


0.985118

In [77]:
def compute_predicts(classifiers, X_train, y_train, X_test):
    predicts = []
    for clf in tqdm(classifiers):
        clf.fit(X_train, y_train)
        predicts.append(clf.predict(X_test))
    return predicts
predicts = compute_predicts([ 
    RandomForestClassifier(n_estimators=300, criterion='gini', max_depth=24, n_jobs=-1, random_state=42),
    ExtraTreesClassifier(n_estimators=300, n_jobs=-1, random_state=42), 
    LogisticRegression()
], X_train, y_train, X_test)
y_test_pred = np.mean(predicts, axis = 0, dtype='int')
print(np.round(f1_score(y_test, y_test_pred, average='macro'), 6))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
100%|██████████| 3/3 [00:02<00:00,  1.25it/s]

0.965569





In [None]:
# 6.6.12
dataset = load_digits()
X, y = dataset['data'], dataset['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2)


def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(y_train), n_classes), dtype=np.float32)
    splits = cv.split(X_train, y_train)
    for train_fold_index, predict_fold_index in splits:
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(
            X_fold_predict)
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    X_meta_test = meta_clf.predict_proba(X_test)
    return X_meta_train, X_meta_test


cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)


def compute_metric(clf, X_train=X_train, y_train=y_train, X_test=X_test):
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    return np.round(f1_score(y_test, y_test_pred, average='macro'), 6)


stacked_features_train, stacked_features_test = generate_meta_features([
    RandomForestClassifier(n_estimators=300,criterion='gini',max_depth=24),
    ExtraTreesClassifier(n_estimators=300)
], X_train, X_test, y_train, cv)


total_features_train = np.hstack([X_train, stacked_features_train])
total_features_test = np.hstack([X_test, stacked_features_test])

np.random.seed(42)
clf = ExtraTreesClassifier(n_estimators=100)
clf.fit(stacked_features_train, y_train)
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)