In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import (train_test_split,
                                     cross_val_score,
                                     KFold,
                                     StratifiedKFold,
                                     RandomizedSearchCV)
from sklearn.ensemble import (BaggingClassifier,
                              RandomForestClassifier,
                              AdaBoostClassifier,
                              GradientBoostingClassifier,
                              ExtraTreesClassifier,
                              StackingClassifier)
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from tqdm import tqdm

from sklearn.preprocessing import StandardScaler
from scipy.stats.distributions import randint
from sklearn.datasets import load_digits
import warnings
warnings.simplefilter('ignore')

In [2]:
dataset = load_digits()
#print(dataset['DESCR'])

In [3]:
def compute_meta_feature_mean(clf, X_train, X_test, y_train, cv):
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(X_train), n_classes), dtype=np.float32)
    X_meta_test = np.zeros((len(X_test), n_classes), dtype=np.float32)
    count = 0
    for train_fold_index, predict_fold_index in cv.split(X_train):
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)     
        X_meta_test += folded_clf.predict_proba(X_test)
        count += 1
    X_meta_test = X_meta_test/count
    return X_meta_train, X_meta_test

In [4]:
def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(y_train), n_classes), dtype=np.float32)

    splits = cv.split(X_train)
    for train_fold_index, predict_fold_index in splits:
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)
    
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    
    X_meta_test = meta_clf.predict_proba(X_test)
    
    return X_meta_train, X_meta_test

In [5]:
def compute_meta_feature_strat(clf, X_train, X_test, y_train, cv):
    
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(y_train), n_classes), dtype=np.float32)

    splits = cv.split(X_train, y_train)
    for train_fold_index, predict_fold_index in splits:
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)
    
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    
    X_meta_test = meta_clf.predict_proba(X_test)
    
    return X_meta_train, X_meta_test

In [6]:
def generate_meta_feature(classifiers, X_train, X_test, y_train, cv):
   
    features = [
        compute_meta_feature(clf, X_train, X_test, y_train, cv)
        for clf in tqdm(classifiers)
    ]
    
    stacked_features_train = np.hstack([
        features_train for features_train, features_test in features
    ])

    stacked_features_test = np.hstack([
        features_test for features_train, features_test in features
    ])
    
    return stacked_features_train, stacked_features_test

In [7]:
def generate_meta_feature_strat(classifiers, X_train, X_test, y_train, cv):
   
    features = [
        compute_meta_feature_strat(clf, X_train, X_test, y_train, cv)
        for clf in tqdm(classifiers)
    ]
    
    stacked_features_train = np.hstack([
        features_train for features_train, features_test in features
    ])

    stacked_features_test = np.hstack([
        features_test for features_train, features_test in features
    ])
    
    return stacked_features_train, stacked_features_test

In [8]:
def compute_metric(clf, X_train, y_train, X_test):
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    return np.round(f1_score(y_test, y_test_pred, average='macro'), 6)

In [9]:
X, y = dataset['data'], dataset['target']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [12]:
cv = KFold(n_splits=10, shuffle=True, random_state=42)

In [13]:
stacked_features_train, stacked_features_test = generate_meta_feature([
    LogisticRegression(C=0.001, penalty='l1', solver='saga',
                       multi_class='ovr', max_iter=2000, random_state=42),
    LogisticRegression(C=0.001, penalty='l2', solver='saga',
                       multi_class='multinomial', max_iter=2000, random_state=42),
    RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42),
    GradientBoostingClassifier(n_estimators=200, random_state=42)], X_train, X_test, y_train, cv=cv)

100%|██████████| 4/4 [01:58<00:00, 29.54s/it]


In [14]:
clf_meta = LogisticRegression(
    penalty='none', multi_class='auto', solver='lbfgs', random_state=42)

In [15]:
compute_metric(clf_meta, X_train=stacked_features_train, y_train=y_train, X_test=stacked_features_test)

0.984254

In [16]:
stacked_features_train_1, stacked_features_test_1 = generate_meta_feature([
    RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42),
    ExtraTreesClassifier(n_estimators=200, n_jobs=-1, random_state=42)], X_train, X_test, y_train, cv=cv)

100%|██████████| 2/2 [00:08<00:00,  4.41s/it]


In [17]:
compute_metric(clf_meta, X_train=stacked_features_train_1, y_train=y_train, X_test=stacked_features_test_1)

0.977606

In [18]:
stacked_features_train_2, stacked_features_test_2 = generate_meta_feature([
    KNeighborsClassifier(),
    ExtraTreesClassifier(n_estimators=300, n_jobs=-1, random_state=42)], X_train, X_test, y_train, cv=cv)

100%|██████████| 2/2 [00:04<00:00,  2.43s/it]


In [19]:
compute_metric(clf_meta, X_train=stacked_features_train_2, y_train=y_train, X_test=stacked_features_test_2)

0.978789

Задание 6.6.4 - Ответ 0.978789 не совпадает с ответом на сайте 0.989904 или 0.988991

In [20]:
stacked_features_train_3, stacked_features_test_3 = generate_meta_feature([
    LogisticRegression(C=0.001, penalty='l1', solver='saga',
                       multi_class='ovr', max_iter=2000, random_state=42),
    KNeighborsClassifier(),
    ExtraTreesClassifier(n_estimators=300, n_jobs=-1, random_state=42),
    AdaBoostClassifier(random_state=42)], X_train, X_test, y_train, cv=cv)

100%|██████████| 4/4 [00:06<00:00,  1.69s/it]


In [21]:
compute_metric(clf_meta, X_train=stacked_features_train_3,
               y_train=y_train, X_test=stacked_features_test_3)

0.980527

Задание 6.6.5 - Ответ 0.980527 не совпадает с ответом на сайте 0.987404 или 0.983441

In [22]:
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [23]:
stacked_features_train_4, stacked_features_test_4 = generate_meta_feature_strat([
    RandomForestClassifier(n_estimators=300, random_state=42),
    ExtraTreesClassifier(n_estimators=300, n_jobs=-1, random_state=42)], X_train, X_test, y_train, cv=cv)

100%|██████████| 2/2 [00:13<00:00,  6.69s/it]


In [24]:
compute_metric(clf_meta, X_train=stacked_features_train_4, y_train=y_train, X_test=stacked_features_test_4)

0.98179

Задание 6.6.6 - ответ 0.98179 не сопадает с ответом на сайте 0.979448 или 0.983918 или 0.975562

In [25]:
cv = StratifiedKFold(n_splits=20, shuffle=True, random_state=42)

In [26]:
stacked_features_train_5, stacked_features_test_5 = generate_meta_feature_strat([
    RandomForestClassifier(n_estimators=300, random_state=42),
    ExtraTreesClassifier(n_estimators=300, n_jobs=-1, random_state=42)], X_train, X_test, y_train, cv=cv)

100%|██████████| 2/2 [00:27<00:00, 13.58s/it]


In [27]:
compute_metric(clf_meta, X_train=stacked_features_train_5, y_train=y_train, X_test=stacked_features_test_5)

0.975387

Задание 6.6.7 - ответ 0.975387 не совпадает сответом 0.985935 или 0.984228

In [28]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [29]:
stacked_features_train_6, stacked_features_test_6 = generate_meta_feature_strat([
    RandomForestClassifier(n_estimators=300,
                           random_state=42),
    ExtraTreesClassifier(n_estimators=300,
                         n_jobs=-1,
                         random_state=42)], X_train, X_test, y_train, cv=cv)

100%|██████████| 2/2 [00:06<00:00,  3.48s/it]


In [30]:
compute_metric(clf_meta, X_train=stacked_features_train_6,
               y_train=y_train, X_test=stacked_features_test_6)

0.974417

Задание 6.6.8 - не совпадает ответ 0.974417 с ответом на сайте 0.981661 или 0.978222

In [31]:
clf_meta_2 = KNeighborsClassifier()

In [32]:
compute_metric(clf_meta_2, X_train=stacked_features_train_6,
               y_train=y_train, X_test=stacked_features_test_6)

0.98417

In [33]:
clf_meta_3 = GradientBoostingClassifier(random_state=42)

In [34]:
compute_metric(clf_meta_3, X_train=stacked_features_train_6,
               y_train=y_train, X_test=stacked_features_test_6)

0.984925

In [35]:
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [36]:
stacked_features_train_7, stacked_features_test_7 = generate_meta_feature_strat([
    RandomForestClassifier(n_estimators=300,
                           criterion='gini',
                           max_depth=24,
                           random_state=42),
    ExtraTreesClassifier(n_estimators=300,
                         n_jobs=-1,
                         random_state=42)], X_train, X_test, y_train, cv=cv)

100%|██████████| 2/2 [00:04<00:00,  2.27s/it]


In [37]:
clf_meta_4 = RandomForestClassifier(n_estimators=100, random_state=42)

In [38]:
compute_metric(clf_meta_4, X_train=stacked_features_train_7,
               y_train=y_train, X_test=stacked_features_test_7)

0.98649

In [39]:
def compute_predicts(classifiers, X_train, y_train, X_test):
    predicts = []
    for clf in classifiers:
        clf.fit(X_train, y_train)
        predicts.append(clf.predict_proba(X_test))
    return predicts

In [40]:
predicts = compute_predicts([ 
    RandomForestClassifier(n_estimators=300, criterion='gini', max_depth=24, n_jobs=-1, random_state=42),
    ExtraTreesClassifier(n_estimators=300, n_jobs=-1, random_state=42), 
    LogisticRegression()
], X_train, y_train, X_test)

y_test_pred = np.sum(predicts, axis=0).argmax(axis=1)
print(np.round(f1_score(y_test, y_test_pred, average='macro'), 6))

0.973967


In [42]:
import sklearn
sklearn.__version__

'0.24.2'