# Stacking + Boosting

In [1]:
# https://www.kaggle.com/c/titanic

In [2]:
import sklearn
import pandas as pd

In [3]:
# Загружаем данные из файлов
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Предобработка данных

In [5]:
# Заполняем пропуски в данных медианными 
# значениями факторов на обучающей выборке
train_median = train.median()
train_imp = train.fillna(train_median)
test_imp = test.fillna(train_median)

In [6]:
# Бинаризуем категориальные признаки
CATEGORY_COL = ['Sex', 'Pclass', 'Embarked']
train_dummies = pd.get_dummies(train_imp, columns=CATEGORY_COL, drop_first=True)
test_dummies = pd.get_dummies(test_imp, columns=CATEGORY_COL, drop_first=True)

In [7]:
train_dummies.head()

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_male,Pclass_2,Pclass_3,Embarked_Q,Embarked_S
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,1,0,1,0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,0,0,0,0,0
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,0,0,1,0,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,0,0,0,0,1
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,1,0,1,0,1


In [8]:
# Удаляем лишние столбцы
DROP_COL = ['PassengerId', 'Name', 'Ticket', 'Cabin']
TARGET_COL = 'Survived'
X_train = train_dummies.drop(DROP_COL + [TARGET_COL], axis=1)
y_train = train_dummies[TARGET_COL]
X_test = test_dummies.drop(DROP_COL, axis=1)

## Тюнинг моделей. Зададим сетку параметров

In [11]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

params_grid = { # параметры для RandomForest, которые будем тюнить
    'n_estimators': [1, 2, 3, 10, 35],
    'min_samples_split': [2, 5, 10]
}

## Тюнинг моделей. Способ 1
"В лоб"

In [12]:
from sklearn.metrics import roc_auc_score

kf = KFold(n_splits=4, shuffle=True) # Всегда делайте shuffle если обучаете не на последовательных данных!

# Переберём все возможные комбинации параметров
params = [{}]
for parameter_name in params_grid:
    parameter_values = params_grid[parameter_name]
    new_params = []
    for value in parameter_values:    
        for param in params:
            updated_param = param.copy()
            updated_param[parameter_name] = value
            new_params.append(updated_param)
    params = new_params
    
# Выберем из всех вариаций параметров наилучшую
best_params = {}
best_auc = 0
for param in params:
    print(('Training RandomForest with params: ', param))
    clf.set_params(**param)
    
    fold_aucs = []
    for train_idx, test_idx in kf.split(X_train):
        X_train_fold, X_test_fold = X_train.iloc[train_idx], X_train.iloc[test_idx]
        y_train_fold, y_test_fold = y_train.iloc[train_idx], y_train.iloc[test_idx]
        clf.fit(X_train_fold, y_train_fold)
        preds = clf.predict_proba(X_test_fold)
        auc = roc_auc_score(y_test_fold, preds[:, 1])
        fold_aucs.append(auc)
    auc = np.mean(fold_aucs)
    print(('AUC: ', auc))
    if auc > best_auc:
        best_params = param
        best_auc = auc

print('Best params:')
best_params

('Training RandomForest with params: ', {'n_estimators': 1, 'min_samples_split': 2})
('AUC: ', 0.75669994153450038)
('Training RandomForest with params: ', {'n_estimators': 2, 'min_samples_split': 2})
('AUC: ', 0.80525532343921502)
('Training RandomForest with params: ', {'n_estimators': 3, 'min_samples_split': 2})
('AUC: ', 0.81776892721453509)
('Training RandomForest with params: ', {'n_estimators': 10, 'min_samples_split': 2})
('AUC: ', 0.84056033625920712)
('Training RandomForest with params: ', {'n_estimators': 35, 'min_samples_split': 2})
('AUC: ', 0.85774410712348037)
('Training RandomForest with params: ', {'n_estimators': 1, 'min_samples_split': 5})
('AUC: ', 0.76585870888765983)
('Training RandomForest with params: ', {'n_estimators': 2, 'min_samples_split': 5})
('AUC: ', 0.8127186797178243)
('Training RandomForest with params: ', {'n_estimators': 3, 'min_samples_split': 5})
('AUC: ', 0.8256927548898958)
('Training RandomForest with params: ', {'n_estimators': 10, 'min_sample

{'min_samples_split': 10, 'n_estimators': 35}

## Тюнинг моделей. Способ 2
Используем GridSearchCV cо своим KFold

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, make_scorer

# функция, скор которой будет выводиться в гридсёче
roc_scorer = make_scorer(lambda y_true, y_pred: roc_auc_score(y_true, y_pred[:, 1]), needs_proba=True)
kf = KFold(n_splits=4, shuffle=True)
gs = GridSearchCV(clf, param_grid=params_grid, verbose=5, scoring=roc_scorer, cv=kf)
# запуск гридсёча
gs.fit(X_train, y_train)

Fitting 4 folds for each of 15 candidates, totalling 60 fits
[CV] min_samples_split=2, n_estimators=1 .............................
[CV]  min_samples_split=2, n_estimators=1, score=0.720588, total=   0.0s
[CV] min_samples_split=2, n_estimators=1 .............................
[CV]  min_samples_split=2, n_estimators=1, score=0.769523, total=   0.0s
[CV] min_samples_split=2, n_estimators=1 .............................
[CV]  min_samples_split=2, n_estimators=1, score=0.724734, total=   0.0s
[CV] min_samples_split=2, n_estimators=1 .............................
[CV]  min_samples_split=2, n_estimators=1, score=0.724094, total=   0.0s
[CV] min_samples_split=2, n_estimators=2 .............................
[CV]  min_samples_split=2, n_estimators=2, score=0.783342, total=   0.0s
[CV] min_samples_split=2, n_estimators=2 .............................
[CV]  min_samples_split=2, n_estimators=2, score=0.842796, total=   0.0s
[CV] min_samples_split=2, n_estimators=2 .............................
[CV]

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s



[CV] min_samples_split=2, n_estimators=10 ............................
[CV]  min_samples_split=2, n_estimators=10, score=0.836654, total=   0.0s
[CV] min_samples_split=2, n_estimators=35 ............................
[CV]  min_samples_split=2, n_estimators=35, score=0.832953, total=   0.0s
[CV] min_samples_split=2, n_estimators=35 ............................
[CV]  min_samples_split=2, n_estimators=35, score=0.903453, total=   0.0s
[CV] min_samples_split=2, n_estimators=35 ............................
[CV]  min_samples_split=2, n_estimators=35, score=0.835474, total=   0.0s
[CV] min_samples_split=2, n_estimators=35 ............................
[CV]  min_samples_split=2, n_estimators=35, score=0.832122, total=   0.0s
[CV] min_samples_split=5, n_estimators=1 .............................
[CV]  min_samples_split=5, n_estimators=1, score=0.813007, total=   0.0s
[CV] min_samples_split=5, n_estimators=1 .............................
[CV]  min_samples_split=5, n_estimators=1, score=0.806820, 

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:    1.4s finished


GridSearchCV(cv=KFold(n_splits=4, random_state=None, shuffle=True),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            n_estimators=35, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [1, 2, 3, 10, 35], 'min_samples_split': [2, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(<lambda>, needs_proba=True), verbose=5)

In [14]:
gs.best_score_, gs.best_params_

(0.8653304221497079, {'min_samples_split': 10, 'n_estimators': 10})

In [15]:
gs.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

## Тюнинг моделей. Способ 3
Ипользуем GridSearchCV со встроенным KFold и встроенной метрикой качества

In [16]:
gs = GridSearchCV(clf, param_grid=params_grid, verbose=5, cv=4)
gs.fit(X_train, y_train)

Fitting 4 folds for each of 15 candidates, totalling 60 fits
[CV] min_samples_split=2, n_estimators=1 .............................
[CV]  min_samples_split=2, n_estimators=1, score=0.741071, total=   0.0s
[CV] min_samples_split=2, n_estimators=1 .............................
[CV]  min_samples_split=2, n_estimators=1, score=0.811659, total=   0.0s
[CV] min_samples_split=2, n_estimators=1 .............................
[CV]  min_samples_split=2, n_estimators=1, score=0.815315, total=   0.0s
[CV] min_samples_split=2, n_estimators=1 .............................
[CV]  min_samples_split=2, n_estimators=1, score=0.743243, total=   0.0s
[CV] min_samples_split=2, n_estimators=2 .............................
[CV]  min_samples_split=2, n_estimators=2, score=0.745536, total=   0.0s
[CV] min_samples_split=2, n_estimators=2 .............................
[CV]  min_samples_split=2, n_estimators=2, score=0.820628, total=   0.0s
[CV] min_samples_split=2, n_estimators=2 .............................
[CV]

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s


[CV]  min_samples_split=2, n_estimators=10, score=0.810811, total=   0.0s
[CV] min_samples_split=2, n_estimators=35 ............................
[CV]  min_samples_split=2, n_estimators=35, score=0.754464, total=   0.0s
[CV] min_samples_split=2, n_estimators=35 ............................
[CV]  min_samples_split=2, n_estimators=35, score=0.829596, total=   0.0s
[CV] min_samples_split=2, n_estimators=35 ............................
[CV]  min_samples_split=2, n_estimators=35, score=0.815315, total=   0.0s
[CV] min_samples_split=2, n_estimators=35 ............................
[CV]  min_samples_split=2, n_estimators=35, score=0.819820, total=   0.0s
[CV] min_samples_split=5, n_estimators=1 .............................
[CV]  min_samples_split=5, n_estimators=1, score=0.758929, total=   0.0s
[CV] min_samples_split=5, n_estimators=1 .............................
[CV]  min_samples_split=5, n_estimators=1, score=0.789238, total=   0.0s
[CV] min_samples_split=5, n_estimators=1 .................

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:    1.3s finished


GridSearchCV(cv=4, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            n_estimators=35, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [1, 2, 3, 10, 35], 'min_samples_split': [2, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=5)

## Тюнинг моделей. Способ 4
Используя OOB-score (работает только для НЕбустинговых ансамблей)

In [17]:
from sklearn.metrics import roc_auc_score

kf = KFold(n_splits=4, shuffle=True) # Всегда делайте shuffle если обучаете не на последовательных данных!

best_score = 0
best_params = {}

# Опять создадим всевозможные комбинации параметров модели
params = [{}]
for parameter_name in params_grid:
    parameter_values = params_grid[parameter_name]
    new_params = []
    for value in parameter_values:    
        for param in params:
            updated_param = param.copy()
            updated_param[parameter_name] = value
            new_params.append(updated_param)
    params = new_params
    
for param in params:
    print(('Training RandomForest with params: ', param))
    clf.set_params(**param)
    clf.set_params(oob_score=True)
    
    # Это то, что нам требовалось делать раньше
#     fold_aucs = []
#     for train_idx, test_idx in kf.split(X_train):
#         X_train_fold, X_test_fold = X_train.iloc[train_idx], X_train.iloc[test_idx]
#         y_train_fold, y_test_fold = y_train.iloc[train_idx], y_train.iloc[test_idx]
#         clf.fit(X_train_fold, y_train_fold)
#         preds = clf.predict_proba(X_test_fold)
#         auc = roc_auc_score(y_test_fold, preds[:, 1])
#         fold_aucs.append(auc)
#     print(('AUC: ', np.mean(fold_aucs)))

    # А это то, что мы можем делать сейчас без всех махинаций с KFold'ом выше
    clf.fit(X_train, y_train)
    oob_score = clf.oob_score_
    
    print(('OOB: ', oob_score))
    if oob_score > best_score:
        best_score = oob_score
        best_params = param

print('Best params:')
best_params

('Training RandomForest with params: ', {'n_estimators': 1, 'min_samples_split': 2})
('OOB: ', 0.64983164983164987)
('Training RandomForest with params: ', {'n_estimators': 2, 'min_samples_split': 2})
('OOB: ', 0.72502805836139173)
('Training RandomForest with params: ', {'n_estimators': 3, 'min_samples_split': 2})
('OOB: ', 0.74186307519640848)
('Training RandomForest with params: ', {'n_estimators': 10, 'min_samples_split': 2})
('OOB: ', 0.80134680134680136)
('Training RandomForest with params: ', {'n_estimators': 35, 'min_samples_split': 2})
('OOB: ', 0.81369248035914699)
('Training RandomForest with params: ', {'n_estimators': 1, 'min_samples_split': 5})
('OOB: ', 0.68462401795735128)
('Training RandomForest with params: ', {'n_estimators': 2, 'min_samples_split': 5})
('OOB: ', 0.7048260381593715)
('Training RandomForest with params: ', {'n_estimators': 3, 'min_samples_split': 5})
('OOB: ', 0.7407407407407407)
('Training RandomForest with params: ', {'n_estimators': 10, 'min_sample

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


('OOB: ', 0.81930415263748602)
('Training RandomForest with params: ', {'n_estimators': 1, 'min_samples_split': 10})
('OOB: ', 0.68686868686868685)
('Training RandomForest with params: ', {'n_estimators': 2, 'min_samples_split': 10})
('OOB: ', 0.69248035914702577)
('Training RandomForest with params: ', {'n_estimators': 3, 'min_samples_split': 10})
('OOB: ', 0.7407407407407407)
('Training RandomForest with params: ', {'n_estimators': 10, 'min_samples_split': 10})
('OOB: ', 0.8125701459034792)
('Training RandomForest with params: ', {'n_estimators': 35, 'min_samples_split': 10})
('OOB: ', 0.82154882154882158)
Best params:


  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


{'min_samples_split': 10, 'n_estimators': 35}

## Практическое задание 1
Используя понравившийся метод, попробуйте подобрать самые важные на ваш взгляд параметры для RandomForestClassifier и GradientBoostingClassifier

In [18]:
from sklearn.ensemble import GradientBoostingClassifier

In [19]:
clf = GradientBoostingClassifier()

In [21]:
?GradientBoostingClassifier

In [22]:
param_grid = {"max_depth" : [4, 3, 5]}
gs = GridSearchCV(clf, param_grid=param_grid, verbose=5, cv=4)
gs.fit(X_train, y_train)

Fitting 4 folds for each of 3 candidates, totalling 12 fits
[CV] max_depth=4 .....................................................
[CV] ...................... max_depth=4, score=0.808036, total=   0.0s
[CV] max_depth=4 .....................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s


[CV] ...................... max_depth=4, score=0.856502, total=   0.0s
[CV] max_depth=4 .....................................................
[CV] ...................... max_depth=4, score=0.819820, total=   0.0s
[CV] max_depth=4 .....................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.4s remaining:    0.0s


[CV] ...................... max_depth=4, score=0.837838, total=   0.0s
[CV] max_depth=3 .....................................................
[CV] ...................... max_depth=3, score=0.799107, total=   0.0s
[CV] max_depth=3 .....................................................
[CV] ...................... max_depth=3, score=0.856502, total=   0.0s
[CV] max_depth=3 .....................................................
[CV] ...................... max_depth=3, score=0.810811, total=   0.0s
[CV] max_depth=3 .....................................................
[CV] ...................... max_depth=3, score=0.837838, total=   0.0s
[CV] max_depth=5 .....................................................
[CV] ...................... max_depth=5, score=0.790179, total=   0.0s
[CV] max_depth=5 .....................................................
[CV] ...................... max_depth=5, score=0.865471, total=   0.0s
[CV] max_depth=5 .....................................................
[CV] .

[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    1.3s finished


GridSearchCV(cv=4, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': [4, 3, 5]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=5)

## Предсказание моделей для стеккинга

## Предсказание моделей для стекинга. Способ 1
"В лоб"

In [23]:
def cross_val_predict_proba(estimator, X_train, y_train):
    kfold = KFold(n_splits=4, shuffle=True, random_state=None)
    estimator_scores = np.zeros_like(y_train)
    for train_idx, test_idx in kfold.split(X_train):
        X_train_fold, X_pred_fold = X_train.iloc[train_idx], X_train.iloc[test_idx]
        y_train_fold, _ = y_train.iloc[train_idx], y_train.iloc[test_idx]
        estimator.fit(X_train_fold, y_train_fold)
        estimator_scores[test_idx] = estimator.predict_proba(X_test_fold)[:, 1]
    return estimator_scores
#     return cross_val_predict(estimator, X_train, y_train, cv=kfold, method='predict_proba')

# инициализирем модели с подобранными гиперпараметрами
rf_estimator = RandomForestClassifier()
gb_estimator = GradientBoostingClassifier()

# получаем предсказания вероятностей ансамблей на кросс-валидации для обучающей выборки
rf_train_pred = cross_val_predict_proba(rf_estimator, X_train, y_train)
gb_train_pred = cross_val_predict_proba(gb_estimator, X_train, y_train)

X_train_stack = np.stack([rf_train_pred, gb_train_pred], axis=1)

# получаем предсказания ансамблей для тестовой выборки
rf_test_pred = rf_estimator.fit(X_train, y_train).predict_proba(X_test)
gb_test_pred = gb_estimator.fit(X_train, y_train).predict_proba(X_test)

X_test_stack = np.stack([rf_test_pred[:,1], gb_test_pred[:,1]], axis=1)

  
  
  
  


In [24]:
X_test_stack

array([[ 0.1       ,  0.04635935],
       [ 0.3       ,  0.12779691],
       [ 0.3       ,  0.13986669],
       [ 0.8       ,  0.14068034],
       [ 0.4       ,  0.4036792 ],
       [ 0.1       ,  0.11170511],
       [ 0.1       ,  0.24903108],
       [ 0.        ,  0.28030803],
       [ 0.5       ,  0.90190717],
       [ 0.        ,  0.08107034],
       [ 0.        ,  0.09280331],
       [ 0.33333333,  0.07143607],
       [ 1.        ,  0.933683  ],
       [ 0.2       ,  0.12858014],
       [ 1.        ,  0.86025295],
       [ 1.        ,  0.9270053 ],
       [ 0.1       ,  0.07317554],
       [ 0.7       ,  0.16838866],
       [ 0.5       ,  0.53839304],
       [ 0.7       ,  0.37241087],
       [ 0.6       ,  0.28427084],
       [ 0.6       ,  0.49216374],
       [ 1.        ,  0.94539501],
       [ 0.6       ,  0.40325586],
       [ 0.8       ,  0.93250822],
       [ 0.        ,  0.04924125],
       [ 1.        ,  0.95974208],
       [ 0.6       ,  0.16838866],
       [ 0.55      ,

## Предсказание моделей для стекинга. Способ 2
Красивый с использованием метода cross_val_predict()

In [25]:
from sklearn.model_selection import cross_val_predict

def cross_val_predict_proba(estimator, X_train, y_train):
    kfold = KFold(n_splits=4, shuffle=True, random_state=None)
    return cross_val_predict(estimator, X_train, y_train, cv=kfold, method='predict_proba')

# TODO: подобрать гиперпараметры для ансамблей

# инициализирем модели с подобранными гиперпараметрами
rf_estimator = RandomForestClassifier()
gb_estimator = GradientBoostingClassifier()

# получаем предсказания вероятностей ансамблей на кросс-валидации для обучающей выборки
rf_train_pred = cross_val_predict_proba(rf_estimator, X_train, y_train)
gb_train_pred = cross_val_predict_proba(gb_estimator, X_train, y_train)

X_train_stack = np.stack([rf_train_pred[:, 1], gb_train_pred[:, 1]], axis=1)

# получаем предсказания ансамблей для тестовой выборки
rf_test_pred = rf_estimator.fit(X_train, y_train).predict_proba(X_test)
gb_test_pred = gb_estimator.fit(X_train, y_train).predict_proba(X_test)

X_test_stack = np.stack([rf_test_pred[:,1], gb_test_pred[:,1]], axis=1)

In [81]:
X_test_stack

array([[ 0.1       ,  0.04635935],
       [ 0.1       ,  0.12779691],
       [ 0.5       ,  0.13986669],
       [ 0.6       ,  0.14068034],
       [ 0.3       ,  0.4036792 ],
       [ 0.3       ,  0.11170511],
       [ 0.3       ,  0.24903108],
       [ 0.2       ,  0.28030803],
       [ 1.        ,  0.90190717],
       [ 0.        ,  0.08107034],
       [ 0.        ,  0.09280331],
       [ 0.1       ,  0.07143607],
       [ 1.        ,  0.933683  ],
       [ 0.2       ,  0.12858014],
       [ 1.        ,  0.86025295],
       [ 0.9       ,  0.9270053 ],
       [ 0.1       ,  0.07317554],
       [ 0.7       ,  0.16838866],
       [ 0.5       ,  0.53839304],
       [ 0.4       ,  0.37241087],
       [ 0.2       ,  0.28427084],
       [ 0.5       ,  0.49216374],
       [ 1.        ,  0.94539501],
       [ 0.2       ,  0.40325586],
       [ 1.        ,  0.93250822],
       [ 0.1       ,  0.04924125],
       [ 1.        ,  0.95974208],
       [ 0.6       ,  0.16838866],
       [ 0.6       ,

## Объединяем предсказания ансамблей с помощью логистической регрессии

In [26]:
from sklearn.linear_model import LogisticRegression

# TODO: подобрать гиперпараметры LogisticRegression

logreg = LogisticRegression().fit(X_train_stack, y_train)
predicted = logreg.predict(X_test_stack)

## Формируем файл для отправки

In [27]:
with open('submission.txt', 'w') as out:
    out.write('PassengerId,Survived\n')
    for passenger, y in zip(test['PassengerId'], predicted):
        out.write('%s,%s\n' % (passenger, y))