# MLClass. "Прикладной анализ данных"
# Модуль "Машинное обучение с помощью Python"
<img src="../img/mlclass_logo.jpg" height="240" width="240">
## Автор материала: преподаватель ФКН НИУ ВШЭ Кашницкий Юрий
Материал распространяется на условиях лицензии <a href="http://creativecommons.org/licenses/by-sa/4.0/">Creative Commons Attribution-Share Alike 4.0</a>. Можно использовать в любых целях, но с обязательным упоминанием автора курса и аффилиации.

# Урок 6. Нейронные сети. Бустинг. Смешивание алгоритмов. Стекинг.
## Часть 7. Использование API Scikit-learn

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from load_titanic_with_features import load_titanic
from sklearn import cross_validation

class MyBlackBox(BaseEstimator):
    """Blend of grid-searched classifiers combined by score-weighted voting.

    Each base classifier is tuned with ``GridSearchCV``; the best estimator
    found for it is kept together with its best cross-validation score.
    At prediction time the label predictions of the kept estimators are
    averaged with weights proportional to those scores and thresholded
    at 0.5, so the result is only meaningful for binary 0/1 labels.

    Parameters
    ----------
    base_classifiers : dict or None
        Maps an estimator instance to the parameter grid that is searched
        for it. ``None`` (the default) selects a gradient boosting, a random
        forest and a logistic regression classifier, each with an empty grid.
    verbose : passed through to ``GridSearchCV``.
    n_jobs : passed through to ``GridSearchCV``.
    cv : passed through to ``GridSearchCV``.
    """

    def __init__(self, base_classifiers=None, verbose=True, n_jobs=4, cv=5):
        # Constructor arguments are stored untouched. The default set of
        # classifiers is built lazily in ``fit`` so that instances never
        # share one mutable dict of estimator objects created at class
        # definition time.
        self.base_classifiers = base_classifiers
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.cv = cv

    def fit(self, X, y):
        """Grid-search every base classifier on ``(X, y)``.

        Fills ``self.clf_weights``, a dict mapping each best estimator found
        to its best cross-validation score, and prints progress.

        Returns
        -------
        self
        """
        base_classifiers = self.base_classifiers
        if base_classifiers is None:
            base_classifiers = {GradientBoostingClassifier(): {},
                                RandomForestClassifier(): {},
                                LogisticRegression(): {}}

        self.clf_weights = {}
        for clf, params in base_classifiers.items():
            print(clf)
            search = GridSearchCV(clf,
                                  params,
                                  verbose=self.verbose,
                                  n_jobs=self.n_jobs,
                                  cv=self.cv)
            search.fit(X, y)
            self.clf_weights[search.best_estimator_] = search.best_score_
        print(self.clf_weights)
        return self

    def predict(self, X):
        """Return the weighted-vote prediction for every row of ``X``.

        Returns
        -------
        numpy array of shape ``(X.shape[0], 1)`` and dtype ``int64`` holding
        1 where the score-weighted average of the base predictions exceeds
        0.5 and 0 elsewhere.
        """
        n_samples = X.shape[0]
        total_weight = sum(self.clf_weights.values())

        final_predictions = np.zeros([n_samples, 1])
        for clf, weight in self.clf_weights.items():
            # Each estimator contributes its predicted labels scaled by its
            # share of the total cross-validation score.
            final_predictions += (weight / total_weight
                                  * clf.predict(X).reshape([n_samples, 1]))
        return (final_predictions > 0.5).astype('int64')

In [None]:
# Load the data through the project helper; presumably it returns the
# training features, the training target and the test features (confirm
# against load_titanic_with_features).
X_train, y, X_test = load_titanic("../data/titanic_train.csv",
                                "../data/titanic_test.csv")


# Parameter grids searched for each base classifier.
forest_params = {'criterion': ('gini', 'entropy'),
                 'n_estimators': list(range(50, 300, 50)),
                 'max_depth': list(range(1, 5)),
                 'min_samples_leaf': list(range(1, 5))}

gboost_params = {'learning_rate': [0.1, 0.2, 0.3],
                 'n_estimators': list(range(10, 100, 20)),
                 'max_depth': list(range(1, 5)),
                 'min_samples_leaf': list(range(1, 5)),
                 # Starts at 2: an integer min_samples_split below 2 is
                 # rejected by scikit-learn >= 0.18, and a node holding a
                 # single sample cannot be split anyway.
                 'min_samples_split': list(range(2, 5))}

log_reg_params = {'C': [0.1, 5, 10, 50]}

clf = MyBlackBox(base_classifiers={GradientBoostingClassifier(): gboost_params,
                                   RandomForestClassifier(): forest_params,
                                   LogisticRegression(): log_reg_params},
                     cv=3)

# Fit on the full training set; this fitted model is what later cells use
# for the test-set predictions.
clf.fit(X_train, y)

# Estimate the accuracy of the whole blending procedure with 3-fold CV.
scores = cross_validation.cross_val_score(clf, X_train,
                                          y, cv=3, scoring="accuracy")

In [7]:
# Report the mean cross-validation accuracy.
mean_cv_accuracy = np.mean(scores)
print(mean_cv_accuracy)

# Predict the test set with the model fitted on the full training data.
predictions = clf.predict(X_test)

# Index the rows with consecutive ids starting at 892 (presumably the first
# PassengerId of the Kaggle test file -- confirm against the data).
first_passenger_id = 892
passenger_ids = np.arange(first_passenger_id,
                          first_passenger_id + X_test.shape[0])

predicted_df = pd.DataFrame(data=predictions,
                            index=passenger_ids,
                            columns=["Survived"])

# Write the submission file, labelling the index column "PassengerId".
predicted_df.to_csv("../output/titanic_myblackbox.csv",
                    index_label="PassengerId")

0.829405162738


**У такой посылки на Kaggle результат 0.7799**