# Композиции классификаторов (градиентный бустинг)

In [None]:
import warnings
# Install a blanket "ignore" filter so that warnings raised by the code
# below are not displayed in the notebook output.
warnings.filterwarnings("ignore")

# !pip install catboost

## Библиотеки

In [2]:
import matplotlib.pyplot as plt
from matplotlib.image import imread
from mpl_toolkits import mplot3d
from copy import deepcopy
from mlxtend.plotting import plot_decision_regions
import seaborn as sns
import pandas as pd
from tqdm.notebook import tqdm
from scipy.spatial.distance import cdist
import numpy as np
from sklearn import tree, base
import itertools
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, BaggingClassifier)
from sklearn.svm import SVC, SVR
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import KFold, ParameterGrid
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, RepeatedKFold
from sklearn.datasets import make_classification, make_regression, load_wine, load_boston
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from torchvision import datasets, transforms
import torch

import xgboost as xgb
from catboost import CatBoostClassifier, Pool

## ComBoost

In [3]:
class ComBoost(object):
    """Committee boosting (ComBoost) built on any probabilistic classifier.

    The first member is trained on the whole sample.  Every following member
    is trained on a window of the objects ordered by their accumulated margin
    (ascending); several window ends are tried and the member that leaves the
    fewest objects with a negative accumulated margin is kept.  The committee
    prediction is the mean of the members' class probabilities.

    Parameters
    ----------
    base_estimator : estimator or None
        Prototype classifier exposing ``fit``, ``predict_proba`` and
        ``classes_``.  ``None`` selects ``DecisionTreeClassifier(max_depth=1)``.
    n_estimators : int
        Number of committee members.

    Attributes
    ----------
    b : list
        The committee members (unfitted clones until ``fit`` is called).
    """

    def __init__(self, base_estimator=None, n_estimators=10):
        self.n_estimators = n_estimators
        # Compare with None explicitly: the truthiness of an estimator object
        # is not a reliable "was it supplied" check (container-like
        # estimators implement ``__len__``).
        if base_estimator is None:
            base_estimator = DecisionTreeClassifier(max_depth=1)
        self.base_estimator = base_estimator
        self.b = [base.clone(self.base_estimator)
                  for _ in range(self.n_estimators)]

    def get_params(self, deep=True):
        """Return the constructor parameters (used by ``sklearn.base.clone``)."""
        return {'n_estimators': self.n_estimators,
                'base_estimator': self.base_estimator}

    @staticmethod
    def fix_predict_proba(pred, b, b0):
        """Expand ``pred`` to the class layout of the reference model ``b0``.

        A member trained on a window may have seen only a subset of classes,
        so its ``predict_proba`` has fewer columns.  Each column is moved to
        the position its label has in ``b0.classes_``; labels the member never
        saw get probability 0.

        Parameters
        ----------
        pred : array-like of shape (n_samples, len(b.classes_))
            Probabilities returned by ``b.predict_proba``.
        b : fitted estimator that produced ``pred``.
        b0 : fitted reference estimator defining the full class list.

        Returns
        -------
        numpy.ndarray of shape (n_samples, len(b0.classes_))
        """
        new_pred = np.zeros((len(pred), len(b0.classes_)))
        # Locate labels by value rather than using the label itself as a
        # column index, so labels need not be 0..K-1.  Relies on
        # ``b0.classes_`` being sorted and containing every label of ``b``.
        columns = np.searchsorted(b0.classes_, b.classes_)
        new_pred[:, columns] = pred
        return new_pred

    @staticmethod
    def _margins(proba, y_idx):
        """Per-object margin: P(true class) minus the best rival probability."""
        rows = np.arange(len(y_idx))
        true_proba = proba[rows, y_idx]
        rivals = proba.copy()
        # Mask the true class so the max below is taken over the other classes.
        rivals[rows, y_idx] = -1
        return true_proba - rivals.max(axis=1)

    def fit(self, X, Y, l0=0, l1=100, l2=None, dl=100):
        """Fit the committee.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        Y : array-like of shape (n_samples,)
        l0 : int
            Start of the training window in the margin-sorted sample.
        l1, l2, dl : int
            Candidate window ends are ``range(l1, l2, dl)``; ``l2`` defaults
            to ``len(X)``.  When that range is empty a single window ending
            at ``l2`` is used.

        Returns
        -------
        self
        """
        X = np.asarray(X)
        Y = np.asarray(Y)
        if not self.b:
            return self
        if l2 is None:
            l2 = len(X)

        first = self.b[0]
        first.fit(X, Y)
        # Column index of every object's true class in the reference layout.
        y_idx = np.searchsorted(first.classes_, Y)
        M = self._margins(first.predict_proba(X), y_idx)

        # Guard against an empty candidate list (small samples).
        window_ends = list(range(l1, l2, dl)) or [l2]

        for t in range(1, len(self.b)):
            # Stable ascending order: hardest (lowest-margin) objects first.
            order = np.argsort(M, kind='stable')
            X_sorted, Y_sorted = X[order], Y[order]

            best = None
            for k in window_ends:
                candidate = base.clone(self.base_estimator)
                candidate.fit(X_sorted[l0:k], Y_sorted[l0:k])
                proba = self.fix_predict_proba(candidate.predict_proba(X),
                                               candidate, first)
                M_new = self._margins(proba, y_idx)
                # Number of objects the enlarged committee would still get wrong.
                Q = int((M + M_new < 0).sum())
                # Strict '<' keeps the first of equally good candidates.
                if best is None or Q < best[0]:
                    best = (Q, candidate, M_new)

            # Keep the very model that was scored, together with its margins,
            # instead of refitting and remapping with a different model.
            _, self.b[t], best_margins = best
            M = M + best_margins
        return self

    def predict(self, X):
        """Return the label with the highest mean probability for each row."""
        probas = self.predict_proba(X)
        labels = np.asarray(self.b[0].classes_)
        return labels[np.argmax(probas, axis=1)]

    def predict_proba(self, X):
        """Mean of the members' probabilities in the layout of ``self.b[0]``."""
        return np.mean([self.fix_predict_proba(elem.predict_proba(X), elem, self.b[0])
                        for elem in self.b], axis=0)

In [4]:
# Synthetic classification sample: 1000 objects, 20 features
# (15 informative + 5 redundant), fixed seed.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=6,
)
model = ComBoost(base_estimator=DecisionTreeClassifier(max_depth=2))

# 10 stratified folds repeated 3 times; fold errors are re-raised.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model, X, y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
)
print('Accuracy: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

Accuracy: 0.803 (0.039)


In [5]:
# Same synthetic sample as before, committee of SVMs with probability output.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=6,
)
model = ComBoost(base_estimator=SVC(probability=True))

# 10 stratified folds repeated 3 times; fold errors are re-raised.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model, X, y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
)
print('Accuracy: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

Accuracy: 0.945 (0.018)


In [6]:
# Same synthetic sample, committee of logistic regressions.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=6,
)
model = ComBoost(base_estimator=LogisticRegression())

# 10 stratified folds repeated 3 times; fold errors are re-raised.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model, X, y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
)
print('Accuracy: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

Accuracy: 0.798 (0.034)


## Gradient Boosting

In [7]:
class GradientBoostingRegression(object):
    """Additive regression ensemble fitted on successive residuals.

    Each member is trained on what the previous members left unexplained
    (target minus the sum of earlier predictions); the ensemble prediction is
    the plain sum of the members' predictions.

    Parameters
    ----------
    base_estimator : estimator or None
        Prototype regressor exposing ``fit`` and ``predict``.  ``None``
        selects ``DecisionTreeRegressor(max_depth=1)``.
    n_estimators : int
        Number of members.

    Attributes
    ----------
    b : list
        The ensemble members (unfitted clones until ``fit`` is called).
    """

    def __init__(self, base_estimator=None, n_estimators=10):
        self.n_estimators = n_estimators
        # Compare with None explicitly: the truthiness of an estimator object
        # is not a reliable "was it supplied" check (container-like
        # estimators implement ``__len__``).
        if base_estimator is None:
            base_estimator = DecisionTreeRegressor(max_depth=1)
        self.base_estimator = base_estimator

        self.b = [base.clone(self.base_estimator)
                  for _ in range(self.n_estimators)]

    def get_params(self, deep=True):
        """Return the constructor parameters (used by ``sklearn.base.clone``)."""
        return {'n_estimators': self.n_estimators,
                'base_estimator': self.base_estimator}

    def score(self, X, Y):
        """Return the mean squared error on ``(X, Y)``.

        NOTE: this is an error, so lower is better.
        """
        return ((self.predict(X) - np.asarray(Y)) ** 2).mean()

    def fit(self, X, Y):
        """Fit the members one after another on the running residual.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        Y : array-like of shape (n_samples,)
            Targets; integer arrays and plain sequences are accepted.  ``Y``
            itself is never modified.

        Returns
        -------
        self
        """
        # Work on a float copy: an integer residual cannot absorb float
        # predictions, and the caller's targets must stay untouched.
        residual = np.array(Y, dtype=float)
        for member in self.b:
            member.fit(X, residual)
            # Rebind instead of subtracting in place so that a member which
            # kept a reference to its training target is not altered.
            residual = residual - member.predict(X)
        return self

    def predict(self, X):
        """Return the sum of all members' predictions for ``X``."""
        return np.sum([elem.predict(X) for elem in self.b], axis=0)
    

In [8]:
# Synthetic regression sample: 1000 objects, 20 features, fixed seed.
X, y = make_regression(
    n_samples=1000,
    n_features=20,
    random_state=6,
)
model = GradientBoostingRegression(
    base_estimator=DecisionTreeRegressor(max_depth=2),
)

# No `scoring` is passed, so the values come from `model.score`,
# which for GradientBoostingRegression is the mean squared error.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model, X, y,
    cv=cv,
    n_jobs=-1,
)
print('SCORE: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

SCORE: 5392.988 (837.368)


In [9]:
# Synthetic regression sample: 1000 objects, 20 features, fixed seed.
X, y = make_regression(
    n_samples=1000,
    n_features=20,
    random_state=6,
)
model = GradientBoostingRegression(base_estimator=SVR())

# No `scoring` is passed, so the values come from `model.score`,
# which for GradientBoostingRegression is the mean squared error.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model, X, y,
    cv=cv,
    n_jobs=-1,
)
print('SCORE: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

SCORE: 5340.377 (1169.583)


In [10]:
# Synthetic regression sample: 1000 objects, 20 features, fixed seed.
X, y = make_regression(
    n_samples=1000,
    n_features=20,
    random_state=6,
)
model = GradientBoostingRegression(base_estimator=SVR(kernel='linear'))

# No `scoring` is passed, so the values come from `model.score`,
# which for GradientBoostingRegression is the mean squared error.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model, X, y,
    cv=cv,
    n_jobs=-1,
)
print('SCORE: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

SCORE: 0.002 (0.000)


In [11]:
# Synthetic regression sample: 1000 objects, 20 features, fixed seed.
X, y = make_regression(
    n_samples=1000,
    n_features=20,
    random_state=6,
)
model = GradientBoostingRegression(base_estimator=LinearRegression())

# No `scoring` is passed, so the values come from `model.score`,
# which for GradientBoostingRegression is the mean squared error.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model, X, y,
    cv=cv,
    n_jobs=-1,
)
print('SCORE: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

SCORE: 0.000 (0.000)


## XGBoost

In [12]:
# Synthetic classification sample: 1000 objects, 20 features
# (15 informative + 5 redundant), fixed seed.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=6,
)
model = xgb.XGBClassifier(
    objective='binary:logistic',
    random_state=6,
)

# No `scoring` is passed, so the estimator's own `score` method is used.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model, X, y,
    cv=cv,
    n_jobs=-1,
)
print('SCORE: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

SCORE: 0.883 (0.034)


In [13]:
# Synthetic regression sample: 1000 objects, 20 features, fixed seed.
X, y = make_regression(
    n_samples=1000,
    n_features=20,
    random_state=6,
)
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=6,
)

# No `scoring` is passed, so the estimator's own `score` method is used.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model, X, y,
    cv=cv,
    n_jobs=-1,
)
print('SCORE: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

SCORE: 0.933 (0.009)


## CatBoost

In [14]:
# Synthetic classification sample: 1000 objects, 20 features
# (15 informative + 5 redundant), fixed seed.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=6,
)

# Deliberately tiny model: 2 boosting iterations of depth-2 trees, on CPU.
model = CatBoostClassifier(
    iterations=2,
    depth=2,
    learning_rate=1,
    loss_function='Logloss',
    verbose=True,
    task_type='CPU',
)

# No `scoring` is passed, so the estimator's own `score` method is used.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model, X, y,
    cv=cv,
    n_jobs=-1,
)
print('SCORE: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

SCORE: 0.749 (0.034)


In [15]:
# Synthetic classification sample: 1000 objects, 20 features
# (15 informative + 5 redundant), fixed seed.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=6,
)

# Same tiny model as the CPU run, but trained with task_type='GPU'.
model = CatBoostClassifier(
    iterations=2,
    depth=2,
    learning_rate=1,
    loss_function='Logloss',
    verbose=True,
    task_type='GPU',
)

# No `scoring` is passed, so the estimator's own `score` method is used.
# NOTE(review): n_jobs=-1 runs the CV fits in parallel while every fit asks
# for the GPU - confirm the device copes with concurrent training jobs.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model, X, y,
    cv=cv,
    n_jobs=-1,
)
print('SCORE: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

SCORE: 0.726 (0.041)
