# Лабораторная работа 4-5 - XGBoost и градиентный бустинг

In [2]:
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
%matplotlib inline

In [3]:
df = pd.read_csv('bank-full.csv', sep = ';')
df.head()  # смотрим на данные

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


## Подготовка данных

Перекодируем качественные предикторы:

1. для **default**, **housing** , **loan** переведём yes/no в 1/0

2. **marital** - 1, если состоит в браке (married), 0 - одинокий/разведён (single/divorced)

3. **education** - переведём в ранговую переменную: 0 - unknown, 1 - primary, 2 - secondary, 3 -tertiary

4. **job** - 0 - unemployed/housemaid/unknown/retired, 1 - student, 2 - работник любой профессии

5. **poutcome** - 0 - unknown/failure, 1 - other/success

6. **contact** - 0 - unknown, 1 - cellular/telephone

Не будем учитывать предикторы **day** и **month**, отвечающие за дату.

In [4]:
df = df.replace(to_replace=['no', 'yes'], value=[0, 1]) # переводим yes/no в 1/0
df[['y', 'default', 'housing', 'loan']] = df[['y', 'default', 'housing', 'loan']].astype('int64')

df = df.replace(to_replace=['married', 'single', 'divorced'], value=[1, 0, 0]) # состоит ли в браке

df = df.replace(to_replace=['unknown','primary', 'secondary','tertiary'], value=[0, 1, 2, 3]) # уровень образования

df = df.replace(to_replace=['unemployed', 'housemaid', 'unknown', 'retired',
                            'student',
                            'management', 'technician', 'entrepreneur','blue-collar', 'admin.', 'services', 'self-employed'], 
                value=[0, 0, 0, 0, 1, 2 , 2, 2, 2, 2, 2, 2]) # работа

df = df.replace(to_replace=['unknown', 'failure', 'other', 'success'], value=[0, 0, 1, 1]) # результат предыдущей сделки

df = df.replace(to_replace=['unknown', 'cellular', 'telephone'], value=[0, 1, 1]) # 

df[['y', 'default', 'housing', 'loan', 'marital', 'education', 'job', 'poutcome']] = df[['y', 'default', 
                                        'housing', 'loan', 'marital', 'education', 'job', 'poutcome']].astype('int64')


df = df.drop(['day', 'month'], axis = 1)

df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,duration,campaign,pdays,previous,poutcome,y
0,58,2,1,3,0,2143,1,0,0,261,1,-1,0,0,0
1,44,2,0,2,0,29,1,0,0,151,1,-1,0,0,0
2,33,2,1,2,0,2,1,1,0,76,1,-1,0,0,0
3,47,2,1,0,0,1506,1,0,0,92,1,-1,0,0,0
4,33,0,0,0,0,1,0,0,0,198,1,-1,0,0,0


В полученном датасете число значений "0" целевой переменной превышает число значений "1" почти в 8 раз. Таким образом, наблюдается дисбаланс классов.

In [5]:
df['y'].value_counts()

0    39922
1     5289
Name: y, dtype: int64

In [6]:
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler

Сбалансируем выборку, увеличив число строк со значением отклика 0.

In [7]:
num_0 = len(df[df['y'] == 0])
num_1 = len(df[df['y'] == 1])
print(num_0,num_1)

# оверсэмплинг
oversampled_data = pd.concat([ df[df['y'] == 0] , df[df['y'] == 1].sample(num_0, replace=True) ])
print(len(oversampled_data))

oversampled_data.head()

39922 5289
79844


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,duration,campaign,pdays,previous,poutcome,y
0,58,2,1,3,0,2143,1,0,0,261,1,-1,0,0,0
1,44,2,0,2,0,29,1,0,0,151,1,-1,0,0,0
2,33,2,1,2,0,2,1,1,0,76,1,-1,0,0,0
3,47,2,1,0,0,1506,1,0,0,92,1,-1,0,0,0
4,33,0,0,0,0,1,0,0,0,198,1,-1,0,0,0


## XGBoost

In [8]:
X = df.iloc[:,:-1].values
y = df['y'].values


from sklearn.model_selection import train_test_split  # расщепим на тестовую и обучающую выборку
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 88)

In [11]:
from xgboost import XGBClassifier
from sklearn import metrics
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

In [30]:
param_grid_xgboost = {  
    'n_estimators': [250, 500, 1000],
    'max_depth': [3, 4, 6],
    'min_child_weight': [2, 3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2, 0.5],
}

model = XGBClassifier(seed=42,
                      n_estimators=100,
                      max_depth=6,
                      min_child_weight = 4,
                      learning_rate = 0.25,
                      gamma=0.001)
model.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0.001, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.25, max_delta_step=0,
              max_depth=6, min_child_weight=4, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

В качестве целевой метрики выберем **precision**: этот показатель можно проинтерпретировать как нацеленность на заинтересованных клиентов и избегание предложений незаинтересованной аудитории (они, вероятно, будут расценены как спам).

In [31]:
grid_CV_1 = GridSearchCV(
       estimator=model,
       param_grid=param_grid_xgboost,
       scoring='precision',
       cv=5,
       n_jobs=-1)


grid_CV_1.fit(X_train, y_train)





GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1,
                                     enable_categorical=False, gamma=0.001,
                                     gpu_id=-1, importance_type=None,
                                     interaction_constraints='',
                                     learning_rate=0.25, max_delta_step=0,
                                     max_depth=6, min_child_weight=4,
                                     missing=nan, monotone_constraints='()',
                                     n_estimators=100, n_jobs=4,
                                     num_parallel_tree=1, predictor='auto',
                                     random_state=42, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=42, subsample=1,
                                     tree

In [32]:
grid_CV_1.best_params_

{'learning_rate': 0.01,
 'max_depth': 3,
 'min_child_weight': 9,
 'n_estimators': 250}

In [37]:
model = XGBClassifier(seed=42,
                      learning_rate = 0.01,
                      max_depth = 3,
                      min_child_weight = 9,
                      n_estimators = 250)
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
print (metrics.classification_report(y_train, y_pred_train))

y_pred_test = model.predict(X_test)
print (metrics.classification_report(y_test, y_pred_test))



              precision    recall  f1-score   support

           0       0.90      0.99      0.94     31938
           1       0.68      0.21      0.33      4230

    accuracy                           0.90     36168
   macro avg       0.79      0.60      0.63     36168
weighted avg       0.88      0.90      0.87     36168

              precision    recall  f1-score   support

           0       0.91      0.98      0.94      7984
           1       0.65      0.22      0.33      1059

    accuracy                           0.90      9043
   macro avg       0.78      0.60      0.64      9043
weighted avg       0.88      0.90      0.87      9043



## Градиентный бустинг

In [40]:
from sklearn.ensemble import GradientBoostingClassifier

In [42]:
param_grid_gr_boost = {  
    'n_estimators': [250, 500, 1000],
    'learning_rate': [0.01, 0.1, 0.2, 0.5],
    'min_samples_leaf': [2, 3, 4, 5, 6, 8],
    'min_samples_split': [2, 3, 4, 5, 6, 8],
    'learning_rate': [0.01, 0.1, 0.2, 0.5],
}

model2 = GradientBoostingClassifier(random_state=88,
                                   max_features='sqrt', 
                                   n_estimators=500,
                                   loss='deviance', 
                                   learning_rate=0.01, 
                                   criterion='friedman_mse', 
                                   min_impurity_decrease=0.001, 
                                   min_samples_leaf=5, 
                                   min_samples_split=10,
                                   max_depth=5)
model2.fit(X_train, y_train)

GradientBoostingClassifier(learning_rate=0.01, max_depth=5, max_features='sqrt',
                           min_impurity_decrease=0.001, min_samples_leaf=5,
                           min_samples_split=10, n_estimators=500,
                           random_state=88)

In [49]:
grid_CV_2 = GridSearchCV(
       estimator=model2,
       param_grid=param_grid_gr_boost,
       scoring='precision',
       cv=5,
       n_jobs=-1)

grid_CV_2.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=GradientBoostingClassifier(learning_rate=0.01,
                                                  max_depth=5,
                                                  max_features='sqrt',
                                                  min_impurity_decrease=0.001,
                                                  min_samples_leaf=5,
                                                  min_samples_split=10,
                                                  n_estimators=500,
                                                  random_state=88),
             n_jobs=-1,
             param_grid={'learning_rate': [0.01, 0.1, 0.2, 0.5],
                         'min_samples_leaf': [2, 3, 4, 5, 6, 8],
                         'min_samples_split': [2, 3, 4, 5, 6, 8],
                         'n_estimators': [250, 500, 1000]},
             scoring='precision')

In [50]:
grid_CV_2.best_params_

{'learning_rate': 0.01,
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'n_estimators': 250}

In [54]:
model2 = GradientBoostingClassifier(random_state=88,
                      learning_rate= 0.01,
                      min_samples_leaf= 2,
                      min_samples_split= 5,
                      n_estimators= 250)
model2.fit(X_train, y_train)

y_pred_train = model2.predict(X_train)
print (metrics.classification_report(y_train, y_pred_train))

y_pred_test = model2.predict(X_test)
print (metrics.classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.91      0.98      0.95     31938
           1       0.68      0.27      0.39      4230

    accuracy                           0.90     36168
   macro avg       0.80      0.63      0.67     36168
weighted avg       0.88      0.90      0.88     36168

              precision    recall  f1-score   support

           0       0.91      0.98      0.94      7984
           1       0.64      0.26      0.37      1059

    accuracy                           0.90      9043
   macro avg       0.77      0.62      0.66      9043
weighted avg       0.88      0.90      0.88      9043



## Сравним с Random Forest

Изначально случайный лес обучался на датасете без балансировки классов и с меньшим количеством переменных. Обучим ещё раз на новом датасете.

In [9]:
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier(random_state=88,
                               n_estimators=70,
                               criterion='gini',
                               max_depth=5,
                               oob_score=True,
                               warm_start=False,
                               class_weight=None,
                               min_samples_split=5
                              )
model_rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=5, min_samples_split=5, n_estimators=70,
                       oob_score=True, random_state=88)

In [12]:
params_set_cart = {
'n_estimators': [100, 250, 500, 750, 900, 1000],
'criterion': ['gini', 'entropy'],
'max_depth': [2, 3, 4, 5, 6, 8],
'min_samples_split': [2, 3, 4, 5, 6, 8, 10]
}

grid_CV_RF = GridSearchCV(estimator=model_rf,
       param_grid=params_set_cart,
       scoring='precision',
       cv=5,
       n_jobs=-1)


grid_CV_RF.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=RandomForestClassifier(max_depth=5, min_samples_split=5,
                                              n_estimators=70, oob_score=True,
                                              random_state=88),
             n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 3, 4, 5, 6, 8],
                         'min_samples_split': [2, 3, 4, 5, 6, 8, 10],
                         'n_estimators': [100, 250, 500, 750, 900, 1000]},
             scoring='precision')

In [13]:
grid_CV_RF.best_params_

{'criterion': 'entropy',
 'max_depth': 4,
 'min_samples_split': 5,
 'n_estimators': 100}

In [14]:
model_rf = RandomForestClassifier(random_state=88,
                               n_estimators=100,
                               criterion='entropy',
                               max_depth=4,
                               oob_score=True,
                               warm_start=False,
                               class_weight=None,
                               min_samples_split=5
                              )
model_rf.fit(X_train, y_train)

y_pred_train = grid_CV_RF.predict(X_train)  
y_pred_test = grid_CV_RF.predict(X_test)  

print(u'Обучающая выборка')
print(metrics.classification_report(y_train, y_pred_train)) 

print(u'Тестовая выборка')
print(metrics.classification_report(y_test, y_pred_test)) 

Обучающая выборка
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     31938
           1       0.79      0.03      0.06      4230

    accuracy                           0.89     36168
   macro avg       0.84      0.52      0.50     36168
weighted avg       0.87      0.89      0.84     36168

Тестовая выборка
              precision    recall  f1-score   support

           0       0.89      1.00      0.94      7984
           1       0.87      0.03      0.06      1059

    accuracy                           0.89      9043
   macro avg       0.88      0.52      0.50      9043
weighted avg       0.88      0.89      0.84      9043



## Выводы 

Балансировка классов позволила увеличить среднее значение **precision** для случайного леса с 80 до 88 % на тестовой выборке. Также при обучении на сбалансированной выборке **precision** для каждого класса ближе друг к другу по значению.

Градиентный бустинг и XGBoost дают средний показатель **precision** на тестовой выборке на 10-11 % меньше, чем у случайного леса, в то же время **precision** для класса 0 и **accuracy** выше на несколько процентов.

При нацеленности на заинтересованных клиентов и избегании предложений незаинтересованной аудитории лучше выбрать случайный лес в качестве классификатора.