### Импорт необходимых библиотек

In [1]:
ыimport pandas as pd # Для чтения датафрейма
from sklearn.preprocessing import LabelEncoder # для кодирования переменных
from sklearn.model_selection import train_test_split # для разделения на трейн и тест
from sklearn.utils import resample # Для апсемплинга
from sklearn.ensemble import AdaBoostClassifier # по заданию
from sklearn.ensemble import GradientBoostingClassifier  # по заданию
from sklearn.metrics import classification_report # для вывода результата работы модели
import numpy as np
from sklearn.model_selection import GridSearchCV

### Чтение данных

In [2]:
data = pd.read_csv('BankChurners.csv')

In [3]:
### Удалим признаки, которые были отмечены создателем датасета на каггле
data = data.drop(columns=data.columns[-2:])

In [4]:
### также для моделирования не нужен номер клиента
data = data.drop(columns='CLIENTNUM')

### Кодирование переменных
Не имеет смысла еще как-то предобрабатывать данные для модели бустинга, так как в данных нет пропусков, а все численные значения обработают деревья в алгоритмах бустинга

In [5]:
data.isna().any()

Attrition_Flag              False
Customer_Age                False
Gender                      False
Dependent_count             False
Education_Level             False
Marital_Status              False
Income_Category             False
Card_Category               False
Months_on_book              False
Total_Relationship_Count    False
Months_Inactive_12_mon      False
Contacts_Count_12_mon       False
Credit_Limit                False
Total_Revolving_Bal         False
Avg_Open_To_Buy             False
Total_Amt_Chng_Q4_Q1        False
Total_Trans_Amt             False
Total_Trans_Ct              False
Total_Ct_Chng_Q4_Q1         False
Avg_Utilization_Ratio       False
dtype: bool

In [6]:
le = LabelEncoder()

In [7]:
columns_to_transform = data.columns[data.dtypes=='object'].tolist()

In [8]:
for i in columns_to_transform:
    data[i] = le.fit_transform(data[i])

### Деление на трейн и тест

In [9]:
data_train, data_test = train_test_split(data, test_size=0.25)

### Апсемплинг

In [10]:
positive_samples = data_train[data_train.Attrition_Flag==1]
negative_samples = data_train[data_train.Attrition_Flag==0]

In [11]:
positive_samples.shape, negative_samples.shape

((6373, 20), (1222, 20))

In [12]:
negative_samples = resample(negative_samples, n_samples=len(positive_samples))

In [13]:
data_train = pd.concat((positive_samples, negative_samples)).sample(frac=1)

### Финальная обработка выборок

In [14]:
X_train = data_train.drop(columns='Attrition_Flag')
y_train = data_train.Attrition_Flag

X_test = data_test.drop(columns='Attrition_Flag')
y_test = data_test.Attrition_Flag

In [28]:
adaboost_parametrs = {
    'n_estimators': (50, 100, 150),
    'learning_rate': np.linspace(0.01, 0.5, 10)
}
grad_params = {
    'learning_rate': np.linspace(0.01, 0.5, 10),
    'n_estimators': (50, 100, 150),
    'max_depth': (2, 4, 6, 8)
}

In [23]:
adaboost = AdaBoostClassifier()
gradient = GradientBoostingClassifier()

### Построение предсказаний
Используется перебор гиперпараметров моделей из задания

In [26]:
grid_ada = GridSearchCV(adaboost, adaboost_parametrs, verbose=100, n_jobs=2, cv=3)
grid_ada.fit(X_train, y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


GridSearchCV(cv=3, estimator=AdaBoostClassifier(), n_jobs=2,
             param_grid={'learning_rate': array([0.01      , 0.06444444, 0.11888889, 0.17333333, 0.22777778,
       0.28222222, 0.33666667, 0.39111111, 0.44555556, 0.5       ]),
                         'n_estimators': (50, 100, 150)},
             verbose=100)

In [30]:
grid_grad = GridSearchCV(gradient, grad_params, verbose=100, n_jobs=2, cv=3)
grid_grad.fit(X_train, y_train)

Fitting 3 folds for each of 120 candidates, totalling 360 fits
[CV 2/3; 1/30] START learning_rate=0.01, n_estimators=50........................
[CV 2/3; 1/30] END learning_rate=0.01, n_estimators=50;, score=0.769 total time=   0.7s
[CV 3/3; 1/30] START learning_rate=0.01, n_estimators=50........................
[CV 3/3; 1/30] END learning_rate=0.01, n_estimators=50;, score=0.784 total time=   0.5s
[CV 2/3; 2/30] START learning_rate=0.01, n_estimators=100.......................
[CV 2/3; 2/30] END learning_rate=0.01, n_estimators=100;, score=0.779 total time=   1.1s
[CV 1/3; 3/30] START learning_rate=0.01, n_estimators=150.......................
[CV 1/3; 3/30] END learning_rate=0.01, n_estimators=150;, score=0.785 total time=   1.4s
[CV 3/3; 3/30] START learning_rate=0.01, n_estimators=150.......................
[CV 3/3; 3/30] END learning_rate=0.01, n_estimators=150;, score=0.797 total time=   1.6s
[CV 3/3; 4/30] START learning_rate=0.06444444444444444, n_estimators=50.........
[CV 3/3;

GridSearchCV(cv=3, estimator=GradientBoostingClassifier(), n_jobs=2,
             param_grid={'learning_rate': array([0.01      , 0.06444444, 0.11888889, 0.17333333, 0.22777778,
       0.28222222, 0.33666667, 0.39111111, 0.44555556, 0.5       ]),
                         'max_depth': (2, 4, 6, 8),
                         'n_estimators': (50, 100, 150)},
             verbose=100)

In [19]:
adaboost.fit(X_train, y_train)
gradient.fit(X_train, y_train)

GradientBoostingClassifier()

In [27]:
print(classification_report(y_test, grid_ada.predict(X_test)))

              precision    recall  f1-score   support

           0       0.77      0.95      0.85       405
           1       0.99      0.95      0.97      2127

    accuracy                           0.95      2532
   macro avg       0.88      0.95      0.91      2532
weighted avg       0.95      0.95      0.95      2532



In [31]:
print(classification_report(y_test, grid_grad.predict(X_test)))

              precision    recall  f1-score   support

           0       0.92      0.89      0.90       405
           1       0.98      0.99      0.98      2127

    accuracy                           0.97      2532
   macro avg       0.95      0.94      0.94      2532
weighted avg       0.97      0.97      0.97      2532



### Градиентный бустинг показал себя лучше