### CHURN 2

1. Для данных о клиентах банка построены разные модели: [LogisticRegression, XGBClassifier, RandomForestClassifier]
2. Отобрана лучшая модель по метрикам
3. Для отобранной модели сделана оценка экономической эффективности
4. Для лучшей по итогам сравнения модели проведен подбор гиперпараметров

In [None]:
#!pip install xgboost

In [None]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split

import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

### Загрузка и подготовка данных

In [None]:
# Load the churn dataset from an absolute path.
# NOTE(review): "/content/" looks like a Colab working directory — adjust when running elsewhere.
df = pd.read_csv("/content/churn_data.csv")
# Preview the first three rows.
df.head(3)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [None]:
# Remove the pure identifier columns before modelling.
df = df.drop(columns=['CustomerId', 'RowNumber'])

In [None]:
# Share of each target class in the full dataset.
df['Exited'].value_counts(normalize=True)
# Roughly 4:1 class imbalance => evaluate with precision / recall / F-score.

0    0.7963
1    0.2037
Name: Exited, dtype: float64

In [None]:
# Separate the predictors from the 'Exited' target, then split with a fixed
# seed so the partition is reproducible.
features = df.drop(columns='Exited')
target = df['Exited']
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=197)

In [None]:
# Preview the last three rows after the identifier columns were dropped.
df.tail(3)

Unnamed: 0,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
9997,Liu,709,France,Female,36,7,0.0,1,0,1,42085.58,1
9998,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1
9999,Walker,792,France,Female,28,4,130142.79,1,1,0,38190.78,0


### Пайплайн

In [None]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts ``X[column]`` from its input.

    Parameters
    ----------
    column : label used to index the input in ``transform``.
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.column]`` (single-label indexing)."""
        picked = X[self.column]
        return picked
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts ``X[[key]]`` from its input.

    The input is indexed with a one-element list, so the selection keeps a
    column axis (for a pandas DataFrame this is a one-column DataFrame).

    Parameters
    ----------
    key : label of the column to select.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the one-element list ``[self.key]``."""
        wanted = [self.key]
        return X[wanted]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    """One-hot encoder built on ``pd.get_dummies`` with a fixed output layout.

    ``fit`` records the dummy column names produced for the fitting data;
    ``transform`` returns exactly those columns in the same order, adding
    all-zero columns for categories absent from the transformed data and
    discarding dummy columns that were not seen during ``fit``.

    Parameters
    ----------
    key : prefix passed to ``pd.get_dummies`` for the dummy column names.
    """

    def __init__(self, key):
        self.key = key
        # Dummy column names remembered by fit(); empty until fitted.
        self.columns = []

    def fit(self, X, y=None):
        """Remember the dummy column names generated for ``X``."""
        dummies = pd.get_dummies(X, prefix=self.key)
        self.columns = list(dummies.columns)
        return self

    def transform(self, X):
        """Encode ``X`` and align the result to the columns seen in ``fit``."""
        encoded = pd.get_dummies(X, prefix=self.key)
        present = set(encoded.columns)
        absent = [name for name in self.columns if name not in present]
        for name in absent:
            # Category seen at fit time but missing here -> constant 0 column.
            encoded[name] = 0
        return encoded[self.columns]

In [None]:
# Columns that are one-hot encoded by the pipeline.
categorical_columns = ['Geography', 'Gender', 'IsActiveMember']
# Columns that are standardised by the pipeline.
# NOTE(review): 'HasCrCard' appears to be a 0/1 flag in the preview rows yet is scaled as
# continuous, while the 0/1 'IsActiveMember' is treated as categorical — confirm this is intended.
continuous_columns = ['CreditScore', 'Age', 'Tenure', 'HasCrCard', 'Balance', 'NumOfProducts', 'EstimatedSalary']

In [None]:
from sklearn.preprocessing import StandardScaler

# One (name, sub-pipeline) branch per categorical column: select, then one-hot encode.
categorical_branches = [
    (name, Pipeline([
        ('selector', FeatureSelector(column=name)),
        ('ohe', OHEEncoder(key=name)),
    ]))
    for name in categorical_columns
]

# One branch per continuous column: select, then standardise.
continuous_branches = [
    (name, Pipeline([
        ('selector', NumberSelector(key=name)),
        ('standardscaler', StandardScaler()),
    ]))
    for name in continuous_columns
]

# Categorical branches first, continuous ones after — this fixes the output column order.
final_transformers = categorical_branches + continuous_branches

feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

In [None]:
from sklearn.base import clone

# Every pipeline receives its own unfitted copy of the feature union.
# Passing the same `feats` object to all of them would make the pipelines
# share fitted state: fitting one would silently refit the feature step of
# the others.
rfc_pipeline = Pipeline([
    ('features', clone(feats)),
    ('classifier', RandomForestClassifier(random_state=197)),
])

xgb_pipeline = Pipeline([
    ('features', clone(feats)),
    ('classifier', xgb.XGBClassifier()),
])

lr_pipeline = Pipeline([
    ('features', clone(feats)),
    ('classifier', LogisticRegression(random_state=197)),
])

### Обучение

In [None]:
# Fit the random-forest pipeline (feature transformers + classifier) on the training split.
rfc_pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('IsActiveMember',
                                                 Pipeline(steps=[('selector',
   

In [None]:
# Fit the XGBoost pipeline (feature transformers + classifier) on the training split.
xgb_pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('IsActiveMember',
                                                 Pipeline(steps=[('selector',
   

In [None]:
# Fit the logistic-regression pipeline (feature transformers + classifier) on the training split.
lr_pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('IsActiveMember',
                                                 Pipeline(steps=[('selector',
   

In [None]:
# Positive-class probabilities (column 1 of predict_proba) for each fitted pipeline.
rfc_preds, xgb_preds, lr_preds = (
    fitted.predict_proba(X_test)[:, 1]
    for fitted in (rfc_pipeline, xgb_pipeline, lr_pipeline)
)

# Show the first five scores of every model.
print(rfc_preds[:5])
print(xgb_preds[:5].round(2))
print(lr_preds[:5].round(2))

[0.   0.67 0.01 0.93 0.02]
[0.01 0.66 0.02 0.79 0.02]
[0.03 0.5  0.06 0.42 0.04]


### Метрики

In [None]:
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

In [None]:
def make_metrics(preds):
    """Find the decision threshold that maximises F-score on the test labels.

    Parameters
    ----------
    preds : array-like of positive-class scores, aligned with the
        module-level ``y_test``.

    Returns
    -------
    list
        ``[threshold, fscore, precision, recall]`` at the best threshold.
    """
    precision, recall, thresholds = precision_recall_curve(y_test, preds)

    # precision/recall contain one more point than thresholds: the trailing
    # (precision=1, recall=0) end point has no threshold attached. Drop it so
    # the chosen index is always valid for `thresholds`.
    precision, recall = precision[:-1], recall[:-1]

    # F-score is defined as 0 where precision + recall == 0. A plain division
    # would yield NaN there, and np.argmax would then select the NaN entry.
    denominator = precision + recall
    fscore = np.divide(2 * precision * recall, denominator,
                       out=np.zeros_like(denominator, dtype=float),
                       where=denominator > 0)

    ix = np.argmax(fscore)

    return [thresholds[ix], fscore[ix], precision[ix], recall[ix]]

In [None]:
# Best-F-score operating point of every model, one column per model.
metric_names = ['threshold', 'fscore', 'precision', 'recall']
model_preds = {'RFC': rfc_preds, 'XGB': xgb_preds, 'LR': lr_preds}
results = {label: make_metrics(scores) for label, scores in model_preds.items()}

pd.DataFrame(results, index=metric_names)

Unnamed: 0,RFC,XGB,LR
threshold,0.36,0.262445,0.30493
fscore,0.615385,0.62,0.487759
precision,0.627615,0.565506,0.458407
recall,0.603622,0.686117,0.521127


В задаче оттока максимизирую **precision**: перед тратами на взаимодействие с клиентом нужна большая уверенность в том, что он является таргетом, даже если не все уходящие клиенты будут находиться алгоритмом (recall).

### Бизнес-метрика

Можно еще посчитать прибыль с каждой модели:

условно, каждый возвращенный клиент приносит прибыль в 2 доллара США, каждый звонок клиенту стоит 1 доллар США

In [None]:
def count_profit(preds):
    """Profit on the test set at the F-score-optimal threshold.

    Profit model used here: each true positive earns 2, and every predicted
    positive (true or false) costs 1.

    Parameters
    ----------
    preds : array-like of positive-class scores, aligned with the
        module-level ``y_test``.

    Returns
    -------
    Profit as an integer scalar.
    """
    threshold = make_metrics(preds)[0]

    # precision_recall_curve defines thresholds[i] for the decision rule
    # `score >= thresholds[i]`, so the comparison must be inclusive to
    # reproduce the operating point that make_metrics selected.
    predicted = (np.asarray(preds) >= threshold).astype(int)

    # labels=[0, 1] keeps the matrix 2x2 even if one class is never seen.
    conf_matrix = confusion_matrix(y_test, predicted, labels=[0, 1])

    true_positives = conf_matrix[1, 1]
    predicted_positives = conf_matrix[:, 1].sum()  # column 1 = predicted positive

    return 2 * true_positives - 1 * predicted_positives

In [None]:
# Report the profit of each model at its own F-score-optimal threshold.
print(f'Random Forest Classifier: {count_profit(rfc_preds)} USD.')
print(f'XGBoost: {count_profit(xgb_preds)} USD.')
print(f'Logistic Regression: {count_profit(lr_preds)} USD.')

Random Forest Classifier: 127 USD.
XGBoost: 78 USD.
Logistic Regression: -48 USD.


В прибыли выводы такие же: логистическая регрессия будет тратить средства и нервы на звонки клиентам, а Случайный Лес усердно зарабатывает баксы.

### GridSearch

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for the pipeline step named 'classifier'
# (the `classifier__` prefix routes each parameter to that step).
params = dict(
    classifier__max_features=[0.3, 0.5, 0.7],
    classifier__min_samples_leaf=[1, 2, 3],
    classifier__max_depth=[5, 6, 7],
)

In [None]:
# Exhaustive 6-fold search over `params`. refit=False: only the best
# parameter combination is reported; no final model is refitted.
grid = GridSearchCV(estimator=rfc_pipeline, param_grid=params, cv=6, refit=False)

# fit() returns the search object itself, so `search` and `grid` are the same object.
grid.fit(X_train, y_train)
search = grid
search.best_params_

{'classifier__max_depth': 7,
 'classifier__max_features': 0.7,
 'classifier__min_samples_leaf': 2}

In [None]:
from sklearn.base import clone

# Build the tuned model from the search result instead of hand-copying the
# printed values: the parameters can no longer drift from what the grid
# search found. clone() yields an unfitted copy of the whole pipeline
# (feature union included), so `rfc_pipeline` itself is left untouched and
# the base classifier settings (random_state=197) are carried over.
rfc_upd_pipeline = clone(rfc_pipeline).set_params(**search.best_params_)
rfc_upd_pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('IsActiveMember',
                                                 Pipeline(steps=[('selector',
   

In [None]:
# Positive-class probabilities of the tuned random forest on the test split.
rfc_upd_preds = rfc_upd_pipeline.predict_proba(X_test)[:, 1]
# Prints [threshold, fscore, precision, recall] at the best-F-score threshold.
print(make_metrics(rfc_upd_preds))

# Profit of the tuned model at that same threshold.
print(f'Random Forest Classifier updated: {count_profit(rfc_upd_preds)} USD.')

[0.2670323641571962, 0.614391143911439, 0.5672913117546848, 0.670020120724346]
Random Forest Classifier updated: 78 USD.


Ммм. Понятно: GridSearch по умолчанию оптимизирует accuracy (то есть ищет минимальную ошибку), а мне нужно максимизировать прибыль (или precision).

In [None]:
def max_profit(preds):
    """Best achievable profit over a grid of 100 thresholds in [0, 1].

    Uses the same profit model as the rest of the notebook: each true
    positive earns 2, every predicted positive costs 1. Labels come from the
    module-level ``y_test``.

    NOTE(review): the threshold is selected on the same test labels the
    profit is reported on, so the figure is an optimistic estimate.

    Parameters
    ----------
    preds : array of positive-class scores aligned with ``y_test``.

    Returns
    -------
    The largest profit found on the threshold grid.
    """
    profits = []

    for threshold in np.linspace(0, 1, 100):
        cnf_matrix = confusion_matrix(y_test, preds > threshold)
        # Column 1 of the matrix holds every predicted positive (TP + FP).
        profits.append(2 * cnf_matrix[1][1] - 1 * np.sum(cnf_matrix[:, 1]))

    # The maximum is taken once after the sweep; it was previously recomputed
    # with np.argmax on every loop iteration.
    return max(profits)


print(f'At its best RandomForestClassifier makes {max_profit(rfc_upd_preds)} USD.')

At its best RandomForestClassifier makes 163 USD.


Вывод: меняя порог для классификации, можно добиться максимальной прибыли (а в бизнес-моделях это важнее f-score)