# HW_Lesson_5

**1). Для нашего пайплайна (Case1) поэкспериментировать с разными моделями:**
- бустинг
- логистическая регрессия

In [126]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
import lightgbm
from xgboost import XGBRFClassifier
from sklearn.metrics import f1_score, precision_recall_curve, roc_auc_score, classification_report, confusion_matrix

import matplotlib.pyplot as plt

%matplotlib inline

In [127]:
df = pd.read_csv('Churn_Modelling.csv')

In [128]:
df.head(2)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0


In [129]:
df['Exited'].value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [130]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='Exited'),
                                                   df['Exited'], test_size=0.25,
                                                   random_state=15, stratify=df['Exited'])

In [131]:
# собираем pipeline
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

In [132]:
df.sample(2)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
893,894,15772781,Ball,703,France,Female,51,3,0.0,3,1,1,77294.56,1
5013,5014,15626795,Gorman,672,France,Female,40,3,0.0,1,1,0,113171.61,1


In [133]:
# Разделение признаков
cat_columns = ['Geography', 'Gender', 'Tenure', 'HasCrCard', 'IsActiveMember']
continuous_columns = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary']

In [134]:
final_transformers = list()

for cat_col in cat_columns:
    cat_transformer = Pipeline([
        ('selector', FeatureSelector(column=cat_col)),
        ('ohe', OHEEncoder(key=cat_col))
    ])
    
    final_transformers.append((cat_col, cat_transformer))

for cont_col in continuous_columns:
    cont_transformer = Pipeline([
        ('selector', NumberSelector(key=cont_col))
                                ])
    final_transformers.append((cont_col, cont_transformer))

In [135]:
feats = FeatureUnion(final_transformers)
feature_processing = Pipeline([('feats', feats)])

In [136]:
# функция нахождения оптимального порога для модели

def pipeline_classifier(model):
    pipeline = Pipeline([
        ('features', feats),
        ('classifier', model)
    ])
    
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict_proba(X_test)[:, 1]
    
    precision, recall, thresholds = precision_recall_curve(y_test, preds)

    fscore = (2 * precision * recall) / (precision + recall)
    ix = np.nanargmax(fscore)
    print(f'Best threshold={thresholds[ix]}, F-score = {fscore[ix]:.3f}, \
    Precision = {precision[ix]:.3f}, Recall = {recall[ix]:.3f}')
    
    return preds, thresholds[ix]

*RandomForestClassifier*

In [137]:
pipeline_classifier(RandomForestClassifier(random_state=14))

Best threshold=0.4, F-score = 0.631,     Precision = 0.705, Recall = 0.572


(array([0.22, 0.04, 0.01, ..., 0.41, 0.01, 0.  ]), 0.4)

*LogisticRegression*

In [138]:
pipeline_classifier(LogisticRegression(random_state=14))

Best threshold=0.2275352162900428, F-score = 0.406,     Precision = 0.319, Recall = 0.558


(array([0.20831243, 0.1212888 , 0.0730186 , ..., 0.16394312, 0.12985877,
        0.06791397]),
 0.2275352162900428)

*Lightgbm*

In [139]:
pipeline_classifier(lightgbm.LGBMClassifier(random_state=14))

Best threshold=0.3300176801787979, F-score = 0.622,     Precision = 0.635, Recall = 0.609


(array([0.05909078, 0.04439657, 0.00399244, ..., 0.4094348 , 0.02245124,
        0.02494585]),
 0.3300176801787979)

*XGBoost*

In [140]:
pipeline_classifier(XGBRFClassifier(random_state=14))

Best threshold=0.34546005725860596, F-score = 0.623,     Precision = 0.678, Recall = 0.576


(array([0.15745042, 0.14073116, 0.12587737, ..., 0.47954518, 0.12617415,
        0.12587737], dtype=float32),
 0.34546006)

*Catboost*

In [141]:
pipeline_classifier(CatBoostClassifier(silent=True, random_state=14))

Best threshold=0.4438907438739124, F-score = 0.627,     Precision = 0.735, Recall = 0.546


(array([0.04141454, 0.05125524, 0.00458215, ..., 0.41390666, 0.01326525,
        0.01251828]),
 0.4438907438739124)

**2). Отобрать лучшую модель по метрикам (какая по вашему мнению здесь наиболее подходящая ML-метрика)**

Наиболее подходящей метрикой здесь будет являться Precision(точность), так как важнее не нести затрат на людей, которые не собираются уходить.

**3). Для отобранной модели (на отложенной выборке) сделать оценку экономической эффективности при тех же вводных, как в вопросе 2:**
   - 1 доллар на удержание
   - 2 доллара - с каждого правильно классифицированного (True Positive)

In [142]:
preds, threshold = pipeline_classifier(CatBoostClassifier(silent=True, random_state=14))

Best threshold=0.4438907438739124, F-score = 0.627,     Precision = 0.735, Recall = 0.546


In [143]:
cnf_matrix = confusion_matrix(y_test, preds > threshold)

In [144]:
TN = cnf_matrix[0, 0]
FN = cnf_matrix[1, 0]
FP = cnf_matrix[0, 1]
TP = cnf_matrix[1, 1]

In [145]:
summ = TP * 2 - (TP + FP) * 1
print(f'Дополнительный доход: {summ}')

Дополнительный доход: 177


In [146]:
def get_rubles(y_test, probs):
    
    add_summ = []
    thresholds = np.linspace(0, 1, 100)
    for i in thresholds:
        cnf_matrix = confusion_matrix(y_test, probs > i)
        add_summ.append(cnf_matrix[1,1] - cnf_matrix[0,1])
        
    # locate the index of the largest f score
    ix = np.argmax(add_summ)
    print('Best Threshold=%.3f, summ=%.3f'%(thresholds[ix], add_summ[ix]))
    return thresholds[ix]

In [147]:
get_rubles(y_test, preds)

Best Threshold=0.495, summ=180.000


0.494949494949495

*Доп.доход = 180 $*

**4). *Провести подбор гиперпараметров лучшей модели по итогам 2-3**

Поменяем в функции пайплайна максимизацию f-score на precision

In [148]:
# from sklearn.model_selection import GridSearchCV

In [149]:
# params = {
#     'classifier__iterations': [100, 300, 600, 1000],
#     'classifier__learning_rate': [1, 0.1, 0.01, 0.001],
#     'classifier__l2_leaf_reg': [10, 1, 0.1, 0.01, 0.001],
#     'classifier__auto_class_weights': ['None', 'Balanced', 'SqrtBalanced']
# }

In [150]:
# pipeline = Pipeline([
#         ('features', feats),
#         ('classifier', CatBoostClassifier(verbose=True, random_state=14,
#                                           early_stopping_rounds=100, task_type='GPU'))
#     ])

In [151]:
# grid = GridSearchCV(pipeline, param_grid=params, cv=6, refit=False)

In [152]:
# %%time
# search = grid.fit(X_train, y_train)

In [153]:
# search.best_params_

In [154]:
# search.best_score_

In [155]:
preds, threshold = pipeline_classifier(CatBoostClassifier(silent=True, random_state=14,
                                       early_stopping_rounds=100, iterations=1000,
                                       l2_leaf_reg=10, learning_rate=0.01))

Best threshold=0.35065000773392263, F-score = 0.641,     Precision = 0.684, Recall = 0.603


In [156]:
cnf_matrix = confusion_matrix(y_test, preds > threshold)
TN = cnf_matrix[0, 0]
FN = cnf_matrix[1, 0]
FP = cnf_matrix[0, 1]
TP = cnf_matrix[1, 1]
summ = TP * 2 - (TP + FP) * 1
print(f'Дополнительный доход: {summ}')

Дополнительный доход: 164


**5). *Еще раз провести оценку экономической эффективности**

In [158]:
get_rubles(y_test, preds)

Best Threshold=0.505, summ=186.000


0.5050505050505051

При применении данной модели с выбором отсечки при самом выгодном доходе получается 186 $ дополнительного дохода.