# Feature Engineering for Churn Detection

## Imports and Settings

In [1]:
# Helper libraries
import warnings
from itertools import combinations

# Scientific libraries
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import recall_score

# Local Imports
from churn_detection.utils import get_feature_names
from churn_detection.data import load_data
from churn_detection.visualization import plot_confusion_table, plot_pie
from churn_detection.preprocessing import preprocess_data, split_data
from churn_detection.evaluation import display_clf_report, validate_model_with_cv
from churn_detection.features import ColumnPreprocessor, Transformation, engineer_features, create_pipe


# Reload edited project modules automatically before each cell executes,
# so changes to the churn_detection package are picked up without a restart.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inside the notebook, using the high-resolution
# 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Notebook-wide settings

# Silence every warning so cell outputs stay uncluttered.
# NOTE: this also hides deprecation and convergence warnings.
warnings.filterwarnings("ignore")

# Plot styling: seaborn's default theme.
sns.set_theme()

# Floating-point output precision for numpy arrays and pandas objects.
np.set_printoptions(precision=4)
pd.options.display.precision = 4

# Table display: up to 120 rows, every column, cell text capped at 40 characters.
pd.options.display.max_rows = 120
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 40

## Loading and Preparing Data

In [2]:
# Load the churn data through the project's `load_data` helper.
churn = load_data()

In [3]:
# Run the project's preprocessing step on the loaded data.
preprocessed_data = preprocess_data(churn)
# The helper returns two collections of column names, unpacked here as the
# numeric and the categorical variables.
numeric_variables, categorical_variables = get_feature_names(preprocessed_data)

## Feature Engineering Strategies

### Interaction Features

**Category Concatenation**:<br>
Let's concatenate two categorical features into a single categorical feature whose values represent every unique combination of the two originals.<br>
This strategy can pick up coincidental (spurious) patterns, but it is worth testing because it is easy to implement.

In [4]:
from churn_detection.features import FeatureConcatenator

In [5]:
# Example with multiplelines and paymentmethod:
# a single pair of categorical columns handed to the concatenator.
feature_pairs = [("multiplelines", "paymentmethod")]
concat_transformer = FeatureConcatenator(feature_pairs=feature_pairs)

In [6]:
# Apply the concatenation; the result gets its own name so that
# `preprocessed_data` is not rebound.
new_preprocessed_data = concat_transformer.transform(preprocessed_data)

Concatenation increases the total number of categories and produces many rare ones, so it makes sense to handle rare categories inside the feature transformation pipeline.

In [7]:
from churn_detection.features import RareCategoryEncoder

In [8]:
# Every unordered pair of categorical variables (n * (n - 1) / 2 pairs).
discrete_vars_pairs = list(combinations(categorical_variables, 2))
# NOTE: rebinds `concat_transformer` and `new_preprocessed_data` from the
# single-pair example above; from here on they refer to the all-pairs version.
concat_transformer = FeatureConcatenator(feature_pairs=discrete_vars_pairs)
new_preprocessed_data = concat_transformer.transform(preprocessed_data)

In [9]:
# Column groups, one per preprocessing strategy.

# Numeric column that gets the robust (quantile-based) scaler.
skewed_set = ["totalcharges"]

# Every other numeric column. Selecting by name instead of slicing off the
# last element keeps the groups correct whatever position "totalcharges"
# holds in `numeric_variables`, and prevents a column from landing in both
# groups (or being silently dropped) if the column order ever changes.
multimodal_set = [var for var in numeric_variables if var not in skewed_set]

# Original categorical columns plus the concatenated pair features.
discrete_set = categorical_variables + concat_transformer.get_new_feature_names()

In [10]:
# Each spec is a set of keyword arguments describing one feature group:
# a label, the columns it covers, and the ordered preprocessing steps.
MULTIMODAL_FEATURES = dict(
    name="multimod",
    variables=multimodal_set,
    steps=[("stdscaler", StandardScaler())],
)
SKEWED_FEATURES = dict(
    name="skewed",
    variables=skewed_set,
    # Scale by the 40th-60th percentile range rather than the default 25th-75th.
    steps=[("robustscaler", RobustScaler(quantile_range=(40, 60)))],
)
DISCRETE_FEATURES = dict(
    name="discrete",
    variables=discrete_set,
    steps=[
        # The project's rare-category encoder runs before one-hot encoding.
        ("rarecat", RareCategoryEncoder(tol=0.2)),
        (
            "catencoder",
            OneHotEncoder(handle_unknown="ignore", drop="first", sparse_output=False),
        ),
    ],
)

# Assemble the column preprocessor by registering one Transformation per
# feature group: multimodal numerics, skewed numerics, then discrete columns.
pipeline = ColumnPreprocessor()
pipeline.add_transformation(Transformation(**MULTIMODAL_FEATURES))
pipeline.add_transformation(Transformation(**SKEWED_FEATURES))
pipeline.add_transformation(Transformation(**DISCRETE_FEATURES))

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
prep_train, prep_test = train_test_split(
    new_preprocessed_data, random_state=1, test_size=0.2
)

In [11]:
# Split the training partition into its two parts
# (presumably feature matrix and target - confirm against `split_data`).
X_train, y_train = split_data(prep_train)

# Preprocess the columns, then pass them through a univariate selector.
# With k="all" the selector keeps every feature: it scores them with the
# ANOVA F-test but drops none.
reduce_prep_pipe = Pipeline(
    [
        ("processor", pipeline.create_preprocessor()),
        ("reducer", SelectKBest(score_func=f_classif, k="all")),
    ]
)

In [12]:
# Resulting feature space: fit the pipeline on the training data and show
# the shape of the transformed array (rows, engineered features).
reduce_prep_pipe.fit_transform(X_train, y_train).shape

(5634, 256)