# Preprocessing steps - Customer Churn

In [None]:
!pip install -r requirements.txt --quiet

In [None]:
import pandas as pd
# Load the synthetic customer-churn dataset from the project's data folder.
cust_df = pd.read_csv(filepath_or_buffer="data/synth_customer_churn.csv")

In [None]:
# Encode the churn-risk label as an integer target. Each label's code is its
# position in the tuple below (Low Risk -> 0, Medium Risk -> 1, High Risk -> 2).
y = cust_df['ChurnCategory'].map(
    {
        risk_label: risk_code
        for risk_code, risk_label in enumerate(
            ('Low Risk', 'Medium Risk', 'High Risk')
        )
    }
)


In [None]:
# Feature matrix: every column except the target and the customer identifier.
X = cust_df.drop(['ChurnCategory', 'CustomerID'], axis='columns')

## Create numerical pipeline options

In [None]:
# Numeric columns grouped by the imputer/scaler pairing spelled out in each
# variable name; each list is routed to the pipeline of the same name.
minmax_median_cols = ["Age"]
standard_mean_cols = ["MonthlyCharges"]
standard_median_cols = ["CustomerSupportCalls"]
robust_median_cols = ["ServiceUsage"]


In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, OneHotEncoder


### Numerical pipelines

In [None]:
# Fill missing values with the median, then apply min-max scaling.
minmax_median_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", MinMaxScaler()),
    ]
)

In [None]:
# Fill missing values with the mean, then standardise.
standard_mean_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

In [None]:
# Fill missing values with the median, then standardise.
standard_median_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

In [None]:
# Fill missing values with the median, then apply RobustScaler.
robust_median_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", RobustScaler()),
    ]
)

### Categorical pipeline

In [None]:
# Columns handled by the categorical (impute + one-hot) pipeline.
categorical_cols = ["Gender", "ContractType", "PaymentMethod"]

In [None]:
# Fill missing categories with the most frequent value, then one-hot encode.
# NOTE(review): with drop='first' and handle_unknown='ignore' together, a
# category unseen during fit is encoded as all zeros -- the same encoding as
# the dropped first category -- on scikit-learn versions that accept this
# combination. Confirm this is intended for the installed version.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(drop="first", handle_unknown="ignore")),
    ]
)

### Custom Transformer for Tenure with `TransformerMixin`

In [None]:
import matplotlib.pyplot as plt
# Plot the Tenure distribution to inform the bin edges used further down.
# Series.hist returns the Axes it drew on, so the labels are set on it directly.
tenure_ax = cust_df['Tenure'].hist(bins=30, edgecolor='black', color='grey')
tenure_ax.set_title("Distribution of Tenure")
tenure_ax.set_xlabel("Tenure (months)")
tenure_ax.set_ylabel("Frequency")
tenure_ax.grid(False)
plt.show()


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
class TenureBinner(BaseEstimator, TransformerMixin):
    """Bin a numeric column into labelled intervals.

    Missing values are filled with the column median learned in ``fit``, so
    every batch passed to ``transform`` is imputed with the same value.

    Parameters
    ----------
    column : str, default='Tenure'
        Name of the column to bin.
    bins : list of numbers, optional
        Bin edges passed to ``pandas.cut``. Defaults to
        ``[0, 6, 15, 22, inf]``.
    labels : list of str, optional
        One label per interval. Defaults to
        ``['New', 'Developing', 'Established', 'Loyal']``.

    Attributes
    ----------
    median_ : float
        Median of ``column`` observed during ``fit``.
    """

    def __init__(self, column='Tenure', bins=None, labels=None):
        self.column = column
        self.bins = bins if bins is not None else [0, 6,
                                                   15, 22,
                                                   float('inf')]

        self.labels = labels if labels is not None else ['New',
                                                         'Developing',
                                                         'Established',
                                                         'Loyal']

    def _select_column(self, X):
        """Return ``self.column`` from X as a Series, or raise ValueError."""
        frame = pd.DataFrame(X)
        if self.column not in frame.columns:
            raise ValueError(f"Column '{self.column}' not found in input DataFrame.")
        return frame[self.column]

    def fit(self, X, y=None):
        """Learn the median of the target column for later imputation.

        Raises
        ------
        ValueError
            If ``self.column`` is not a column of X.
        """
        self.median_ = self._select_column(X).median()
        return self

    def transform(self, X):
        """Return a one-column DataFrame named ``<column>_bin``.

        Missing values are filled with the median learned in ``fit``. If the
        transformer has not been fitted, or the learned median is itself
        missing, the median of the current batch is used instead (the
        previous behaviour).

        Raises
        ------
        ValueError
            If ``self.column`` is not a column of X.
        """
        values = self._select_column(X)

        if values.isnull().any():
            fill_value = getattr(self, 'median_', None)
            if fill_value is None or pd.isna(fill_value):
                # Unfitted instance or all-missing training column.
                fill_value = values.median()
            # fillna returns a new Series, so the caller's data is not mutated.
            values = values.fillna(fill_value)

        binned = pd.cut(values, bins=self.bins,
                        labels=self.labels,
                        include_lowest=True)
        return pd.DataFrame({f"{self.column}_bin": binned})

    def get_feature_names_out(self, input_features=None):
        """Return the name of the single output column."""
        return [f"{self.column}_bin"]


In [None]:
%%writefile pipeline/binner.py
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
class TenureBinner(BaseEstimator, TransformerMixin):
    """Bin a numeric column into labelled intervals.

    Missing values are filled with the column median learned in ``fit``, so
    every batch passed to ``transform`` is imputed with the same value.

    Parameters
    ----------
    column : str, default='Tenure'
        Name of the column to bin.
    bins : list of numbers, optional
        Bin edges passed to ``pandas.cut``. Defaults to
        ``[0, 6, 15, 22, inf]``.
    labels : list of str, optional
        One label per interval. Defaults to
        ``['New', 'Developing', 'Established', 'Loyal']``.

    Attributes
    ----------
    median_ : float
        Median of ``column`` observed during ``fit``.
    """

    def __init__(self, column='Tenure', bins=None, labels=None):
        self.column = column
        self.bins = bins if bins is not None else [0, 6, 15, 22, float('inf')]
        self.labels = labels if labels is not None else ['New', 'Developing', 'Established', 'Loyal']

    def _select_column(self, X):
        """Return ``self.column`` from X as a Series, or raise ValueError."""
        frame = pd.DataFrame(X)
        if self.column not in frame.columns:
            raise ValueError(f"Column '{self.column}' not found in input DataFrame.")
        return frame[self.column]

    def fit(self, X, y=None):
        """Learn the median of the target column for later imputation.

        Raises
        ------
        ValueError
            If ``self.column`` is not a column of X.
        """
        self.median_ = self._select_column(X).median()
        return self

    def transform(self, X):
        """Return a one-column DataFrame named ``<column>_bin``.

        Missing values are filled with the median learned in ``fit``. If the
        transformer has not been fitted, or the learned median is itself
        missing, the median of the current batch is used instead (the
        previous behaviour).

        Raises
        ------
        ValueError
            If ``self.column`` is not a column of X.
        """
        values = self._select_column(X)

        if values.isnull().any():
            fill_value = getattr(self, 'median_', None)
            if fill_value is None or pd.isna(fill_value):
                # Unfitted instance or all-missing training column.
                fill_value = values.median()
            # fillna returns a new Series, so the caller's data is not mutated.
            values = values.fillna(fill_value)

        binned = pd.cut(values, bins=self.bins, labels=self.labels, include_lowest=True)
        return pd.DataFrame({f"{self.column}_bin": binned})

    def get_feature_names_out(self, input_features=None):
        """Return the name of the single output column."""
        return [f"{self.column}_bin"]

In [None]:
# Bin Tenure into labelled ranges, then one-hot encode the resulting label.
# NOTE(review): TenureBinner here resolves to the class defined in this
# notebook, not the copy written to pipeline/binner.py. Pickle stores classes
# by module path, so confirm the saved preprocessor loads where it is consumed.
tenure_binner_pipeline = Pipeline(
    steps=[
        (
            "tenure_binner",
            TenureBinner(
                column="Tenure",
                bins=[0, 6, 15, 22, float("inf")],
                labels=["New", "Developing", "Established", "Loyal"],
            ),
        ),
        ("encoder", OneHotEncoder()),
    ]
)


## Combine structured and custom into one pipeline

In [None]:
# Route each column group to its matching pipeline. Each entry is
# (name, pipeline, columns); no `remainder` is set, so the ColumnTransformer
# default applies to columns not listed here.
structured_preprocessor = ColumnTransformer(
    [
        ("minmax_median", minmax_median_pipeline, minmax_median_cols),
        ("standard_mean", standard_mean_pipeline, standard_mean_cols),
        ("standard_median", standard_median_pipeline, standard_median_cols),
        ("robust_median", robust_median_pipeline, robust_median_cols),
        ("categorical", categorical_pipeline, categorical_cols),
    ]
)


In [None]:
# Bare expression: shows the transformer's representation as the cell output.
structured_preprocessor

In [None]:
from sklearn.pipeline import FeatureUnion
# Combine the column-wise preprocessor with the Tenure binning pipeline into
# a single FeatureUnion.
full_preprocessor = FeatureUnion(
    transformer_list=[
        ("structured", structured_preprocessor),
        ("tenure_bins", tenure_binner_pipeline),
    ]
)
# Bare expression: shows the combined preprocessor as the cell output.
full_preprocessor

## Save the preprocessor

In [None]:
import os 
# Output directory for preprocessing artifacts; created if it does not exist.
PREPROC_DIR = "models/preprocessing"
os.makedirs(name=PREPROC_DIR, exist_ok=True)

In [None]:
import joblib
# Serialise the preprocessor to models/preprocessing/preprocessor.joblib.
# NOTE(review): full_preprocessor is not fitted in any cell shown in this
# notebook, so the saved object is unfitted -- confirm that fitting happens
# after it is loaded.
joblib.dump(
    value=full_preprocessor,
    filename=os.path.join(PREPROC_DIR, "preprocessor.joblib"),
)