# Minimal Pre-Processing

## Imports and Useful Paths

In [1]:
# Helper libraries
import warnings

# Scientific and visual libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the autoreload extension; mode 2 re-imports modules before each cell
# runs, so edits to project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Draw matplotlib figures inline and request the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Notebook-wide settings.
# NOTE: every warning is silenced from here on, for the whole kernel session.
warnings.filterwarnings("ignore")
np.set_printoptions(precision=4)
sns.set_theme()

# Pandas display options, passed as (name, value) pairs in a single call.
pd.set_option(
    "display.max_rows", 120,
    "display.max_colwidth", 40,
    "display.precision", 4,
    "display.max_columns", None,
)

## Loading Datasets

In [2]:
from churn_detection.paths import TRANSFORMED_DATA_DIR

In [3]:
# Read the training and validation splits from the transformed-data directory.
train_df, val_df = (
    pd.read_feather(TRANSFORMED_DATA_DIR / filename)
    for filename in ("train.feather", "validation.feather")
)

In [4]:
# Continuous features, listed explicitly.
numeric_variables = ["tenure", "monthlycharges", "totalcharges"]

# Every remaining column except the "churn" target is treated as categorical;
# the original column order of train_df is preserved.
categorical_variables = [
    column
    for column in train_df.columns
    if column not in (*numeric_variables, "churn")
]

## Experiment on Categorical Features

This experiment explores the minimal data preprocessing needed to reach a promising first result: a working classifier. What data preparation is strictly necessary here? Essentially, only the encoding of categorical variables; it is the one vital step to take first. Transforming the numeric features at this stage would amount to tuning the model too early.

### Label Encoding

We will use the simplest category encoding technique: label encoding, which replaces each category with an integer code.

In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
# Fit a label encoder on the training "gender" column (fit returns the
# encoder itself), then display the integer codes it assigns to that column.
encoder = LabelEncoder().fit(train_df["gender"])
encoder.transform(train_df["gender"])

array([1, 0, 1, ..., 1, 0, 0])

In [7]:
# Categories learned during fit; a category's position in this array is the
# integer code that transform() assigns to it.
encoder.classes_

array(['female', 'male'], dtype=object)

### Feature Preparation Pipeline

In [15]:
from churn_detection import features

Let's build custom transformers for the simplest baseline classifier:

In [9]:
# One transformation per feature group, declared in the order they are applied:
# first the numeric columns, then the categorical ones.
transformers = [
    features.add_transformation(
        name="continuous",
        variables=numeric_variables,
        steps=[("dummy_num", features.DummyNumericTransformer())],
    ),
    features.add_transformation(
        name="discrete",
        variables=categorical_variables,
        steps=[("dummy_cat", features.SimpleCategoryEncoder())],
    ),
]

In [10]:
# Combine the per-group transformations into a single preprocessor using the
# project helper (presumably a column-wise transformer; confirm in
# churn_detection.features).
preprocessor = features.create_column_preprocessor(transformers)

### Integration of Final Classifier

In [14]:
from sklearn.linear_model import LogisticRegression

In [16]:
# Separate the predictors from the "churn" target in the training split.
X = train_df.drop(columns="churn")
y = train_df["churn"]

# Combine the preprocessor with a default logistic regression through the
# project helper, then fit on the training data (the fit result is displayed).
model = features.create_pipe(preprocessor, LogisticRegression())
model.fit(X, y)