# Telco Churn — Feature Engineering & Preprocessing

Goal:
Build a reproducible preprocessing pipeline (cleaning, feature creation, encoding, scaling) ready for modeling and MLOps deployment.


In [1]:
# Import libraries

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer


In [5]:
# Show the kernel's working directory: the data path built in the next cell is
# derived from Path.cwd(), so it depends on where the kernel was started.
# NOTE(review): the stored output of this cell exposes an absolute local path;
# consider clearing it (or removing this debug cell) before sharing.
import os
os.getcwd()


'C:\\Users\\Anna\\PycharmProjects\\churn-mlops-telco\\notebooks'

In [6]:
## Load data
# The raw CSV is read from <project root>/data/raw, where the project root is
# taken to be the parent of the kernel's working directory.

from pathlib import Path

BASE_DIR = Path.cwd().parents[0]   # -> churn-mlops-telco
DATA_PATH = BASE_DIR.joinpath("data", "raw", "WA_Fn-UseC_-Telco-Customer-Churn.csv")

df = pd.read_csv(DATA_PATH)

# Preview the first five rows.
df.head(5)



Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [7]:
## Sanity check
# Report the frame size, the balance of the raw target labels and the columns
# with the most missing values.

churn_raw = df["Churn"]
missing_per_column = df.isna().sum()

print("Shape:", df.shape)

print("\nTarget distribution (raw):")
print(churn_raw.value_counts())
print(churn_raw.value_counts(normalize=True).round(3))

print("\nMissing values (top 10):")
print(missing_per_column.sort_values(ascending=False).head(10))


Shape: (7043, 21)

Target distribution (raw):
Churn
No     5174
Yes    1869
Name: count, dtype: int64
Churn
No     0.735
Yes    0.265
Name: proportion, dtype: float64

Missing values (top 10):
customerID          0
DeviceProtection    0
TotalCharges        0
MonthlyCharges      0
PaymentMethod       0
PaperlessBilling    0
Contract            0
StreamingMovies     0
StreamingTV         0
TechSupport         0
dtype: int64


In [8]:
## Minimal cleaning
# Both steps return a new frame, so the frame loaded earlier is not modified
# in place.

# TotalCharges is parsed to numeric; entries that cannot be parsed (e.g.
# blanks) are coerced to NaN.
df = df.assign(TotalCharges=pd.to_numeric(df["TotalCharges"], errors="coerce"))

# customerID is an identifier, not a predictor; errors="ignore" makes the drop
# a no-op when the column is already absent.
df = df.drop(columns=["customerID"], errors="ignore")

# Summary statistics of the continuous columns.
continuous_cols = ["tenure", "MonthlyCharges", "TotalCharges"]
df[continuous_cols].describe()


Unnamed: 0,tenure,MonthlyCharges,TotalCharges
count,7043.0,7043.0,7032.0
mean,32.371149,64.761692,2283.300441
std,24.559481,30.090047,2266.771362
min,0.0,18.25,18.8
25%,9.0,35.5,401.45
50%,29.0,70.35,1397.475
75%,55.0,89.85,3794.7375
max,72.0,118.75,8684.8


In [9]:
# Target encoding
# Encode the churn label as 1 ("Yes") / 0 ("No").
#
# Series.map turns every value that is not a key of the mapping into NaN, so
# mapping an already-encoded column (e.g. when this cell is re-run) would wipe
# the target. The dtype guard makes the cell safe to re-run, and the explicit
# check fails loudly on unexpected labels instead of silently producing NaN.

churn_mapping = {"Yes": 1, "No": 0}

if not pd.api.types.is_numeric_dtype(df["Churn"]):
    churn_encoded = df["Churn"].map(churn_mapping)
    unmapped_mask = churn_encoded.isna()
    if unmapped_mask.any():
        bad_labels = df.loc[unmapped_mask, "Churn"].unique().tolist()
        raise ValueError(f"Unexpected Churn labels: {bad_labels}")
    df["Churn"] = churn_encoded.astype("int64")

# Class counts and overall churn rate of the encoded target.
df["Churn"].value_counts(), df["Churn"].mean().round(3)


(Churn
 0    5174
 1    1869
 Name: count, dtype: int64,
 0.265)

In [10]:
# Feature engineering
# Adds two derived columns:
#   customer_value       - MonthlyCharges * tenure (customer value proxy)
#   high_monthly_charges - 1 when MonthlyCharges is above the median, else 0
#
# NOTE(review): the median is computed on the whole frame, before the
# train/test split performed in a later cell, so test rows influence this
# threshold. Consider deriving it from the training split only.

monthly_charges = df["MonthlyCharges"]
median_mc = monthly_charges.median()

df = df.assign(
    customer_value=monthly_charges * df["tenure"],
    high_monthly_charges=(monthly_charges > median_mc).astype(int),
)

# Preview the inputs next to the engineered columns.
engineered_preview_cols = ["tenure", "MonthlyCharges", "customer_value", "high_monthly_charges"]
df[engineered_preview_cols].head()


Unnamed: 0,tenure,MonthlyCharges,customer_value,high_monthly_charges
0,1,29.85,29.85,0
1,34,56.95,1936.3,0
2,2,53.85,107.7,0
3,45,42.3,1903.5,0
4,2,70.7,141.4,1


In [12]:
# Train/test split
# Stratified 80/20 hold-out with a fixed seed; only a train and a test split
# are produced here (no separate validation split).

from sklearn.model_selection import train_test_split

y = df["Churn"]
X = df.drop(columns=["Churn"])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Split sizes.
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

# Stratification should keep the churn rate the same in both splits.
train_churn_rate = round(y_train.mean(), 3)
test_churn_rate = round(y_test.mean(), 3)
print("Churn rate train:", train_churn_rate)
print("Churn rate test :", test_churn_rate)


Train shape: (5634, 21)
Test shape: (1409, 21)
Churn rate train: 0.265
Churn rate test : 0.265


In [16]:
# Numeric and categorical feature lists
# Columns are assigned to a group by dtype, using the training split.

numeric_frame = X_train.select_dtypes(include=["number"])
categorical_frame = X_train.select_dtypes(include=["object", "category", "bool"])

num_features = numeric_frame.columns.tolist()
cat_features = categorical_frame.columns.tolist()

print("Numeric features:", num_features)
print("\nCategorical features (count =", len(cat_features), "):")
print(cat_features)



Numeric features: ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'customer_value', 'high_monthly_charges']

Categorical features (count = 15 ):
['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']


In [17]:
# Preprocessing pipeline
# Numeric columns: median imputation, then standardization.
# Categorical columns: most-frequent imputation, then dense one-hot encoding;
# handle_unknown="ignore" means categories not seen during fit do not raise.

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]

numeric_transformer = Pipeline(steps=numeric_steps)
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_features),
        ("cat", categorical_transformer, cat_features),
    ]
)


In [18]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformer to the held-out test split.

X_train_pre = preprocessor.fit_transform(X_train)
X_test_pre = preprocessor.transform(X_test)

# Both matrices must end up with the same number of columns.
train_pre_shape = X_train_pre.shape
test_pre_shape = X_test_pre.shape
print("X_train_pre shape:", train_pre_shape)
print("X_test_pre shape :", test_pre_shape)


X_train_pre shape: (5634, 47)
X_test_pre shape : (1409, 47)


In [20]:
# Recover the feature names after encoding
# The ColumnTransformer lists the "num" block before the "cat" block, so the
# numeric names come first, followed by the one-hot encoded column names.

# Fitted OneHotEncoder inside the categorical sub-pipeline.
cat_pipeline = preprocessor.named_transformers_["cat"]
ohe = cat_pipeline.named_steps["onehot"]
ohe_feature_names = ohe.get_feature_names_out(cat_features)

all_feature_names = np.concatenate((num_features, ohe_feature_names))

print("Total engineered features:", len(all_feature_names))
print("First 30 feature names:\n", all_feature_names[:30])


Total engineered features: 47
First 30 feature names:
 ['SeniorCitizen' 'tenure' 'MonthlyCharges' 'TotalCharges' 'customer_value'
 'high_monthly_charges' 'gender_Female' 'gender_Male' 'Partner_No'
 'Partner_Yes' 'Dependents_No' 'Dependents_Yes' 'PhoneService_No'
 'PhoneService_Yes' 'MultipleLines_No' 'MultipleLines_No phone service'
 'MultipleLines_Yes' 'InternetService_DSL' 'InternetService_Fiber optic'
 'InternetService_No' 'OnlineSecurity_No'
 'OnlineSecurity_No internet service' 'OnlineSecurity_Yes'
 'OnlineBackup_No' 'OnlineBackup_No internet service' 'OnlineBackup_Yes'
 'DeviceProtection_No' 'DeviceProtection_No internet service'
 'DeviceProtection_Yes' 'TechSupport_No']


In [21]:
# Final check
# After imputation neither transformed matrix should contain NaN.

train_has_nan = np.isnan(X_train_pre).any()
test_has_nan = np.isnan(X_test_pre).any()

print("Any NaN in X_train_pre?", train_has_nan)
print("Any NaN in X_test_pre?", test_has_nan)


Any NaN in X_train_pre? False
Any NaN in X_test_pre? False


In [22]:
# Save preprocessor for MLOps deployment
# The directory that gets created and the file that gets written are derived
# from a single path under the project root (BASE_DIR). Previously the cell
# created ../models but wrote to ../../models, a different directory.

from pathlib import Path
import joblib

MODELS_DIR = BASE_DIR / "models"
PREPROCESSOR_PATH = MODELS_DIR / "preprocessor.joblib"

MODELS_DIR.mkdir(parents=True, exist_ok=True)
joblib.dump(preprocessor, PREPROCESSOR_PATH)

# Report the location relative to the project root so the message always
# matches where the file was actually written.
print("✅ Saved:", PREPROCESSOR_PATH.relative_to(BASE_DIR).as_posix())


✅ Saved: models/preprocessor.joblib


In [23]:
# Save split data for modeling notebook
# The output directory is anchored at the project root (BASE_DIR), next to
# data/raw. A bare "data/processed" would be resolved against the kernel's
# working directory and land inside the notebooks folder instead.

PROCESSED_DIR = BASE_DIR / "data" / "processed"
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# Re-attach the target to each split; .values assigns positionally.
train_df = X_train.assign(Churn=y_train.values)
test_df = X_test.assign(Churn=y_test.values)

train_df.to_csv(PROCESSED_DIR / "telco_train.csv", index=False)
test_df.to_csv(PROCESSED_DIR / "telco_test.csv", index=False)

print("✅ Saved: data/processed/telco_train.csv and telco_test.csv")


✅ Saved: data/processed/telco_train.csv and telco_test.csv
