# Feature Engineering for Telco Customer Churn

This notebook demonstrates and documents feature engineering steps for the Telco Customer Churn dataset, including encoding, transformations, and new feature creation.

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
from src.preprocess.dataloader import load_telco_data

In [None]:
# Load Data
# NOTE(review): the loader's result is unpacked as (train, test, val) — confirm
# this matches the return order of load_telco_data.
train_df, test_df, val_df = load_telco_data()

# The validation split may be None, so its shape is resolved separately.
val_shape = 'N/A' if val_df is None else val_df.shape
print(f"Train shape: {train_df.shape}")
print(f"Validation shape: {val_shape}")
print(f"Test shape: {test_df.shape}")

## Feature Engineering Steps

- Drop the 'customerID' column (it is removed rather than encoded)
- Convert 'TotalCharges' to numeric, coercing values that cannot be parsed to NaN
- Encode categorical variables (label encoding for binary, one-hot for multi-class)
- Create new features (e.g., tenure groups)
- Document all changes and rationale

In [None]:
# Drop or encode 'customerID' and convert 'TotalCharges' to numeric
def preprocess_basic(df):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Steps applied to the copy:

    * the ``customerID`` column is removed when it is present;
    * ``TotalCharges`` (when present) is parsed with
      ``pd.to_numeric(errors='coerce')``, so entries that cannot be parsed
      become NaN instead of raising. Those NaNs are not filled here.
    """
    cleaned = df.copy().drop(columns='customerID', errors='ignore')
    if 'TotalCharges' in cleaned.columns:
        cleaned['TotalCharges'] = pd.to_numeric(cleaned['TotalCharges'], errors='coerce')
    return cleaned

# Clean every available split; the validation split is optional and stays
# None when the loader did not provide one.
train_df = preprocess_basic(train_df)
if val_df is not None:
    val_df = preprocess_basic(val_df)
test_df = preprocess_basic(test_df)

print('customerID dropped and TotalCharges converted to numeric.')

In [None]:
# Encode categorical variables
def _learn_category_levels(df):
    """Map each object/category column of ``df`` to the levels observed in it.

    The levels are the distinct non-null values present in the column, in the
    order pandas assigns when the column is cast to ``category``.
    """
    levels = {}
    for col in df.select_dtypes(include=['object', 'category']).columns:
        observed = df[col].astype('category').cat.remove_unused_categories()
        levels[col] = list(observed.cat.categories)
    return levels


def encode_categoricals(df, categories=None):
    """Encode categorical columns of ``df`` and return the result as a new frame.

    Two-level columns are replaced in place by integer category codes; every
    other categorical column is replaced by one-hot columns (prefixed with the
    column name) appended at the end of the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified.
    categories : dict or None, optional
        Mapping ``column -> list of levels`` (for example the output of
        ``_learn_category_levels`` on the training split). When given, only
        the listed columns that exist in ``df`` are encoded, and always
        against these fixed levels, so every split gets the same integer
        codes and the same one-hot columns. Values outside the listed levels
        are treated as missing: code ``-1`` for two-level columns, all-zero
        dummies otherwise. When ``None`` (default), levels are inferred from
        ``df`` itself, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        The encoded copy of ``df``.
    """
    df = df.copy()
    if categories is None:
        target_cols = list(df.select_dtypes(include=['object', 'category']).columns)
    else:
        target_cols = [col for col in categories if col in df.columns]

    for col in target_cols:
        if categories is None:
            values = df[col].astype('category')
            is_binary = df[col].nunique() == 2
        else:
            # Casting to a fixed CategoricalDtype pins both the code assigned
            # to each level and the set of dummy columns, independent of
            # which values happen to occur in this particular split.
            values = df[col].astype(pd.CategoricalDtype(categories[col]))
            is_binary = len(categories[col]) == 2

        if is_binary:
            # Binary: label encoding
            df[col] = values.cat.codes
        else:
            # Multi-class: one-hot encoding
            dummies = pd.get_dummies(values, prefix=col)
            df = pd.concat([df.drop(col, axis=1), dummies], axis=1)
    return df

# Learn the levels once, from the training split, BEFORE it is encoded, and
# reuse them for every split. Encoding each split on its own would let the
# binary-vs-one-hot decision, the integer codes and the dummy columns vary
# with whichever values happen to appear in that split.
category_levels = _learn_category_levels(train_df)

train_df = encode_categoricals(train_df, category_levels)
val_df = encode_categoricals(val_df, category_levels) if val_df is not None else None
test_df = encode_categoricals(test_df, category_levels)

print('Categorical variables encoded.')

In [None]:
# Create new features (e.g., tenure groups)
def add_features(df):
    """Return a copy of ``df`` with one-hot tenure-group columns appended.

    When ``df`` has a ``tenure`` column, it is bucketed into the groups
    '0-12', '13-24', '25-48', '49-60' and '61+' and one indicator column per
    group (``tenure_group_<label>``) is appended. All five columns are always
    produced, even for groups with no rows. Frames without a ``tenure`` column
    are returned as an unchanged copy. The input frame is never modified.
    """
    df = df.copy()
    if 'tenure' in df.columns:
        tenure_group = pd.cut(
            df['tenure'],
            bins=[0, 12, 24, 48, 60, np.inf],
            labels=['0-12', '13-24', '25-48', '49-60', '61+'],
            # Bins are right-closed, so without include_lowest the first bin
            # is (0, 12] and tenure == 0 would fall outside every group,
            # yielding all-zero indicators for those rows.
            include_lowest=True,
        )
        df = pd.concat([df, pd.get_dummies(tenure_group, prefix='tenure_group')], axis=1)
    return df

# Add the engineered features to every available split; the validation split
# is optional and stays None when absent.
train_df = add_features(train_df)
if val_df is not None:
    val_df = add_features(val_df)
test_df = add_features(test_df)

print('New features created.')

In [None]:
# Display processed train set info
print('Processed train_df info:')
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() emitted an extra "None" line.
train_df.info()
display(train_df.head())