# Data Preprocessing

This notebook covers the data preprocessing steps for the Customer Churn Prediction project. We will load the split data, handle missing values, encode categorical variables, and scale numerical features.

In [None]:
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Show every column when a DataFrame is displayed (None removes the column cap)
pd.options.display.max_columns = None

## 1. Load Data

In [None]:
# Directory that holds the three split CSV files
data_path = '../data/raw/splits'

# Read the splits in a fixed order: train, validation, test
train_df, val_df, test_df = (
    pd.read_csv(os.path.join(data_path, split_file))
    for split_file in ('train.csv', 'validation.csv', 'test.csv')
)

# Report the (rows, columns) of each split, one per line
print(
    f"Train shape: {train_df.shape}",
    f"Val shape: {val_df.shape}",
    f"Test shape: {test_df.shape}",
    sep='\n',
)

## 2. Define Columns

In [None]:
# Columns handled as categories (imputed with the most frequent value, then
# one-hot encoded). The order is significant: the encoded output columns are
# generated in this order.
categorical_cols = [
    # customer attributes
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    # services
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    # contract and billing
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]

# Columns handled as numbers (median-imputed, then standardised)
numerical_cols = [
    'tenure',
    'MonthlyCharges',
    'TotalCharges',
]

# Name of the label column
target = 'Churn'

## 3. Data Cleaning

Convert `TotalCharges` to numeric, coercing errors to NaN.

In [None]:
# Coerce TotalCharges to a numeric dtype in every split; entries that cannot
# be parsed become NaN (errors='coerce') instead of raising.
for split_df in (train_df, val_df, test_df):
    split_df['TotalCharges'] = pd.to_numeric(split_df['TotalCharges'], errors='coerce')

# Count the missing values per numerical column in the training split
print(
    "Train Nulls after conversion:",
    train_df[numerical_cols].isnull().sum(),
    sep='\n',
)

## 4. Preprocessing Pipeline

We will use `ColumnTransformer` to apply different transformations to numerical and categorical columns.
We include `SimpleImputer` to handle missing values (like those introduced in `TotalCharges`).

- **Numerical Columns**: `SimpleImputer(median)` -> `StandardScaler`
- **Categorical Columns**: `SimpleImputer(most_frequent)` -> `OneHotEncoder`

In [None]:
# Numerical branch: fill missing values with the median, then standardise
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode into a dense array. handle_unknown='ignore' keeps
# transform() from raising on categories that were not seen during fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# Route each column group to its branch; columns in neither list are dropped.
# The preprocessor is fitted on the training data ONLY, so imputation values,
# scaling statistics and category lists are all learned from the train split.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ],
    remainder='drop',
).fit(train_df)

## 5. Transform Data

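Transform the training and validation sets with the fitted preprocessor. **The test set is not transformed or saved in this notebook**; only its `TotalCharges` column was converted to numeric in Section 3.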

In [None]:
# Output column labels: numerical names first, then the names generated by the
# fitted one-hot encoder (same order as the 'num' and 'cat' branches)
onehot_features = (
    preprocessor
    .named_transformers_['cat']
    .named_steps['onehot']
    .get_feature_names_out(categorical_cols)
)
all_features = [*numerical_cols, *onehot_features]

# Training split: apply the fitted preprocessor and wrap the array as a DataFrame
X_train_transformed = preprocessor.transform(train_df)
X_train_processed = pd.DataFrame(X_train_transformed, columns=all_features)

# Validation split: same fitted preprocessor, same column labels
X_val_transformed = preprocessor.transform(val_df)
X_val_processed = pd.DataFrame(X_val_transformed, columns=all_features)

print(
    f"Processed Train shape: {X_train_processed.shape}",
    f"Processed Val shape: {X_val_processed.shape}",
    sep='\n',
)

## 6. Target Encoding

Encode the target variable `Churn`.

In [None]:
# Learn the class -> integer mapping from the training target and encode it;
# the encoded target is appended to the processed training frame (optional)
le = LabelEncoder()
y_train_encoded = le.fit_transform(train_df[target])
X_train_processed[target] = y_train_encoded

# Encode the validation target with the mapping already fitted on train
y_val_encoded = le.transform(val_df[target])
X_val_processed[target] = y_val_encoded

# Show which integer each original class label was mapped to
print(
    "Target Class Mapping:",
    dict(zip(le.classes_, le.transform(le.classes_))),
)
print(f"Processed Train with Target shape: {X_train_processed.shape}")

## 7. Save Processed Data

Save the processed datasets to `data/processed`, and the fitted preprocessor and label encoder to `models`.

In [None]:
# Output locations; both are created if they do not exist yet
processed_path = '../data/processed'
models_path = '../models'
for out_dir in (processed_path, models_path):
    os.makedirs(out_dir, exist_ok=True)

# Write the processed splits as CSV without the index column
for csv_name, frame in (
    ('train_processed.csv', X_train_processed),
    ('val_processed.csv', X_val_processed),
):
    frame.to_csv(os.path.join(processed_path, csv_name), index=False)

# Persist the fitted preprocessor and label encoder with joblib
for artifact_name, fitted_obj in (
    ('preprocessor.joblib', preprocessor),
    ('label_encoder.joblib', le),
):
    joblib.dump(fitted_obj, os.path.join(models_path, artifact_name))

print("Data and models saved successfully.")