In [None]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pickle

## 1. Load and Inspect Data

In [None]:
# Read the raw Telco churn CSV into the working DataFrame.
df = pd.read_csv('Telco-Customer-Churn.csv')

# Quick inspection: dimensions, a sample of rows, column dtypes, and the
# per-column count of missing values. Each heading and its value are printed
# on separate lines via sep='\n'.
print('Dataset shape:', df.shape)
print('\nFirst few rows:', df.head(), sep='\n')
print('\nData types:', df.dtypes, sep='\n')
print('\nMissing values:', df.isna().sum(), sep='\n')

## 2. Clean and Handle Missing Values

In [None]:
# TotalCharges may hold non-numeric entries; coerce those to NaN so the
# dropna() below removes them.
if 'TotalCharges' in df:
    df = df.assign(TotalCharges=pd.to_numeric(df['TotalCharges'], errors='coerce'))

# Remove every row that contains a missing value (including the NaNs produced
# by the coercion above), renumber the index, and discard the customerID
# identifier, which is not useful for modeling. errors='ignore' makes the
# column drop a no-op when customerID is absent.
df = (
    df.dropna()
      .reset_index(drop=True)
      .drop(columns=['customerID'], errors='ignore')
)

# Report the resulting shape and confirm that no missing values remain.
print('After cleaning shape:', df.shape)
print('Missing values after cleaning:')
print(df.isna().sum().sum(), 'total missing values')

## 3. Encode Target Variable

In [None]:
# Encode Churn: Yes -> 1, No -> 0
churn_codes = {'Yes': 1, 'No': 0}

# Series.map turns every value that is not a key of the mapping into NaN, so
# mapping an already-encoded column a second time (e.g. re-running this cell)
# would silently replace the whole target with NaN. Only encode while the
# column still holds something other than the 0/1 codes.
if not df['Churn'].isin(list(churn_codes.values())).all():
    churn_encoded = df['Churn'].map(churn_codes)

    # Fail loudly on labels outside the mapping instead of passing NaN
    # targets on to later cells.
    unmapped = churn_encoded.isna()
    if unmapped.any():
        raise ValueError(
            f"Unexpected Churn labels: {df.loc[unmapped, 'Churn'].unique().tolist()}"
        )

    df['Churn'] = churn_encoded.astype('int64')

# Class balance: counts per class, then the mean of the 0/1 target (the churn rate).
print('Target variable (Churn) distribution:')
print(df['Churn'].value_counts())
print('\nChurn rate:', round(df['Churn'].mean(), 4))

## 4. Identify Feature Types

In [None]:
# Churn is the label; every other column of df is a feature.
X = df.drop(columns='Churn')
y = df['Churn']

# The numeric features are listed by hand; every remaining feature column,
# in its original order, is treated as categorical.
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
categorical_cols = X.columns[~X.columns.isin(numeric_cols)].tolist()

# Show how many columns fall into each group, and which ones.
print('Numeric columns:', len(numeric_cols))
print(numeric_cols)
print('\nCategorical columns:', len(categorical_cols))
print(categorical_cols)

## 5. Build Preprocessing Pipelines

In [None]:
# One single-step pipeline per feature type: numeric columns go through
# StandardScaler, categorical columns through a OneHotEncoder configured with
# handle_unknown='ignore' and sparse_output=False.
numeric_transformer = Pipeline([('scaler', StandardScaler())])
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# Route each column group to its transformer inside one ColumnTransformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Summarize what was assembled.
print('Preprocessor created with:')
print('  - Numeric transformer (StandardScaler) for:', numeric_cols)
print('  - Categorical transformer (OneHotEncoder) for', len(categorical_cols), 'categorical features')

## 6. Apply Train/Test Split

In [None]:
# Hold out 20% of the rows as a test set. Stratifying on y keeps the churn
# proportion the same in both partitions; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# Partition sizes.
print('Train set size:', X_train.shape)
print('Test set size:', X_test.shape)

# Churn rate (mean of the 0/1 target) in each partition.
print('\nTrain churn rate:', round(y_train.mean(), 4))
print('Test churn rate:', round(y_test.mean(), 4))

## 7. Save Preprocessed Data and Preprocessor

In [None]:
# Save the cleaned train/test splits and their metadata for the next notebook.
# pickle is already imported in the notebook's import cell, so it is not
# re-imported here.

# Data splits plus the column lists the preprocessor was configured with.
# The feature frames are written as-is: they have been cleaned and split,
# but not passed through the preprocessor.
with open('preprocessed_data.pkl', 'wb') as f:
    pickle.dump({
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
        'numeric_cols': numeric_cols,
        'categorical_cols': categorical_cols
    }, f)

# The ColumnTransformer itself.
# NOTE(review): no cell shown in this notebook calls fit() on the
# preprocessor, so it appears to be saved unfitted; the consuming notebook
# would then need to fit it on X_train before transforming. Confirm.
with open('preprocessor.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)

print('Saved: preprocessed_data.pkl')
print('Saved: preprocessor.pkl')