In [3]:
"""
Telco Customer Churn Prediction Pipeline
A complete end-to-end machine learning pipeline for predicting customer churn
using scikit-learn's Pipeline API with hyperparameter tuning.

"""
# IMPORTS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import joblib
import warnings
warnings.filterwarnings('ignore')

# Fix one seed for the whole script: it seeds NumPy's global generator here
# and is passed as `random_state` to the split and the estimators below.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Opening banner: a 70-character rule above and below the title.
print("=" * 70, "TELCO CUSTOMER CHURN PREDICTION PIPELINE", "=" * 70, sep="\n")

# STEP 1: LOAD THE DATASET
print("\n[STEP 1] Loading Telco Churn Dataset...")

# Remote CSV holding the Telco customer churn data.
url = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"

# Only the download/parse call is guarded. The success messages live in the
# `else` branch so that an error raised while printing them can never be
# mistaken for a download failure and silently replace the real data with
# synthetic rows.
try:
    df = pd.read_csv(url)
except Exception as e:
    # Deliberate best-effort fallback: whatever went wrong while fetching or
    # parsing the CSV, report it and continue with a generated stand-in so the
    # remaining steps can still be demonstrated.
    print(f" Error loading dataset: {e}")
    print("  Creating sample dataset for demonstration...")
    # Re-seed so the synthetic frame is identical on every run.
    np.random.seed(RANDOM_STATE)
    n_samples = 1000
    # Every column is drawn independently, so this data carries no real
    # relationship between the features and 'Churn'.
    df = pd.DataFrame({
        'gender': np.random.choice(['Male', 'Female'], n_samples),
        'SeniorCitizen': np.random.choice([0, 1], n_samples),
        'Partner': np.random.choice(['Yes', 'No'], n_samples),
        'Dependents': np.random.choice(['Yes', 'No'], n_samples),
        'tenure': np.random.randint(0, 73, n_samples),
        'PhoneService': np.random.choice(['Yes', 'No'], n_samples),
        'MultipleLines': np.random.choice(['Yes', 'No', 'No phone service'], n_samples),
        'InternetService': np.random.choice(['DSL', 'Fiber optic', 'No'], n_samples),
        'OnlineSecurity': np.random.choice(['Yes', 'No', 'No internet service'], n_samples),
        'OnlineBackup': np.random.choice(['Yes', 'No', 'No internet service'], n_samples),
        'DeviceProtection': np.random.choice(['Yes', 'No', 'No internet service'], n_samples),
        'TechSupport': np.random.choice(['Yes', 'No', 'No internet service'], n_samples),
        'StreamingTV': np.random.choice(['Yes', 'No', 'No internet service'], n_samples),
        'StreamingMovies': np.random.choice(['Yes', 'No', 'No internet service'], n_samples),
        'Contract': np.random.choice(['Month-to-month', 'One year', 'Two year'], n_samples),
        'PaperlessBilling': np.random.choice(['Yes', 'No'], n_samples),
        'PaymentMethod': np.random.choice(['Electronic check', 'Mailed check', 'Bank transfer', 'Credit card'], n_samples),
        'MonthlyCharges': np.random.uniform(18, 120, n_samples),
        # Stored as text; presumably so the later numeric-coercion step is
        # exercised the same way as with the downloaded file.
        'TotalCharges': np.random.uniform(18, 8500, n_samples).astype(str),
        # Roughly 27% positive class.
        'Churn': np.random.choice(['Yes', 'No'], n_samples, p=[0.27, 0.73])
    })
else:
    print(" Dataset loaded successfully!")
    print(f"  Shape: {df.shape[0]} rows × {df.shape[1]} columns")

# Display basic information: the first rows, then the column/dtype summary.
print("\n[Dataset Overview]")
print(df.head())
print("\n[Dataset Info]")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

# STEP 2: DATA PREPROCESSING
print("\n" + "=" * 70)
print("[STEP 2] Data Preprocessing...")
print("=" * 70)

# Handle missing values in TotalCharges (often stored as spaces)
print("\n[2.1] Handling missing values...")
# errors='coerce' turns every entry that cannot be parsed as a number into NaN.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
# Assign the filled column back instead of calling fillna(inplace=True) on
# df['TotalCharges']: the chained in-place form operates on an intermediate
# object, is deprecated in recent pandas and leaves `df` unchanged under
# Copy-on-Write.
# NOTE(review): the median is taken over the full dataset, i.e. before the
# train/test split performed in step 3.
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())
print(f" Missing values handled. Total missing: {df.isnull().sum().sum()}")

# Drop customerID if it exists (not useful for prediction)
if 'customerID' in df.columns:
    df.drop('customerID', axis=1, inplace=True)
    print(" Dropped 'customerID' column")

# Encode target variable
print("\n[2.2] Encoding target variable (Churn)...")
# The fitted encoder is kept in `label_encoder` so later steps can map the
# integer codes back to the original labels.
label_encoder = LabelEncoder()
label_encoder.fit(df['Churn'])
df['Churn'] = label_encoder.transform(df['Churn'])
# The message below assumes the column holds exactly 'No' and 'Yes'
# (LabelEncoder numbers classes in sorted order).
print(" Churn encoded: No=0, Yes=1")
class_counts = df['Churn'].value_counts().to_dict()
print(f"  Class distribution: {class_counts}")

# Separate features and target
X = df.drop('Churn', axis=1)
y = df['Churn']

# Identify numeric and categorical columns.
# 'number' matches every integer/float width (not only int64/float64), and
# the categorical list is its exact complement, so each feature column lands
# in exactly one list. With hard-coded dtype names, a column of any other
# dtype (e.g. int32, category, string) would be in neither list and would be
# silently left out of the ColumnTransformer built in step 4.
numeric_features = X.select_dtypes(include='number').columns.tolist()
categorical_features = X.select_dtypes(exclude='number').columns.tolist()

print("\n[2.3] Feature types identified:")
print(f"  Numeric features ({len(numeric_features)}): {numeric_features}")
print(f"  Categorical features ({len(categorical_features)}): {categorical_features}")

# STEP 3: SPLIT DATA
print("\n" + "=" * 70)
print("[STEP 3] Splitting data into train and test sets...")
print("=" * 70)

# Hold out 20% of the rows for testing. `stratify=y` asks for the class
# proportions of `y` to be preserved in both parts; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

print("✓ Data split completed:")
print(f"  Training set: {len(X_train)} samples")
print(f"  Test set: {len(X_test)} samples")
# The target is 0/1 encoded, so its mean is the share of churners.
for split_label, split_target in (("Train", y_train), ("Test", y_test)):
    print(f"  {split_label} churn rate: {split_target.mean():.2%}")

# STEP 4: CREATE PREPROCESSING PIPELINE
print("\n" + "=" * 70)
print("[STEP 4] Creating preprocessing pipeline...")
print("=" * 70)

from sklearn.preprocessing import OneHotEncoder

# Numeric branch: standardise the numeric columns.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical branch: dense one-hot encoding. handle_unknown='ignore' lets
# transform() accept categories that were not present when the encoder was
# fitted instead of raising.
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
categorical_transformer = Pipeline([('onehot', onehot_encoder)])

# Route each column list to its branch; the outputs are concatenated.
column_branches = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

print(" Preprocessing pipeline created:")
print("  - Numeric features: StandardScaler")
print("  - Categorical features: OneHotEncoder")

# STEP 5: CREATE MODEL PIPELINES
print("\n" + "=" * 70)
print("[STEP 5] Creating model pipelines...")
print("=" * 70)

# Each pipeline chains the shared preprocessing step with one classifier.
# Both reference the same `preprocessor` object. The step names
# ('preprocessor', 'classifier') are what the `classifier__*` keys of the
# parameter grids in step 6 refer to.

# Pipeline 1: Logistic Regression (iteration cap raised to 1000)
lr_classifier = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
lr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', lr_classifier),
])

# Pipeline 2: Random Forest
rf_classifier = RandomForestClassifier(random_state=RANDOM_STATE)
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', rf_classifier),
])

print(
    " Created two pipelines:",
    "  1. Logistic Regression Pipeline",
    "  2. Random Forest Pipeline",
    sep="\n",
)

# STEP 6: HYPERPARAMETER TUNING WITH GRIDSEARCHCV
print("\n" + "=" * 70)
print("[STEP 6] Hyperparameter tuning with GridSearchCV...")
print("=" * 70)

# Logistic Regression search
print("\n[6.1] Tuning Logistic Regression...")

# Keys follow the `<step name>__<parameter>` form so each value is routed to
# the pipeline's 'classifier' step. 4 x 1 x 2 = 8 candidate settings.
lr_param_grid = dict(
    classifier__C=[0.01, 0.1, 1, 10],
    classifier__penalty=['l2'],
    classifier__solver=['lbfgs', 'liblinear'],
)

# 5-fold cross-validation scored by F1, using all available cores.
lr_grid_search = GridSearchCV(
    estimator=lr_pipeline,
    param_grid=lr_param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
lr_grid_search.fit(X_train, y_train)

print(" Logistic Regression tuning complete!")
print(f"  Best parameters: {lr_grid_search.best_params_}")
print(f"  Best CV F1-score: {lr_grid_search.best_score_:.4f}")

# Random Forest search
print("\n[6.2] Tuning Random Forest...")

# Keys follow the `<step name>__<parameter>` form so each value is routed to
# the pipeline's 'classifier' step. 3 x 3 x 2 x 2 = 36 candidate settings.
rf_param_grid = dict(
    classifier__n_estimators=[50, 100, 200],
    classifier__max_depth=[10, 20, None],
    classifier__min_samples_split=[2, 5],
    classifier__min_samples_leaf=[1, 2],
)

# 5-fold cross-validation scored by F1, using all available cores.
rf_grid_search = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=rf_param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
rf_grid_search.fit(X_train, y_train)

print(" Random Forest tuning complete!")
print(f"  Best parameters: {rf_grid_search.best_params_}")
print(f"  Best CV F1-score: {rf_grid_search.best_score_:.4f}")

# STEP 7: MODEL EVALUATION
print("\n" + "=" * 70)
print("[STEP 7] Evaluating models on test set...")
print("=" * 70)

# The two fitted grid searches; calling predict() on them uses the refit best
# configuration found during the search.
models = {
    'Logistic Regression': lr_grid_search,
    'Random Forest': rf_grid_search
}

# name -> {'accuracy', 'f1_score', 'cv_f1_score', 'model'}
results = {}

for name, model in models.items():
    print(f"\n[{name}]")
    print("-" * 50)

    # Make predictions on the held-out rows
    y_pred = model.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results[name] = {
        'accuracy': accuracy,
        'f1_score': f1,
        # Mean cross-validated F1 of the best configuration, measured on the
        # training data only; used below as the selection criterion.
        'cv_f1_score': model.best_score_,
        'model': model
    }

    print(f"Accuracy:  {accuracy:.4f}")
    print(f"F1-Score:  {f1:.4f}")
    print("\nClassification Report:")
    # Display names assume label 0 = no churn and label 1 = churn.
    print(classification_report(y_test, y_pred, target_names=['No Churn', 'Churn']))

    # Confusion Matrix (rows: true label, columns: predicted label)
    cm = confusion_matrix(y_test, y_pred)
    print("\nConfusion Matrix:")
    print(cm)

# Select the best model by cross-validated F1 rather than by test-set F1.
# Picking the winner on the test set would let the hold-out data take part in
# model selection, making the test metrics reported for the winner
# optimistically biased. Both searches use the same scoring and fold count,
# so their best_score_ values are directly comparable.
best_model_name = max(results, key=lambda x: results[x]['cv_f1_score'])
best_model = results[best_model_name]['model']

print("\n" + "=" * 70)
print(f"BEST MODEL: {best_model_name}")
print(f"  CV F1-Score (selection criterion): {results[best_model_name]['cv_f1_score']:.4f}")
print(f"  Accuracy: {results[best_model_name]['accuracy']:.4f}")
print(f"  F1-Score: {results[best_model_name]['f1_score']:.4f}")
print("=" * 70)

# STEP 8: VISUALIZATIONS
print("\n[STEP 8] Creating visualizations...")

# One figure, two panels: metric comparison (left), confusion matrix (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
comparison_ax, confusion_ax = axes

# Shared text styling for axis labels and titles.
label_font = dict(fontsize=12, fontweight='bold')
title_font = dict(fontsize=14, fontweight='bold')

# Plot 1: Model Comparison
model_names = list(results)
metrics_df = pd.DataFrame({
    'Model': model_names,
    'Accuracy': [results[m]['accuracy'] for m in model_names],
    'F1-Score': [results[m]['f1_score'] for m in model_names],
})

x_pos = np.arange(len(metrics_df))
width = 0.35

# Two bars per model, shifted half a bar width to either side of the tick.
for offset, metric in ((-width / 2, 'Accuracy'), (width / 2, 'F1-Score')):
    comparison_ax.bar(x_pos + offset, metrics_df[metric], width, label=metric, alpha=0.8)
comparison_ax.set_xlabel('Model', **label_font)
comparison_ax.set_ylabel('Score', **label_font)
comparison_ax.set_title('Model Performance Comparison', **title_font)
comparison_ax.set_xticks(x_pos)
comparison_ax.set_xticklabels(metrics_df['Model'], rotation=15, ha='right')
comparison_ax.legend()
comparison_ax.grid(axis='y', alpha=0.3)
comparison_ax.set_ylim([0, 1])

# Plot 2: Confusion Matrix for Best Model
y_pred_best = best_model.predict(X_test)
cm_best = confusion_matrix(y_test, y_pred_best)
class_names = ['No Churn', 'Churn']

sns.heatmap(
    cm_best,
    annot=True,
    fmt='d',
    cmap='Blues',
    ax=confusion_ax,
    xticklabels=class_names,
    yticklabels=class_names,
)
confusion_ax.set_xlabel('Predicted Label', **label_font)
confusion_ax.set_ylabel('True Label', **label_font)
confusion_ax.set_title(f'Confusion Matrix - {best_model_name}', **title_font)

plt.tight_layout()
# The file is written under /content; the message below shows only its name.
plt.savefig('/content/churn_model_evaluation.png', dpi=300, bbox_inches='tight')
print(" Visualization saved: churn_model_evaluation.png")
plt.close()

# STEP 9: SAVE THE PIPELINE
print("\n" + "=" * 70)
print("[STEP 9] Saving the complete pipeline...")
print("=" * 70)

# Persist the selected model and the target encoder side by side.
# `best_model` is the fitted grid-search object chosen in step 7, so the
# pickle contains the whole search, not only the winning pipeline.
pipeline_filename = '/content/telco_churn_pipeline.pkl'
encoder_filename = '/content/label_encoder.pkl'

artifacts = (
    ("Pipeline", best_model, pipeline_filename),
    ("Label encoder", label_encoder, encoder_filename),
)
for artifact_label, artifact, artifact_path in artifacts:
    joblib.dump(artifact, artifact_path)
    print(f" {artifact_label} saved: {artifact_path}")

# Round-trip check: reload the file just written and predict on the first
# five test rows.
print("\n[9.1] Verifying saved pipeline...")
loaded_pipeline = joblib.load(pipeline_filename)
test_predictions = loaded_pipeline.predict(X_test.iloc[:5])
print(" Pipeline loaded and tested successfully!")
print(f"  Sample predictions: {test_predictions}")

# STEP 10: USAGE EXAMPLE
print("\n" + "=" * 70)
print("[STEP 10] Example: Making predictions with the pipeline")
print("=" * 70)

# Printed verbatim as a copy-paste snippet; none of it is executed here.
usage_snippet = (
    "\n# Load the pipeline",
    "import joblib",
    "pipeline = joblib.load('telco_churn_pipeline.pkl')",
    "\n# Make predictions on new data",
    "predictions = pipeline.predict(new_customer_data)",
    "\n# Get prediction probabilities",
    "probabilities = pipeline.predict_proba(new_customer_data)",
)
for snippet_line in usage_snippet:
    print(snippet_line)

# Demonstrate with actual data: score the first three test rows using the
# pipeline reloaded from disk.
print("\n[Live Example]")
sample_customer = X_test.iloc[:3]
predictions = loaded_pipeline.predict(sample_customer)
probabilities = loaded_pipeline.predict_proba(sample_customer)

# Map the integer predictions back to the original target labels.
churn_labels = label_encoder.inverse_transform(predictions)
for customer_no, (churn_label, class_probs) in enumerate(zip(churn_labels, probabilities), start=1):
    # Second probability column; presumably the churn ('Yes') class given the
    # 0/1 target encoding.
    churn_prob = class_probs[1]
    print(f"\nCustomer {customer_no}:")
    print(f"  Prediction: {churn_label}")
    print(f"  Churn Probability: {churn_prob:.2%}")

# FINAL SUMMARY
# Recap of the run: data size, feature counts, the selected model with its
# test metrics, and the artefacts written by the earlier steps.
best_metrics = results[best_model_name]

print("\n" + "=" * 70)
print("PIPELINE EXECUTION COMPLETE!")
print("=" * 70)

print("\nSummary:")
summary_lines = [
    f" Dataset loaded: {df.shape[0]} customers",
    f" Features processed: {len(numeric_features)} numeric, {len(categorical_features)} categorical",
    " Models trained: Logistic Regression & Random Forest",
    f" Best model: {best_model_name}",
    f" Test Accuracy: {best_metrics['accuracy']:.4f}",
    f" Test F1-Score: {best_metrics['f1_score']:.4f}",
]
print("\n".join(summary_lines))

print("\nFiles created:")
created_files = [
    "  1. telco_churn_pipeline.pkl (trained model)",
    "  2. label_encoder.pkl (target encoder)",
    "  3. churn_model_evaluation.png (visualizations)",
]
print("\n".join(created_files))
print("\n" + "=" * 70)

TELCO CUSTOMER CHURN PREDICTION PIPELINE

[STEP 1] Loading Telco Churn Dataset...
✓ Dataset loaded successfully!
  Shape: 7043 rows × 21 columns

[Dataset Overview]
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes