# Customer Churn Analysis

This notebook contains analysis of customer churn data to identify patterns and predictors of customer attrition.

## Setup

Import necessary libraries for data analysis and visualization.

In [None]:
# Data manipulation libraries
import pandas as pd
import numpy as np

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Machine learning libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

# XGBoost
import xgboost as xgb

# Interactive dashboard libraries
import ipywidgets as widgets
from IPython.display import display, HTML

# Plot styling: reset matplotlib to its defaults, then apply seaborn's whitegrid theme
plt.style.use('default')
sns.set_style("whitegrid")

# Suppress every warning category so library notices do not clutter cell output
import warnings
warnings.simplefilter('ignore')

# Seed NumPy's global random generator so stochastic steps are repeatable
np.random.seed(42)

print("Libraries loaded successfully!")

## Step 1: Data Loading

Load the customer churn dataset.

In [None]:
# Load the raw churn export (path is relative to the notebook's working directory)
df = pd.read_csv('Telco-Customer-Churn.csv')

# Size of the dataset and a peek at the first records
print("Dataset shape:", df.shape)
print("\nFirst 5 rows of the dataset:")
display(df.head())

# Column dtypes and non-null counts.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would append a stray "None" to the output.
print("\nData types and non-null counts:")
df.info()

# Summary statistics for the numerical columns
print("\nSummary statistics:")
display(df.describe())

# Missing values per column
print("\nMissing values in each column:")
display(df.isnull().sum())

# Cardinality and the three most frequent values of every text (object) column
print("\nUnique values in each column:")
for col in df.select_dtypes(include='object').columns:
    print(f"{col}: {df[col].nunique()} unique values")
    print(df[col].value_counts().head(3))
    print()

# Target distribution in percent. value_counts() sorts by frequency, so
# position 0 is the larger class and position 1 the smaller one.
print("\nTarget variable distribution:")
churn_distribution = df['Churn'].value_counts(normalize=True) * 100
display(churn_distribution)
print(f"Class imbalance ratio: 1:{round(churn_distribution.iloc[0]/churn_distribution.iloc[1], 2)}")

# Bar chart of the raw class counts
plt.figure(figsize=(8, 6))
sns.countplot(x='Churn', data=df)
plt.title('Distribution of Customer Churn')
plt.ylabel('Count')
plt.show()

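## Step 2: Checking multicollinearity

Inspect the pairwise correlations between the numerical features to spot strongly correlated (redundant) predictors before modelling.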

In [None]:
# Correlation matrix of the numerical features (multicollinearity check)

# 'TotalCharges' is still stored as text at this point in the notebook (it is
# only converted to numeric in the preprocessing step), so it would silently be
# left out of the matrix. Coerce it on a temporary frame; `df` itself is not
# modified here. Unparseable entries become NaN and are skipped pairwise by corr().
corr_source = df.assign(TotalCharges=pd.to_numeric(df['TotalCharges'], errors='coerce'))

# Numerical columns only; the target is excluded if it happens to be numeric
numeric_cols = corr_source.select_dtypes(include=['float64', 'int64']).columns.tolist()
if 'Churn' in numeric_cols:
    numeric_cols.remove('Churn')

# A correlation matrix needs at least two columns to be meaningful
if len(numeric_cols) > 1:
    print("\nCorrelation matrix for numerical features (check for multicollinearity):")
    corr_matrix = corr_source[numeric_cols].corr()
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
    plt.title('Correlation Matrix for Numerical Features')
    plt.tight_layout()
    plt.show()




## Step 3: Exploratory Data Analysis

First, let's explore each feature's contribution to the target variable.

In [None]:
# EDA - Part 1: churn rate across the key categorical features
key_categorical_features = [
    'gender', 'Partner', 'Dependents', 'PhoneService',
    'InternetService', 'Contract', 'PaperlessBilling', 'PaymentMethod',
]

# Boolean "customer churned" flag, computed once; the mean of a boolean
# series is the share of True values, i.e. the churn rate of a group
churned = df['Churn'] == 'Yes'

# One bar chart per feature
for feature in key_categorical_features:
    plt.figure(figsize=(8, 5))

    # Percentage of churned customers within each category of the feature
    churn_rate = churned.groupby(df[feature]).mean() * 100

    ax = churn_rate.plot(kind='bar', color='skyblue')
    plt.title(f'Churn Rate by {feature}')
    plt.ylabel('Churn Rate (%)')
    plt.xlabel(feature)

    # Write the percentage slightly above each bar
    for position, rate in enumerate(churn_rate):
        plt.text(position, rate + 1, f"{rate:.1f}%", ha='center')

    plt.tight_layout()
    plt.show()

In [None]:
# EDA - Part 2: Numerical features
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']

# 'TotalCharges' is still stored as text at this point in the notebook (it is
# only converted to numeric in the preprocessing step). A boxplot needs numeric
# values, so plot from a temporary frame with the column coerced; `df` itself
# is left untouched and unparseable entries become NaN.
plot_df = df.assign(TotalCharges=pd.to_numeric(df['TotalCharges'], errors='coerce'))

# One boxplot per numerical feature, split by churn status
for feature in numerical_features:
    plt.figure(figsize=(8, 5))
    sns.boxplot(x='Churn', y=feature, data=plot_df)
    plt.title(f'{feature} by Churn Status')
    plt.tight_layout()
    plt.show()

# Binary target (1 = churned) used by the rate calculations in the cells below
df['Churn_Binary'] = (df['Churn'] == 'Yes').astype(int)

# Tenure bins for easier interpretation.
# pd.cut intervals are right-closed, so without include_lowest=True the first
# bin would be (0, 12] and customers with tenure 0 would get NaN instead of '0-12'.
df['tenure_group'] = pd.cut(df['tenure'],
                            bins=[0, 12, 24, 36, 48, 60, 72],
                            labels=['0-12', '13-24', '25-36', '37-48', '49-60', '61-72'],
                            include_lowest=True)

# Churn rate per tenure group; observed=False keeps every bin on the axis,
# including bins that contain no customers
plt.figure(figsize=(8, 5))
tenure_churn = df.groupby('tenure_group', observed=False)['Churn_Binary'].mean() * 100
tenure_churn.plot(kind='bar', color='skyblue')
plt.title('Churn Rate by Tenure Group (Months)')
plt.ylabel('Churn Rate (%)')
plt.xlabel('Tenure Group')
for i, v in enumerate(tenure_churn):
    plt.text(i, v + 1, f"{v:.1f}%", ha='center')
plt.tight_layout()
plt.show()

In [None]:
# EDA - Part 3: Key insights plots

# Churn rate (%) per contract type and per internet service type
contract_churn = df.groupby('Contract')['Churn_Binary'].mean() * 100
internet_churn = df.groupby('InternetService')['Churn_Binary'].mean() * 100

# Both bar charts share the same layout, so draw them from one spec table:
# (rates, figure title, x-axis label)
bar_chart_specs = [
    (contract_churn, 'Churn Rate by Contract Type', 'Contract Type'),
    (internet_churn, 'Churn Rate by Internet Service Type', 'Internet Service'),
]
for rates, chart_title, x_label in bar_chart_specs:
    plt.figure(figsize=(8, 5))
    rates.plot(kind='bar', color='skyblue')
    plt.title(chart_title)
    plt.ylabel('Churn Rate (%)')
    plt.xlabel(x_label)
    # Percentage label slightly above each bar
    for position, rate in enumerate(rates):
        plt.text(position, rate + 1, f"{rate:.1f}%", ha='center')
    plt.tight_layout()
    plt.show()

# Histogram (with KDE overlay) of monthly charges, coloured by churn status
plt.figure(figsize=(8, 5))
sns.histplot(data=df, x='MonthlyCharges', hue='Churn', bins=20, kde=True)
plt.title('Monthly Charges Distribution by Churn Status')
plt.tight_layout()
plt.show()

# Scatter of tenure against monthly charges, coloured by churn status
plt.figure(figsize=(8, 5))
sns.scatterplot(data=df, x='tenure', y='MonthlyCharges', hue='Churn')
plt.title('Monthly Charges vs Tenure by Churn Status')
plt.tight_layout()
plt.show()

In [None]:
# 1. Visualizing Payment Methods vs Churn

# Churn rate (%) per payment method. The sorted series is computed once and
# reused, so the bars and their labels always share the same order.
plt.figure(figsize=(10, 6))
payment_churn = df.groupby('PaymentMethod')['Churn_Binary'].mean() * 100
payment_churn_sorted = payment_churn.sort_values(ascending=False)
payment_churn_sorted.plot(kind='bar', color='skyblue')
plt.title('Churn Rate by Payment Method', fontsize=14)
plt.xlabel('Payment Method', fontsize=12)
plt.ylabel('Churn Rate (%)', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Percentage label on top of each bar
for i, v in enumerate(payment_churn_sorted):
    plt.text(i, v + 1, f"{v:.1f}%", ha='center', fontweight='bold')

plt.tight_layout()
plt.show()

# Number of customers per payment method, split by churn status
plt.figure(figsize=(12, 7))
ax = sns.countplot(x='PaymentMethod', hue='Churn', data=df, palette=['#3498db', '#e74c3c'])
plt.title('Customer Distribution by Payment Method and Churn Status', fontsize=14)
plt.xlabel('Payment Method', fontsize=12)
plt.ylabel('Number of Customers', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.legend(title='Churn Status')

# Count label on top of each bar
for p in ax.patches:
    height = p.get_height()
    # Patches without a positive, finite height have nothing to label
    if not np.isfinite(height) or height <= 0:
        continue
    # get_height() returns a float; counts are whole numbers, so print them
    # as integers ("1286" rather than "1286.0")
    ax.text(p.get_x() + p.get_width()/2., height + 20, f'{int(height)}',
            ha='center', fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# 2. Visualizing Additional Services vs Churn

# Service columns to inspect
service_cols = ['PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup',
                'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']

# One subplot per service on a 4x2 grid
plt.figure(figsize=(15, 12))

for i, service in enumerate(service_cols, 1):
    plt.subplot(4, 2, i)

    # Churn rate (%) for each category of the service, highest first.
    # Sorted once so that bars and labels share the same order.
    service_churn = df.groupby(service)['Churn_Binary'].mean() * 100
    service_churn_sorted = service_churn.sort_values(ascending=False)
    service_churn_sorted.plot(kind='bar', color='lightgreen')

    plt.title(f'Churn Rate by {service}', fontsize=12)
    plt.ylabel('Churn Rate (%)')
    plt.ylim(0, service_churn.max() * 1.2)  # 20% headroom so the labels fit

    # Percentage label on top of each bar
    for j, v in enumerate(service_churn_sorted):
        plt.text(j, v + 1, f"{v:.1f}%", ha='center', fontweight='bold')

plt.tight_layout()
plt.subplots_adjust(hspace=0.5)
plt.show()

# Internet-related add-on services only
internet_services = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                     'TechSupport', 'StreamingTV', 'StreamingMovies']

# Binary indicator per service (1 if 'Yes', 0 otherwise), stored on df
for service in internet_services:
    df[f'{service}_Yes'] = (df[service] == 'Yes').astype(int)

# Customers that have an internet subscription.
# .copy() gives an independent frame: adding a column to a filtered slice of
# df is a chained assignment, which pandas only reports as a warning (and
# warnings are silenced in this notebook).
internet_customers = df[df['InternetService'] != 'No'].copy()
internet_customers['ServiceCount'] = internet_customers[[f'{service}_Yes' for service in internet_services]].sum(axis=1)

# Churn rate by number of subscribed add-on services
plt.figure(figsize=(10, 6))
service_count_churn = internet_customers.groupby('ServiceCount')['Churn_Binary'].mean() * 100
service_count_churn.plot(kind='bar', color='#9b59b6')
plt.title('Churn Rate by Number of Additional Internet Services', fontsize=14)
plt.xlabel('Number of Additional Services', fontsize=12)
plt.ylabel('Churn Rate (%)', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Percentage label on top of each bar
for i, v in enumerate(service_count_churn):
    plt.text(i, v + 1, f"{v:.1f}%", ha='center', fontweight='bold')

plt.tight_layout()
plt.show()

# Heatmap of the churn rate (%) for every InternetService x Contract combination
plt.figure(figsize=(12, 8))
pivot_table = df.pivot_table(values='Churn_Binary',
                             index='InternetService',
                             columns='Contract',
                             aggfunc='mean') * 100

sns.heatmap(pivot_table, annot=True, cmap='YlGnBu', fmt='.1f', linewidths=0.5)
plt.title('Churn Rate (%) by Internet Service and Contract Type', fontsize=14)
plt.tight_layout()
plt.show()

## Step 4: Data Preprocessing and Feature Engineering

In [None]:
# Data Preprocessing and Feature Engineering

# 1. Check for and handle missing values
# 'TotalCharges' is read as text, so coerce it to numeric FIRST: entries that
# cannot be parsed become NaN and therefore show up in the report below
# (checking before the conversion would not count them).
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

print("Missing values in each column:")
print(df.isnull().sum())

# Missing totals are likely new customers with tenure=0, so treat them as 0 charged so far
df['TotalCharges'] = df['TotalCharges'].fillna(0)

# 2. Convert 'Churn' column to binary (1 = churned)
if df['Churn'].dtype == 'object':
    df['Churn_Binary'] = df['Churn'].map({'Yes': 1, 'No': 0})
else:
    df['Churn_Binary'] = df['Churn']

# 3. Feature Engineering

# Tenure groups.
# Rebuilt unconditionally so the cell is idempotent. include_lowest=True is
# required because pd.cut intervals are right-closed: without it the first bin
# is (0, 12] and customers with tenure 0 get NaN instead of '0-12'.
df['tenure'] = pd.to_numeric(df['tenure'])
df['tenure_group'] = pd.cut(df['tenure'],
                            bins=[0, 12, 24, 36, 48, 60, 72],
                            labels=['0-12', '13-24', '25-36', '37-48', '49-60', '61-72'],
                            include_lowest=True)

# Ratio of monthly charges to total charges (customer value)
df['MonthlyToTotalRatio'] = df['MonthlyCharges'] / (df['TotalCharges'] + 1)  # Add 1 to avoid division by zero

# Service columns used for the count and the binary indicators below
service_cols = ['PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup',
                'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']

# Number of services each customer answered 'Yes' to
df['ServiceCount'] = df[service_cols].eq('Yes').sum(axis=1)

# Internet service indicator (1 unless InternetService is 'No')
df['HasInternetService'] = (df['InternetService'] != 'No').astype(int)

# Interaction feature for contract type and tenure:
# encode the contract ordinally, then multiply by tenure, so higher values
# indicate customers with longer tenure and longer contracts
df['ContractValue'] = df['Contract'].map({'Month-to-month': 0, 'One year': 1, 'Two year': 2})
df['TenureContract'] = df['tenure'] * df['ContractValue']

# Binary (0/1) indicator for each service column
for col in service_cols:
    df[f'{col}_Yes'] = (df[col] == 'Yes').astype(int)

# Print the newly created feature engineered columns
print("\nFeature engineered columns:")
print([col for col in df.columns if col.startswith(('tenure_group', 'MonthlyToTotalRatio', 'ServiceCount', 'HasInternetService', 'ContractValue', 'TenureContract')) or '_Yes' in col])

# 4. Separate features and target variable
y = df['Churn_Binary']

# Text columns that must not be used as model inputs: the raw target, the
# binned tenure label and the per-customer identifier. One-hot encoding an
# identifier creates one column per customer and carries no generalisable signal.
# NOTE(review): assumes the CSV has the standard Telco 'customerID' column;
# the exclusion is a no-op if that column is absent.
NON_FEATURE_TEXT_COLS = ('Churn', 'tenure_group', 'customerID')

# Categorical columns that need encoding
categorical_cols = [col for col in df.select_dtypes(include=['object']).columns
                    if col not in NON_FEATURE_TEXT_COLS]

# Numerical columns for scaling. 'number' matches every numeric width, so the
# engineered integer columns are kept even where astype(int) does not produce int64.
numerical_cols = [col for col in df.select_dtypes(include='number').columns
                  if col != 'Churn_Binary' and col != 'ContractValue']

# Features used for modeling
feature_cols = numerical_cols + categorical_cols

X = df[feature_cols]

print("\nFeatures for modeling:")
print(feature_cols)

# 5. Split the data (stratified so both sets keep the same churn rate)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTraining set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")
print(f"Churn rate in training set: {y_train.mean()*100:.2f}%")
print(f"Churn rate in testing set: {y_test.mean()*100:.2f}%")

## Step 5: Build preprocessing pipeline and baseline models

In [None]:
# Preprocessing pipeline
# (Pipeline, ColumnTransformer, SimpleImputer, StandardScaler and OneHotEncoder
# are imported in the setup cell at the top of the notebook.)

# Numerical columns: fill gaps with the median, then standardise
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories unseen during fit are ignored at transform time
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column list to its transformer
column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

print("Preprocessing pipeline created.")

In [None]:
# Create and evaluate baseline models

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score

# Baseline candidates keyed by display name; insertion order is the evaluation order
models = {}
models['Logistic Regression'] = LogisticRegression(max_iter=1000, random_state=42)
models['Random Forest'] = RandomForestClassifier(n_estimators=100, random_state=42)
models['XGBoost'] = xgb.XGBClassifier(n_estimators=100, random_state=42)

# Function to evaluate a model
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit a classifier behind the notebook's preprocessing step and report test metrics.

    A fresh pipeline is built from unfitted copies of the module-level
    `preprocessor` and of `model`, fitted on the training data, and scored on
    the test data. Metrics, the classification report and the confusion matrix
    are printed, and the ROC and precision-recall curves are plotted side by side.

    Parameters
    ----------
    model : estimator
        Classifier exposing `fit`, `predict` and `predict_proba`. It is cloned,
        so the object passed in is neither fitted nor modified.
    X_train, X_test : feature tables accepted by `preprocessor`.
    y_train, y_test : binary target values for the two sets.

    Returns
    -------
    tuple
        `(pipeline, accuracy, roc_auc, avg_precision)` where `pipeline` is the
        fitted preprocessing + model pipeline.
    """
    # Local import: `clone` is only needed here
    from sklearn.base import clone

    # Clone both steps so every returned pipeline owns its own fitted state.
    # Reusing the single global `preprocessor` instance would make all stored
    # pipelines share one transformer that is re-fitted on every call, and
    # fitting `model` directly would overwrite an estimator the caller may
    # already have fitted.
    pipeline = Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('model', clone(model))
    ])

    # Train the model
    pipeline.fit(X_train, y_train)

    # Hard predictions and positive-class probabilities
    y_pred = pipeline.predict(X_test)
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

    # Threshold-based metrics
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # ROC curve and the area under it
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    # Precision-recall curve and average precision
    precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
    avg_precision = average_precision_score(y_test, y_pred_proba)

    # Display results
    print(f"Accuracy: {accuracy:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")
    print(f"Average Precision: {avg_precision:.4f}")
    print("\nClassification Report:")
    print(report)

    print("\nConfusion Matrix:")
    print(cm)

    # Two panels: ROC curve on the left, precision-recall curve on the right
    plt.figure(figsize=(12, 5))

    # ROC curve, with the diagonal of a random classifier for reference
    plt.subplot(1, 2, 1)
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")

    # Precision-recall curve
    plt.subplot(1, 2, 2)
    plt.plot(recall, precision, color='green', lw=2,
             label=f'Precision-Recall curve (AP = {avg_precision:.2f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall Curve')
    plt.legend(loc="lower left")

    plt.tight_layout()
    plt.show()

    return pipeline, accuracy, roc_auc, avg_precision

# Fit and score every baseline, collecting the fitted pipeline and metrics by model name
results = {}
for name, model in models.items():
    print(f"\n{'-'*50}")
    print(f"Evaluating {name}...")
    pipeline, accuracy, roc_auc, avg_precision = evaluate_model(
        model, X_train, X_test, y_train, y_test
    )
    results[name] = dict(
        pipeline=pipeline,
        accuracy=accuracy,
        roc_auc=roc_auc,
        avg_precision=avg_precision,
    )

# Grouped bar chart comparing the three metrics across models
plt.figure(figsize=(10, 6))

model_names = list(results.keys())
accuracies = [results[name]['accuracy'] for name in model_names]
roc_aucs = [results[name]['roc_auc'] for name in model_names]
avg_precisions = [results[name]['avg_precision'] for name in model_names]

x = range(len(model_names))
width = 0.25

# One bar series per metric: (horizontal offset from the group centre, scores, legend label, colour)
metric_series = [
    (-width, accuracies, 'Accuracy', '#3498db'),
    (0, roc_aucs, 'ROC AUC', '#e74c3c'),
    (width, avg_precisions, 'Average Precision', '#2ecc71'),
]

for offset, scores, legend_label, colour in metric_series:
    plt.bar([i + offset for i in x], scores, width, label=legend_label, color=colour)

plt.xlabel('Model')
plt.ylabel('Score')
plt.title('Model Performance Comparison')
plt.xticks(x, model_names)
plt.ylim([0, 1])
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Value label just above every bar
for offset, scores, _label, _colour in metric_series:
    for i, v in enumerate(scores):
        plt.text(i + offset, v + 0.01, f'{v:.3f}', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

## Step 6: XGBoost Model Optimisation

In [None]:
# XGBoost hyperparameter search
# (GridSearchCV is imported in the setup cell at the top of the notebook.)

print("Optimizing XGBoost model...")

# Preprocessing + XGBoost in one pipeline, so each CV fold fits its own preprocessing
xgb_steps = [
    ('preprocessor', preprocessor),
    ('model', xgb.XGBClassifier(random_state=42)),
]
xgb_pipeline = Pipeline(steps=xgb_steps)

# Full hyperparameter grid. It is defined for reference only:
# the search below runs on the reduced grid.
param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [3, 4, 5],
    'model__learning_rate': [0.01, 0.1],
    'model__subsample': [0.8, 1.0],
    'model__colsample_bytree': [0.8, 1.0],
    'model__min_child_weight': [1, 5],
    'model__gamma': [0, 0.1, 0.2]
}

# Reduced grid for demonstration (faster execution)
param_grid_small = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [3, 5],
    'model__learning_rate': [0.1],
}

# 5-fold cross-validated search scored by ROC AUC, using all available cores
grid_search = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=param_grid_small,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Report the winning configuration and its cross-validated score
print("\nBest parameters:", grid_search.best_params_)
print("Best CV score:", grid_search.best_score_)

# Best pipeline found by the search
best_xgb_model = grid_search.best_estimator_

# Score the tuned XGBoost model on the held-out test set
print("\nEvaluating optimized XGBoost model...")
tuned_model = best_xgb_model.named_steps['model']
_, accuracy, roc_auc, avg_precision = evaluate_model(
    tuned_model, X_train, X_test, y_train, y_test
)

# Record the tuned model next to the baselines; the stored pipeline is the
# grid search's best estimator
results['Optimized XGBoost'] = dict(
    pipeline=best_xgb_model,
    accuracy=accuracy,
    roc_auc=roc_auc,
    avg_precision=avg_precision,
)

# Feature importance of the tuned XGBoost model

# Pull the two fitted steps out of the best pipeline.
# NOTE(review): this rebinds the notebook-level names `preprocessor` and
# `numerical_features` (both assigned by earlier cells) to different objects.
# The names are kept as-is because later cells may read them - confirm before renaming.
best_xgb = best_xgb_model.named_steps['model']
preprocessor = best_xgb_model.named_steps['preprocessor']

# Training features as the model sees them after preprocessing
X_train_transformed = preprocessor.transform(X_train)

# Column lists handed to the 'num' (index 0) and 'cat' (index 1) transformers
numerical_features = preprocessor.transformers_[0][2]
categorical_features = preprocessor.transformers_[1][2]

# Names of the one-hot encoded columns, from the fitted encoder
ohe = preprocessor.transformers_[1][1].named_steps['onehot']
cat_feature_names = ohe.get_feature_names_out(categorical_features)

# Numerical names first, then the one-hot names
feature_names = [*numerical_features, *cat_feature_names]

# Horizontal bar chart of the five largest importances
plt.figure(figsize=(12, 8))
importance = best_xgb.feature_importances_
indices = np.argsort(importance)[-5:]  # ascending sort, so the last five are the top five

bar_positions = range(len(indices))
top_feature_labels = [feature_names[idx] for idx in indices]

plt.barh(bar_positions, importance[indices], color='skyblue')
plt.yticks(bar_positions, top_feature_labels)
plt.xlabel('Feature Importance')
plt.title('Top 5 Most Important Features (XGBoost)')
plt.tight_layout()
plt.show()

## Step 7: Select best model for interactive dashboard

In [None]:
# Pick the entry with the highest test accuracy (the first one wins on ties)
best_model_name, best_entry = max(results.items(), key=lambda item: item[1]['accuracy'])
best_pipeline = best_entry['pipeline']
best_accuracy = best_entry['accuracy']

print(f"\nBest model: {best_model_name} with accuracy: {best_accuracy:.4f}")

# Persist the model for later use in the dashboard
import pickle

# Everything needed to rebuild an input row and predict with it
model_components = dict(
    pipeline=best_pipeline,
    categorical_cols=categorical_cols,
    numerical_cols=numerical_cols,
    feature_cols=feature_cols,
)

# Write the bundle next to the notebook
with open('telco_churn_model.pkl', 'wb') as model_file:
    pickle.dump(model_components, model_file)

print("Model saved as 'telco_churn_model.pkl'")

## Step 8: Build interactive dashboard