In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report
import xgboost as xgb
import shap
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global random state so the synthetic data (and everything
# derived from it) is repeatable from run to run
np.random.seed(42)

# Plot defaults: seaborn-flavoured matplotlib theme, "husl" colour cycle and a
# larger default figure size
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (12, 8)})

print(
    "Libraries imported successfully",
    "Python environment ready for churn prediction analysis",
    sep="\n",
)


In [None]:
def generate_telecom_data(n_customers=10000):
    """
    Generate synthetic telecom customer data with realistic relationships between features.

    All draws come from NumPy's global random state (``np.random``), so call
    ``np.random.seed`` beforehand for reproducible output. Progress and a short
    summary (shape, churn rate) are printed to stdout.

    Parameters:
    n_customers (int): Number of customers to generate

    Returns:
    pd.DataFrame: Synthetic customer dataset with one row per customer and the
        columns customer_id, age, gender, tenure_months, contract_type,
        internet_service, tech_support, streaming_services, payment_method,
        monthly_charges, total_charges and churn (0 = retained, 1 = churned)
    """
    print(f"Generating synthetic data for {n_customers:,} customers...")

    # Independent customer attributes. The order of the draws determines how
    # the global random stream is consumed, so keep it stable.
    data = {
        'customer_id': [f'CUST_{i+1:06d}' for i in range(n_customers)],
        'age': np.random.normal(45, 15, n_customers).astype(int),
        'gender': np.random.choice(['Male', 'Female'], n_customers),
        'tenure_months': np.random.exponential(20, n_customers).astype(int),
        'contract_type': np.random.choice(['month-to-month', 'one-year', 'two-year'],
                                          n_customers, p=[0.5, 0.3, 0.2]),
        'internet_service': np.random.choice(['DSL', 'Fiber', 'None'],
                                             n_customers, p=[0.4, 0.4, 0.2]),
        'tech_support': np.random.choice(['Yes', 'No'], n_customers, p=[0.3, 0.7]),
        'streaming_services': np.random.choice(['Yes', 'No'], n_customers, p=[0.4, 0.6]),
        'payment_method': np.random.choice(['Electronic check', 'Mailed check',
                                            'Bank transfer', 'Credit card'],
                                           n_customers, p=[0.35, 0.2, 0.2, 0.25])
    }

    # Clip age to realistic range
    data['age'] = np.clip(data['age'], 18, 80)

    # Clip tenure to realistic range
    data['tenure_months'] = np.clip(data['tenure_months'], 1, 72)

    # Monthly charge distribution (mean, std) for each internet service type
    charge_params = {'None': (35, 10), 'DSL': (55, 15), 'Fiber': (85, 20)}

    # One scalar draw per customer (same draw order as a plain loop), each
    # clipped to the 20-120 range
    monthly_charges = [
        max(20, min(120, np.random.normal(*charge_params[service])))
        for service in data['internet_service']
    ]
    data['monthly_charges'] = np.round(monthly_charges, 2)

    # Total charges: tenure x monthly charge plus Gaussian noise, floored at 0
    data['total_charges'] = np.round(
        data['tenure_months'] * data['monthly_charges'] +
        np.random.normal(0, 100, n_customers), 2
    )
    data['total_charges'] = np.maximum(data['total_charges'], 0)  # Ensure non-negative

    # Churn probability, computed for all customers at once. The adjustments
    # are applied in a fixed order; np.select picks the first matching
    # condition, which mirrors an if/elif chain.
    contract = data['contract_type']
    tenure = data['tenure_months']
    charges = data['monthly_charges']
    age = data['age']

    prob = np.full(n_customers, 0.15)  # Base churn rate

    # Contract type influence (two-year contracts get no additional churn risk)
    prob += np.select(
        [contract == 'month-to-month', contract == 'one-year'],
        [0.25, 0.1],
        default=0.0,
    )

    # Tenure influence (short tenure raises churn, tenure above 24 months lowers it)
    prob += np.select(
        [tenure < 6, tenure < 12, tenure > 24],
        [0.2, 0.1, -0.1],
        default=0.0,
    )

    # Monthly charges influence (both very high and very low charges raise churn)
    prob += np.select([charges > 90, charges < 30], [0.15, 0.1], default=0.0)

    # Tech support influence (having support reduces churn)
    prob -= np.where(data['tech_support'] == 'Yes', 0.08, 0.0)

    # Payment method influence (electronic check increases churn)
    prob += np.where(data['payment_method'] == 'Electronic check', 0.1, 0.0)

    # Age influence (customers under 25 or over 65 churn more)
    prob += np.where((age < 25) | (age > 65), 0.05, 0.0)

    churn_probs = np.clip(prob, 0.01, 0.8)  # Clip probability

    # Generate actual churn based on probabilities
    data['churn'] = np.random.binomial(1, churn_probs, n_customers)

    # Convert to DataFrame
    df = pd.DataFrame(data)

    print("Data generation complete!")
    print(f"Dataset shape: {df.shape}")
    print(f"Churn rate: {df['churn'].mean():.2%}")

    return df

# Build the working dataset of 10,000 synthetic customers
telecom_data = generate_telecom_data(n_customers=10000)

# Show the first rows under a banner as a quick sanity check
banner = "=" * 50
print(f"\n{banner}")
print("DATASET OVERVIEW")
print(banner)
print(telecom_data.head())


In [None]:
# Data quality assessment: size, dtypes, missing values, numeric summary and
# the value distribution of every categorical column
print("DATA QUALITY ASSESSMENT")
print("=" * 40)
print(f"Dataset shape: {telecom_data.shape}")
print(f"Total records: {len(telecom_data):,}")

print("\nData types:")
print(telecom_data.dtypes)

print("\nMissing values:")
missing_values = telecom_data.isnull().sum()
if missing_values.sum() > 0:
    # Only list the columns that actually contain gaps
    print(missing_values[missing_values > 0])
else:
    print("No missing values found")

print("\nStatistical summary of numerical features:")
print(telecom_data.describe())

print("\nCategorical feature distributions:")
categorical_features = [
    'gender',
    'contract_type',
    'internet_service',
    'tech_support',
    'streaming_services',
    'payment_method',
]

for feature in categorical_features:
    column = telecom_data[feature]
    print(f"\n{feature}:")
    print(column.value_counts())
    print(f"Unique values: {column.nunique()}")


In [None]:
# Preprocess on a copy so the raw dataset is left untouched
df_processed = telecom_data.copy()

print("FEATURE PREPROCESSING")
print("=" * 40)

# Target is the churn flag; customer_id is an identifier, not a feature
y = df_processed['churn']
X = df_processed.drop(columns=['customer_id', 'churn'])

print(f"Features shape: {X.shape}")
print("Target distribution:")
print(y.value_counts(normalize=True))
print()

# Column groups that are handled differently below
numerical_features = ['age', 'tenure_months', 'monthly_charges', 'total_charges']
categorical_features = [
    'gender',
    'contract_type',
    'internet_service',
    'tech_support',
    'streaming_services',
    'payment_method',
]

print(f"Numerical features: {numerical_features}")
print(f"Categorical features: {categorical_features}")
print()

# Integer-encode each categorical column; the fitted encoders are kept so the
# codes can be mapped back to the original labels
X_encoded = X.copy()
label_encoders = {}

print("Encoding categorical features...")
for feature in categorical_features:
    le = LabelEncoder()
    le.fit(X[feature])
    X_encoded[feature] = le.transform(X[feature])
    label_encoders[feature] = le
    print(f"  {feature}: {len(le.classes_)} unique values -> {le.classes_}")

print("\nEncoded feature sample:")
print(X_encoded.head())
print()

# Standardize the numerical columns (zero mean, unit variance); the encoded
# categorical columns are left as integer codes.
# NOTE: the scaler is fitted on every row here, i.e. before any train/test split.
print("Scaling numerical features...")
scaler = StandardScaler()
scaler.fit(X_encoded[numerical_features])
X_scaled = X_encoded.copy()
X_scaled[numerical_features] = scaler.transform(X_encoded[numerical_features])

print("Scaling statistics:")
for feature, mean_val, std_val in zip(numerical_features, scaler.mean_, scaler.scale_):
    print(f"  {feature}: mean={mean_val:.2f}, std={std_val:.2f}")

print("\nScaled feature sample:")
print(X_scaled[numerical_features].head())


In [None]:
# Train-test split with stratification to preserve churn distribution.
# The split is done on the encoded but *unscaled* features so the scaler can
# be fitted on the training rows only. Fitting it on the full dataset would
# leak test-set statistics (mean / std) into training.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Fit the scaler on the training split, then apply the same transform to test
scaler = StandardScaler()
X_train = X_train.copy()
X_test = X_test.copy()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# Rebuild the full scaled matrix with the training-fitted scaler so that later
# whole-dataset predictions see features on the scale the models were trained on
X_scaled = X_encoded.copy()
X_scaled[numerical_features] = scaler.transform(X_encoded[numerical_features])

print("TRAIN-TEST SPLIT RESULTS")
print("="*30)
print(f"Training set size: {X_train.shape[0]:,} samples ({X_train.shape[0]/len(X_scaled):.1%})")
print(f"Test set size: {X_test.shape[0]:,} samples ({X_test.shape[0]/len(X_scaled):.1%})")
print()

print("Churn distribution preservation:")
print("Training set:")
print(y_train.value_counts(normalize=True).sort_index())
print("Test set:")
print(y_test.value_counts(normalize=True).sort_index())
print()

print("Feature columns in final dataset:")
print(list(X_train.columns))


In [None]:
# Baseline churn metrics: overall rate, then rate per segment
print("BASELINE CHURN ANALYSIS")
print("=" * 40)

# Headline numbers for the whole customer base
churn_flags = telecom_data['churn']
overall_churn_rate = churn_flags.mean()
print(f"Overall churn rate: {overall_churn_rate:.2%}")
print(f"Total customers: {len(telecom_data):,}")
print(f"Churned customers: {churn_flags.sum():,}")
print(f"Retained customers: {churn_flags.eq(0).sum():,}")
print()

# Contract type: customer count, churner count and churn rate per group
print("CHURN RATE BY CONTRACT TYPE")
print("-" * 30)
contract_churn = (
    telecom_data.groupby('contract_type')['churn']
    .agg(['count', 'sum', 'mean'])
    .round(3)
    .rename(columns={
        'count': 'Total_Customers',
        'sum': 'Churned_Customers',
        'mean': 'Churn_Rate',
    })
)
contract_churn['Churn_Percentage'] = (contract_churn['Churn_Rate'] * 100).round(1)

print(contract_churn)
print()

# Same breakdown (without the churner count) for further categorical
# features, each table sorted from highest to lowest churn rate
features_to_analyze = ['internet_service', 'tech_support', 'payment_method']

for feature in features_to_analyze:
    print(f"CHURN RATE BY {feature.replace('_', ' ').upper()}")
    print("-" * 40)
    feature_churn = (
        telecom_data.groupby(feature)['churn']
        .agg(['count', 'mean'])
        .round(3)
        .rename(columns={'count': 'Total_Customers', 'mean': 'Churn_Rate'})
    )
    feature_churn['Churn_Percentage'] = (feature_churn['Churn_Rate'] * 100).round(1)
    feature_churn = feature_churn.sort_values(by='Churn_Rate', ascending=False)
    print(feature_churn)
    print()


In [None]:
# Create comprehensive EDA visualizations (2 x 3 grid of churn breakdowns)
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Customer Churn Analysis - Key Patterns', fontsize=16, y=1.02)

# 1. Overall churn distribution
# value_counts() orders by frequency, not by class. Pin the order to class 0
# (retained) then class 1 (churned) so the labels and colours always match the
# wedges, even if churners are the majority.
churn_counts = telecom_data['churn'].value_counts().reindex([0, 1], fill_value=0)
axes[0, 0].pie(churn_counts.values, labels=['Retained', 'Churned'], autopct='%1.1f%%',
               colors=['lightblue', 'salmon'])
axes[0, 0].set_title('Overall Churn Distribution')

# 2. Churn by contract type
contract_churn_pct = telecom_data.groupby('contract_type')['churn'].mean() * 100
contract_churn_pct.plot(kind='bar', ax=axes[0, 1], color='skyblue', rot=45)
axes[0, 1].set_title('Churn Rate by Contract Type')
axes[0, 1].set_ylabel('Churn Rate (%)')
axes[0, 1].grid(axis='y', alpha=0.3)

# 3. Churn by tenure (binned)
# NOTE: this adds a 'tenure_group' column to telecom_data itself, so every
# later copy of telecom_data carries it as well.
telecom_data['tenure_group'] = pd.cut(telecom_data['tenure_months'],
                                     bins=[0, 12, 24, 36, 72],
                                     labels=['0-12', '13-24', '25-36', '37+'])
# observed=False keeps every tenure bin on the axis, including empty ones
tenure_churn = telecom_data.groupby('tenure_group', observed=False)['churn'].mean() * 100
tenure_churn.plot(kind='bar', ax=axes[0, 2], color='lightgreen', rot=0)
axes[0, 2].set_title('Churn Rate by Tenure Group (months)')
axes[0, 2].set_ylabel('Churn Rate (%)')
axes[0, 2].grid(axis='y', alpha=0.3)

# 4. Monthly charges distribution by churn
churned = telecom_data[telecom_data['churn'] == 1]['monthly_charges']
retained = telecom_data[telecom_data['churn'] == 0]['monthly_charges']
axes[1, 0].hist([retained, churned], bins=30, alpha=0.7,
               label=['Retained', 'Churned'], color=['lightblue', 'salmon'])
axes[1, 0].set_title('Monthly Charges Distribution by Churn')
axes[1, 0].set_xlabel('Monthly Charges ($)')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].legend()
axes[1, 0].grid(axis='y', alpha=0.3)

# 5. Churn by internet service
internet_churn = telecom_data.groupby('internet_service')['churn'].mean() * 100
internet_churn.plot(kind='bar', ax=axes[1, 1], color='orange', rot=45)
axes[1, 1].set_title('Churn Rate by Internet Service')
axes[1, 1].set_ylabel('Churn Rate (%)')
axes[1, 1].grid(axis='y', alpha=0.3)

# 6. Churn by payment method
payment_churn = telecom_data.groupby('payment_method')['churn'].mean() * 100
payment_churn.plot(kind='bar', ax=axes[1, 2], color='pink', rot=45)
axes[1, 2].set_title('Churn Rate by Payment Method')
axes[1, 2].set_ylabel('Churn Rate (%)')
axes[1, 2].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

# Additional correlation analysis: correlation of each encoded feature with
# the churn flag. Categorical columns are integer label codes here, so their
# coefficients depend on the (alphabetical) code order.
print("\nCORRELATION ANALYSIS")
print("="*30)
correlation_matrix = X_encoded.corrwith(y).sort_values(ascending=False)
print("Feature correlation with churn:")
print(correlation_matrix.round(3))


In [None]:
# Baseline model: logistic regression on the prepared training split
print("LOGISTIC REGRESSION MODEL")
print("=" * 40)

# Fit the classifier
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train, y_train)

# Hard class predictions plus the predicted probability of churn (class 1)
y_pred_lr = lr_model.predict(X_test)
y_pred_proba_lr = lr_model.predict_proba(X_test)[:, 1]

# Threshold-based metrics on the test split
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_precision = precision_score(y_test, y_pred_lr)
lr_recall = recall_score(y_test, y_pred_lr)
lr_f1 = f1_score(y_test, y_pred_lr)

for metric_name, metric_value in (
    ("Accuracy", lr_accuracy),
    ("Precision", lr_precision),
    ("Recall", lr_recall),
    ("F1-Score", lr_f1),
):
    print(f"{metric_name}: {metric_value:.4f}")
print()

# ROC curve points and the area under the curve
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_pred_proba_lr)
lr_auc = auc(fpr_lr, tpr_lr)
print(f"AUC-ROC: {lr_auc:.4f}")
print()

# Confusion matrix (rows: actual class, columns: predicted class)
cm_lr = confusion_matrix(y_test, y_pred_lr)
print("Confusion Matrix:")
print(cm_lr)
print()

# Per-class precision / recall / F1
print("Detailed Classification Report:")
print(classification_report(y_test, y_pred_lr, target_names=['Retained', 'Churned']))

# Rank features by the magnitude of their fitted coefficient
lr_coefficients = lr_model.coef_[0]
feature_importance_lr = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': lr_coefficients,
    'Abs_Coefficient': np.abs(lr_coefficients),
})
feature_importance_lr = feature_importance_lr.sort_values(by='Abs_Coefficient', ascending=False)

print("Top 10 Most Important Features (Logistic Regression):")
print(feature_importance_lr.head(10))


In [None]:
# XGBoost classifier tuned with an exhaustive grid search
print("XGBOOST MODEL WITH HYPERPARAMETER TUNING")
print("=" * 50)

# Search space: 2 x 3 x 3 x 2 = 36 parameter combinations
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
}

# Base estimator whose hyperparameters are varied by the search
xgb_model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')

# 3-fold cross-validated search, scored by ROC AUC, using all CPU cores
print("Performing hyperparameter tuning...")
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Winning configuration and its mean cross-validated score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best parameters: {best_params}")
print(f"Best cross-validation AUC: {best_score:.4f}")
print()

# Take the best estimator from the search object (no extra training here)
best_xgb_model = grid_search.best_estimator_

# Hard class predictions plus the predicted probability of churn (class 1)
y_pred_xgb = best_xgb_model.predict(X_test)
y_pred_proba_xgb = best_xgb_model.predict_proba(X_test)[:, 1]

# Threshold-based metrics on the test split
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
xgb_precision = precision_score(y_test, y_pred_xgb)
xgb_recall = recall_score(y_test, y_pred_xgb)
xgb_f1 = f1_score(y_test, y_pred_xgb)

print("XGBOOST MODEL RESULTS")
print("=" * 30)
for metric_name, metric_value in (
    ("Accuracy", xgb_accuracy),
    ("Precision", xgb_precision),
    ("Recall", xgb_recall),
    ("F1-Score", xgb_f1),
):
    print(f"{metric_name}: {metric_value:.4f}")
print()

# ROC curve points and the area under the curve
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_pred_proba_xgb)
xgb_auc = auc(fpr_xgb, tpr_xgb)
print(f"AUC-ROC: {xgb_auc:.4f}")
print()

# Confusion matrix (rows: actual class, columns: predicted class)
cm_xgb = confusion_matrix(y_test, y_pred_xgb)
print("Confusion Matrix:")
print(cm_xgb)
print()

# Per-class precision / recall / F1
print("Detailed Classification Report:")
print(classification_report(y_test, y_pred_xgb, target_names=['Retained', 'Churned']))

# Relative AUC gain of XGBoost over the logistic regression baseline, in percent
auc_improvement = (xgb_auc - lr_auc) / lr_auc * 100
print("\nModel Improvement:")
print(f"Logistic Regression AUC: {lr_auc:.4f}")
print(f"XGBoost AUC: {xgb_auc:.4f}")
print(f"AUC Improvement: {auc_improvement:.1f}%")


In [None]:
# ROC curves of both models on one set of axes
plt.figure(figsize=(10, 8))

# One line per model, labelled with its AUC
roc_curves = (
    (fpr_lr, tpr_lr, f'Logistic Regression (AUC = {lr_auc:.3f})', 'blue'),
    (fpr_xgb, tpr_xgb, f'XGBoost (AUC = {xgb_auc:.3f})', 'red'),
)
for curve_fpr, curve_tpr, curve_label, curve_color in roc_curves:
    plt.plot(curve_fpr, curve_tpr, label=curve_label, linewidth=2, color=curve_color)

# Diagonal reference line: the ROC curve of a random classifier
plt.plot([0, 1], [0, 1], 'k--', alpha=0.5, label='Random Classifier (AUC = 0.500)')

# Axes, labels and legend
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('ROC Curve Comparison - Churn Prediction Models', fontsize=14)
plt.legend(loc="lower right", fontsize=12)
plt.grid(True, alpha=0.3)
plt.show()

# Side-by-side metric table for the two models
print("MODEL COMPARISON SUMMARY")
print("=" * 40)

comparison_df = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC-ROC'],
    'Logistic Regression': [lr_accuracy, lr_precision, lr_recall, lr_f1, lr_auc],
    'XGBoost': [xgb_accuracy, xgb_precision, xgb_recall, xgb_f1, xgb_auc],
})
comparison_df = comparison_df.round(4)

# Relative change of each (rounded) metric versus logistic regression, in percent
lr_scores = comparison_df['Logistic Regression']
xgb_scores = comparison_df['XGBoost']
comparison_df['Improvement'] = ((xgb_scores - lr_scores) / lr_scores * 100).round(1)

print(comparison_df.to_string(index=False))
print(f"\nAverage improvement: {comparison_df['Improvement'].mean():.1f}%")


In [None]:
# Feature importance: XGBoost's built-in scores, SHAP values, and a comparison
print("FEATURE IMPORTANCE ANALYSIS")
print("=" * 40)

# Importance scores reported by the tuned XGBoost model, highest first
xgb_feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': best_xgb_model.feature_importances_,
})
xgb_feature_importance = xgb_feature_importance.sort_values(by='Importance', ascending=False)

print("Top 10 Most Important Features (XGBoost):")
print(xgb_feature_importance.head(10))
print()

# Horizontal bar chart of the ten highest scores, largest at the top
top_features = xgb_feature_importance.head(10)
bar_positions = range(len(top_features))
plt.figure(figsize=(12, 8))
plt.barh(bar_positions, top_features['Importance'], color='skyblue')
plt.yticks(bar_positions, top_features['Feature'])
plt.xlabel('Feature Importance')
plt.title('Top 10 Feature Importance (XGBoost)')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

# SHAP Analysis
print("SHAP ANALYSIS")
print("=" * 20)
print("Computing SHAP values for model interpretability...")

# Explain only the first 1000 test rows to keep the computation cheap
shap_sample = X_test[:1000]
explainer = shap.TreeExplainer(best_xgb_model)
shap_values = explainer.shap_values(shap_sample)

# SHAP summary plot; show=False so a title can be added before rendering
print("Generating SHAP summary plot...")
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values, shap_sample, feature_names=X_train.columns, show=False)
plt.title('SHAP Feature Importance Summary')
plt.tight_layout()
plt.show()

# Global SHAP importance: mean absolute SHAP value of each feature
shap_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'SHAP_Importance': np.mean(np.abs(shap_values), axis=0),
})
shap_importance = shap_importance.sort_values(by='SHAP_Importance', ascending=False)

print("Top 10 Features by SHAP Importance:")
print(shap_importance.head(10))

# Side-by-side comparison of the two importance measures
plt.figure(figsize=(14, 8))

# Rescale each measure so that its largest value is 1
xgb_scores = xgb_feature_importance.set_index('Feature')['Importance']
shap_scores = shap_importance.set_index('Feature')['SHAP_Importance']
xgb_norm = xgb_scores / xgb_scores.max()
shap_norm = shap_scores / shap_scores.max()

# Features shown: the ten with the highest XGBoost importance
top_10_features = xgb_feature_importance.head(10)['Feature'].tolist()

# Grouped bars: XGBoost on the left of each tick, SHAP on the right
x = np.arange(len(top_10_features))
width = 0.35

plt.bar(x - width/2, xgb_norm.loc[top_10_features].to_numpy(), width,
        label='XGBoost Importance', alpha=0.8, color='lightblue')
plt.bar(x + width/2, shap_norm.loc[top_10_features].to_numpy(), width,
        label='SHAP Importance', alpha=0.8, color='lightcoral')

plt.xlabel('Features')
plt.ylabel('Normalized Importance')
plt.title('Feature Importance Comparison: XGBoost vs SHAP')
plt.xticks(x, top_10_features, rotation=45)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Business Impact Analysis: simulate a retention campaign aimed at the 10% of
# test customers with the highest predicted churn probability
print("BUSINESS IMPACT ANALYSIS")
print("="*40)

# Get test set with predictions
test_results = X_test.copy()
test_results['actual_churn'] = y_test
test_results['churn_probability'] = y_pred_proba_xgb
test_results['predicted_churn'] = y_pred_xgb

# Add original customer data for revenue calculations.
# X_test holds standardized charges, so overwrite the column with the raw
# dollar amounts looked up by the test rows' index labels.
test_indices = y_test.index
test_results['monthly_charges'] = telecom_data.loc[test_indices, 'monthly_charges'].values
test_results['customer_id'] = telecom_data.loc[test_indices, 'customer_id'].values

# Identify top 10% high-risk customers
top_10_percent = int(len(test_results) * 0.1)
high_risk_customers = test_results.nlargest(top_10_percent, 'churn_probability')

print(f"Total test customers: {len(test_results):,}")
print(f"Top 10% high-risk customers: {len(high_risk_customers):,}")
print(f"High-risk threshold probability: {high_risk_customers['churn_probability'].min():.3f}")
print()

# Calculate current metrics for high-risk group
high_risk_actual_churn = high_risk_customers['actual_churn'].sum()
high_risk_churn_rate = high_risk_customers['actual_churn'].mean()
high_risk_monthly_revenue = high_risk_customers['monthly_charges'].sum()

print("HIGH-RISK CUSTOMER ANALYSIS")
print("-" * 30)
print(f"Actual churners in high-risk group: {high_risk_actual_churn:,}")
print(f"Churn rate in high-risk group: {high_risk_churn_rate:.1%}")
print(f"Monthly revenue from high-risk group: ${high_risk_monthly_revenue:,.2f}")
print()

# Business impact assumptions
retention_success_rate = 0.40  # Assume 40% of targeted customers can be retained
avg_customer_lifetime_months = 24  # Average customer lifetime
intervention_cost_per_customer = 50  # Cost to intervene per customer

# Calculate potential revenue saved (only true churners can be "saved")
churners_in_high_risk = high_risk_customers[high_risk_customers['actual_churn'] == 1]
potential_saves = len(churners_in_high_risk) * retention_success_rate
monthly_revenue_saved = churners_in_high_risk['monthly_charges'].sum() * retention_success_rate
lifetime_revenue_saved = monthly_revenue_saved * avg_customer_lifetime_months

# Calculate intervention costs (every targeted customer costs money, churner or not)
total_intervention_cost = len(high_risk_customers) * intervention_cost_per_customer

# Calculate net benefit. This is a LIFETIME figure: revenue retained over
# avg_customer_lifetime_months minus the one-off intervention cost.
net_benefit = lifetime_revenue_saved - total_intervention_cost

print("BUSINESS IMPACT SIMULATION")
print("-" * 35)
print(f"Retention success rate assumption: {retention_success_rate:.0%}")
print(f"Average customer lifetime: {avg_customer_lifetime_months} months")
print(f"Intervention cost per customer: ${intervention_cost_per_customer}")
print()
print(f"Potential customers saved: {potential_saves:.0f}")
print(f"Monthly revenue saved: ${monthly_revenue_saved:,.2f}")
print(f"Lifetime revenue saved: ${lifetime_revenue_saved:,.2f}")
print(f"Total intervention cost: ${total_intervention_cost:,.2f}")
print(f"Net benefit: ${net_benefit:,.2f}")
if total_intervention_cost > 0:
    print(f"ROI: {(net_benefit / total_intervention_cost) * 100:.1f}%")
else:
    # Nobody was targeted (test set too small), so ROI is undefined
    print("ROI: n/a (no customers targeted)")
print()

# Scale to full customer base (derived from the data instead of hard-coded)
full_customer_base = len(telecom_data)
scaling_factor = full_customer_base / len(test_results)
scaled_net_benefit = net_benefit * scaling_factor

# scaled_net_benefit covers the whole assumed customer lifetime, so convert it
# to a per-month figure before quoting monthly and annual amounts
scaled_monthly_net_benefit = scaled_net_benefit / avg_customer_lifetime_months
scaled_annual_net_benefit = scaled_monthly_net_benefit * 12

print("SCALED BUSINESS IMPACT (Full Customer Base)")
print("-" * 45)
print(f"Estimated lifetime net benefit ({avg_customer_lifetime_months} months): ${scaled_net_benefit:,.2f}")
print(f"Estimated annual net benefit: ${scaled_annual_net_benefit:,.2f}")
print(f"Estimated monthly net benefit: ${scaled_monthly_net_benefit:,.2f}")

# Create visualization of business impact
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Churn probability distribution
ax1.hist(test_results['churn_probability'], bins=30, alpha=0.7, color='skyblue', edgecolor='black')
ax1.axvline(high_risk_customers['churn_probability'].min(), color='red', linestyle='--',
           label=f'Top 10% Threshold ({high_risk_customers["churn_probability"].min():.3f})')
ax1.set_xlabel('Churn Probability')
ax1.set_ylabel('Number of Customers')
ax1.set_title('Churn Probability Distribution')
ax1.legend()
ax1.grid(axis='y', alpha=0.3)

# Revenue impact breakdown. All four bars are monthly amounts so they share
# one scale; the net benefit is therefore spread over the assumed lifetime.
categories = ['Current\nMonthly Revenue', 'Revenue\nAt Risk', 'Potential\nSavings', 'Net Benefit\n(Per Month)']
values = [high_risk_monthly_revenue,
          churners_in_high_risk['monthly_charges'].sum(),
          monthly_revenue_saved,
          net_benefit / avg_customer_lifetime_months]

colors = ['lightblue', 'salmon', 'lightgreen', 'gold']
bars = ax2.bar(categories, values, color=colors, alpha=0.8)
ax2.set_ylabel('Revenue ($)')
ax2.set_title('Revenue Impact Analysis - High-Risk Customers')
ax2.grid(axis='y', alpha=0.3)

# Add value labels on bars: above positive bars, below negative ones
for bar, value in zip(bars, values):
    height = bar.get_height()
    ax2.text(bar.get_x() + bar.get_width()/2., height + height*0.01,
             f'${value:,.0f}', ha='center',
             va='bottom' if height >= 0 else 'top')

plt.tight_layout()
plt.show()


In [None]:
# Create comprehensive results dataset for export
print("CREATING EXPORT DATASET")
print("="*30)

# Generate predictions for the entire dataset.
# NOTE: X_scaled contains the training rows too, so the scores for those rows
# are in-sample and will look better than the held-out test metrics.
X_all_scaled = X_scaled
y_pred_all = best_xgb_model.predict(X_all_scaled)
y_pred_proba_all = best_xgb_model.predict_proba(X_all_scaled)[:, 1]

# Create export dataset
export_data = telecom_data.copy()
export_data['churn_probability'] = y_pred_proba_all
export_data['predicted_churn'] = y_pred_all
# include_lowest=True puts a probability of exactly 0.0 into 'Low'; without it
# the first bin is (0, 0.3] and such a row would get a missing risk level
export_data['risk_level'] = pd.cut(y_pred_proba_all,
                                  bins=[0, 0.3, 0.7, 1.0],
                                  labels=['Low', 'Medium', 'High'],
                                  include_lowest=True)

# Add model confidence: the further the probability is from 0.5, the higher
# the confidence band (High: <0.2 or >0.8, Medium: <0.35 or >0.65, else Low)
export_data['model_confidence'] = np.where(
    (y_pred_proba_all < 0.2) | (y_pred_proba_all > 0.8), 'High',
    np.where((y_pred_proba_all < 0.35) | (y_pred_proba_all > 0.65), 'Medium', 'Low')
)

# Add business metrics
ltv_horizon_months = 24  # same horizon as the 24-month customer lifetime assumption
# Expected revenue lost over the horizon: monthly charge x months x churn probability
export_data['ltv_at_risk'] = (
    export_data['monthly_charges'] * ltv_horizon_months * export_data['churn_probability']
)
# Rank 1 = highest churn probability; equal probabilities share a rank
export_data['intervention_priority'] = export_data['churn_probability'].rank(ascending=False, method='dense')

print(f"Export dataset shape: {export_data.shape}")
print(f"Columns in export dataset: {list(export_data.columns)}")
print()

# Summary statistics for export
print("EXPORT DATASET SUMMARY")
print("-" * 25)
print("Risk Level Distribution:")
print(export_data['risk_level'].value_counts())
print()
print("Model Confidence Distribution:")
print(export_data['model_confidence'].value_counts())
print()

# Export to CSV (written to the current working directory)
export_filename = 'telecom_churn_predictions.csv'
export_data.to_csv(export_filename, index=False)
print(f"Dataset exported to: {export_filename}")
print(f"File size: {len(export_data)} rows, {len(export_data.columns)} columns")
print()

# Create summary report
summary_stats = {
    'total_customers': len(export_data),
    'overall_churn_rate': export_data['churn'].mean(),
    'avg_churn_probability': export_data['churn_probability'].mean(),
    'high_risk_customers': (export_data['risk_level'] == 'High').sum(),
    'high_confidence_predictions': (export_data['model_confidence'] == 'High').sum(),
    'total_monthly_revenue': export_data['monthly_charges'].sum(),
    'revenue_at_risk': export_data['ltv_at_risk'].sum(),
    'model_accuracy': xgb_accuracy,
    'model_auc': xgb_auc
}

print("FINAL SUMMARY REPORT")
print("="*30)
# Formatting is chosen from the key name: ratio-like floats print as
# percentages, other floats as dollar amounts, everything else as a count
ratio_keywords = ('rate', 'probability', 'accuracy', 'auc')
for key, value in summary_stats.items():
    label = key.replace('_', ' ').title()
    if isinstance(value, float):
        if any(keyword in key for keyword in ratio_keywords):
            print(f"{label}: {value:.1%}")
        else:
            print(f"{label}: ${value:,.2f}")
    else:
        print(f"{label}: {value:,}")

print()
print("DASHBOARD INTEGRATION READY")
print("The exported CSV file contains all necessary fields for:")
print("- Customer risk segmentation")
print("- Intervention prioritization")
print("- Revenue impact analysis")
print("- Model performance monitoring")

# Display sample of high-risk customers for verification
print("\nSAMPLE HIGH-RISK CUSTOMERS:")
high_risk_sample = export_data[export_data['risk_level'] == 'High'][
    ['customer_id', 'churn_probability', 'monthly_charges', 'contract_type',
     'tenure_months', 'ltv_at_risk']].head(10)
print(high_risk_sample.to_string(index=False))
