In [2]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc
from tensorflow import keras

df = pd.read_csv('telco_processed.csv')
X = df.drop('Churn', axis=1)
y = df['Churn']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

dt_model = joblib.load('decision_tree_model.pkl')
nn_model = keras.models.load_model('neural_network_model.h5')

print("Both models loaded successfully")



Both models loaded successfully


In [3]:
y_pred_dt = dt_model.predict(X_test)
y_pred_proba_dt = dt_model.predict_proba(X_test)[:, 1]

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

y_pred_proba_nn = nn_model.predict(X_test_scaled)
y_pred_nn = (y_pred_proba_nn > 0.5).astype(int).flatten()

ValueError: could not convert string to float: '0-1yr'

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

metrics_comparison = []

for name, y_pred, y_pred_proba in [('Decision Tree', y_pred_dt, y_pred_proba_dt),
                                   ('Neural Network', y_pred_nn, y_pred_proba_nn.flatten())]:
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc_score = auc(*roc_curve(y_test, y_pred_proba)[:2])

    metrics_comparison.append({
        'Model': name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'AUC': auc_score
    })

comparison_df = pd.DataFrame(metrics_comparison)
print("=== MODEL COMPARISON ===")
print(comparison_df.round(4))

In [None]:
plt.figure(figsize=(10, 8))

fpr_dt, tpr_dt, _ = roc_curve(y_test, y_pred_proba_dt)
roc_auc_dt = auc(fpr_dt, tpr_dt)

fpr_nn, tpr_nn, _ = roc_curve(y_test, y_pred_proba_nn)
roc_auc_nn = auc(fpr_nn, tpr_nn)

plt.plot(fpr_dt, tpr_dt, color='blue', lw=2, label=f'Decision Tree (AUC = {roc_auc_dt:.4f})')
plt.plot(fpr_nn, tpr_nn, color='red', lw=2, label=f'Neural Network (AUC = {roc_auc_nn:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison')
plt.legend(loc="lower right")
plt.grid(True)
plt.show()

In [None]:
metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC']
comparison_df_melted = comparison_df.melt(id_vars=['Model'], value_vars=metrics_to_plot,
                                         var_name='Metric', value_name='Score')

plt.figure(figsize=(12, 6))
sns.barplot(data=comparison_df_melted, x='Metric', y='Score', hue='Model')
plt.title('Model Performance Comparison Across Metrics')
plt.ylim(0, 1)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
from sklearn.utils import resample
from scipy import stats

def bootstrap_accuracy(y_true, y_pred, n_bootstraps=1000):
    accuracies = []
    for _ in range(n_bootstraps):
        indices = resample(range(len(y_true)))
        acc = accuracy_score(y_true.iloc[indices], y_pred[indices])
        accuracies.append(acc)
    return np.array(accuracies)

dt_accuracies = bootstrap_accuracy(y_test, y_pred_dt)
nn_accuracies = bootstrap_accuracy(y_test, y_pred_nn)

t_stat, p_value = stats.ttest_rel(dt_accuracies, nn_accuracies)

print("=== STATISTICAL SIGNIFICANCE TEST ===")
print(f"Decision Tree Mean Accuracy: {dt_accuracies.mean():.4f}")
print(f"Neural Network Mean Accuracy: {nn_accuracies.mean():.4f}")
print(f"T-statistic: {t_stat:.4f}")
print(f"P-value: {p_value:.4f}")
print(f"Significant difference (p < 0.05): {p_value < 0.05}")

In [None]:
comparison_results = {
    'metrics_comparison': comparison_df.to_dict(),
    'statistical_test': {
        't_statistic': t_stat,
        'p_value': p_value,
        'significant_difference': p_value < 0.05
    },
    'best_model': comparison_df.loc[comparison_df['Accuracy'].idxmax(), 'Model']
}

print("=== BEST PERFORMING MODEL ===")
print(f"{comparison_results['best_model']} with accuracy: {comparison_df['Accuracy'].max():.4f}")

In [None]:
dt_feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance_dt': dt_model.feature_importances_
}).sort_values('importance_dt', ascending=False)

top_features_combined = dt_feature_importance.head(10)

plt.figure(figsize=(12, 8))
plt.barh(top_features_combined['feature'], top_features_combined['importance_dt'])
plt.xlabel('Feature Importance Score')
plt.title('Top 10 Most Important Features (Decision Tree)')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

print("=== TOP 5 FEATURES AFFECTING CHURN ===")
for i, row in top_features_combined.head().iterrows():
    print(f"{i+1}. {row['feature']}: {row['importance_dt']:.4f}")

In [None]:
errors_dt = y_test != y_pred_dt
errors_nn = y_test != y_pred_nn

error_analysis = pd.DataFrame({
    'Actual': y_test,
    'Predicted_DT': y_pred_dt,
    'Predicted_NN': y_pred_nn,
    'Error_DT': errors_dt,
    'Error_NN': errors_nn
})

print("=== ERROR ANALYSIS ===")
print("Decision Tree Errors:", errors_dt.sum())
print("Neural Network Errors:", errors_nn.sum())
print("Common Errors (both wrong):", (errors_dt & errors_nn).sum())

false_positives_dt = ((y_test == 0) & (y_pred_dt == 1)).sum()
false_negatives_dt = ((y_test == 1) & (y_pred_dt == 0)).sum()

false_positives_nn = ((y_test == 0) & (y_pred_nn == 1)).sum()
false_negatives_nn = ((y_test == 1) & (y_pred_nn == 0)).sum()

print(f"\nDecision Tree - FP: {false_positives_dt}, FN: {false_negatives_dt}")
print(f"Neural Network - FP: {false_positives_nn}, FN: {false_negatives_nn}")

In [None]:
avg_monthly_charge = df['MonthlyCharges'].mean()
customers_at_risk = len(y_test[y_test == 1])

print("=== BUSINESS IMPACT ANALYSIS ===")
print(f"Average Monthly Charge: ${avg_monthly_charge:.2f}")
print(f"Customers at risk in test set: {customers_at_risk}")

correctly_identified_dt = ((y_test == 1) & (y_pred_dt == 1)).sum()
correctly_identified_nn = ((y_test == 1) & (y_pred_nn == 1)).sum()

revenue_saved_dt = correctly_identified_dt * avg_monthly_charge * 6  # 6 months retention
revenue_saved_nn = correctly_identified_nn * avg_monthly_charge * 6

print(f"\nRevenue potentially saved (6 months retention):")
print(f"Decision Tree: ${revenue_saved_dt:.2f}")
print(f"Neural Network: ${revenue_saved_nn:.2f}")

In [None]:
print("=== MODEL STRENGTHS AND WEAKNESSES ===")
print("\nDecision Tree:")
print("✓ Interpretable and explainable")
print("✓ Handles non-linear relationships well")
print("✓ No feature scaling required")
print("✗ Prone to overfitting")
print("✗ Can be unstable with small data changes")

print("\nNeural Network:")
print("✓ Handles complex patterns well")
print("✓ Good generalization with proper regularization")
print("✓ Can learn non-linear relationships")
print("✗ Black box - hard to interpret")
print("✗ Requires feature scaling")
print("✗ Computationally expensive")

In [None]:
print("=== RECOMMENDATIONS ===")
print("1. For interpretability: Use Decision Tree")
print("2. For maximum accuracy: Use Neural Network")
print("3. Key factors to address churn:")
for i, row in top_features_combined.head(3).iterrows():
    print(f"   - Focus on {row['feature']}")
print("4. Implement proactive retention strategies for high-risk customers")
print("5. Regular model retraining with new data")

In [None]:
comprehensive_analysis = {
    'top_features': top_features_combined.head(10).to_dict(),
    'error_analysis': {
        'dt_errors': errors_dt.sum(),
        'nn_errors': errors_nn.sum(),
        'common_errors': (errors_dt & errors_nn).sum()
    },
    'business_impact': {
        'revenue_saved_dt': revenue_saved_dt,
        'revenue_saved_nn': revenue_saved_nn
    },
    'recommendations': [
        "Use Decision Tree for interpretability",
        "Use Neural Network for maximum accuracy",
        f"Focus on {top_features_combined.iloc[0]['feature']} for churn reduction"
    ]
}

print("Comprehensive analysis completed and saved")