# In [None]:
import os
import json
from pathlib import Path
import warnings
from typing import Dict, Any, Optional, Tuple

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve

# Module-wide settings shared by the analysis steps below.
RANDOM_STATE: int = 42    # seed handed to the train/test split and the classifier
TEST_SIZE: float = 0.2    # fraction of rows held out as the test set
PLOT_CONFIG: Dict[str, Any] = {
    "colors": {
        "primary": "#2ecc71",    # green: retained customers
        "secondary": "#e74c3c",  # red: churned customers
        "neutral": "#3498db",    # blue: general purpose
    },
    "template": "plotly_white",
    "dimensions": {
        "width": 1000,
        "height": 600,
    },
}

class ChurnAnalyzer:
    """
    A comprehensive analyzer for bank customer churn data.

    The CSV file is loaded and validated on construction. Each analysis
    method returns a dictionary of results and also stores it in
    ``self.insights``:

    * ``analyze_demographics``   -> ``insights['demographics']``
    * ``analyze_geography``      -> ``insights['geography']``
    * ``build_predictive_model`` -> ``insights['model']``

    ``save_insights`` writes the accumulated dictionary to a JSON file.

    Attributes:
        file_path: Path of the CSV file given to the constructor.
        df: DataFrame read from ``file_path``.
        insights: Results collected so far, keyed by analysis name.
    """

    # Columns the input CSV must contain.
    REQUIRED_COLUMNS = frozenset({
        'CustomerId', 'CreditScore', 'Geography', 'Gender',
        'Age', 'Tenure', 'Balance', 'NumOfProducts',
        'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited'
    })

    # Model inputs, split by the preprocessing each group receives.
    NUMERIC_FEATURES = (
        'CreditScore', 'Age', 'Tenure', 'Balance',
        'NumOfProducts', 'HasCrCard', 'IsActiveMember'
    )
    CATEGORICAL_FEATURES = ('Geography', 'Gender')

    def __init__(self, file_path: str):
        """Initialize the analyzer with data from file.

        Args:
            file_path: Path to the churn CSV file.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the file is not a ``.csv``, lacks required
                columns, or contains no data rows.
        """
        self.file_path = file_path
        self.df: Optional[pd.DataFrame] = None
        self.insights: Dict[str, Any] = {}

        # Load and validate data
        self._load_data()

    def _load_data(self) -> None:
        """Load and validate the bank churn data into ``self.df``.

        Any failure is reported on stdout and then re-raised.

        Raises:
            FileNotFoundError: If ``self.file_path`` does not exist.
            ValueError: If the suffix is not ``.csv``, required columns are
                missing, or the file has no data rows.
        """
        try:
            # File validation
            file = Path(self.file_path)
            if not file.exists():
                raise FileNotFoundError(f"File not found at: {self.file_path}")
            if file.suffix.lower() != '.csv':
                raise ValueError(f"File must be CSV, got: {file.suffix}")

            # Load data
            print("📂 Loading data...")
            df = pd.read_csv(file)

            # Validate columns
            missing_cols = self.REQUIRED_COLUMNS - set(df.columns)
            if missing_cols:
                raise ValueError(f"Missing columns: {missing_cols}")

            # A header-only file would otherwise yield NaN statistics and
            # fail much later, inside model training.
            if df.empty:
                raise ValueError(f"No records found in: {self.file_path}")

            # Only publish the frame once it has passed every check.
            self.df = df
            print(f"✅ Loaded {len(self.df):,} records")

        except Exception as e:
            print(f"❌ Error loading data: {str(e)}")
            raise

    def analyze_demographics(self) -> Dict[str, Any]:
        """Analyze customer demographics.

        Computes the customer count, the overall churn rate (mean of
        ``Exited`` as a percentage), age and balance summary statistics, and
        the relative frequency of each ``Geography`` and ``Gender`` value.
        Also displays an age histogram split by ``Exited``.

        Returns:
            The demographics dictionary, also stored under
            ``self.insights['demographics']``.
        """
        print("\n🔍 Analyzing Demographics...")

        age = self.df['Age']
        balance = self.df['Balance']

        demographics = {
            'total_customers': len(self.df),
            'churn_rate': self.df['Exited'].mean() * 100,
            'age_stats': {
                'mean': age.mean(),
                'median': age.median(),
                'std': age.std()
            },
            'balance_stats': {
                'mean': balance.mean(),
                'median': balance.median(),
                'zero_balance_pct': (balance == 0).mean() * 100
            },
            'geographic_dist': self.df['Geography'].value_counts(normalize=True).to_dict(),
            'gender_dist': self.df['Gender'].value_counts(normalize=True).to_dict()
        }

        # Visualize age distribution
        fig = px.histogram(
            self.df,
            x='Age',
            color='Exited',
            marginal='box',
            nbins=25,
            title='Age Distribution by Churn Status',
            color_discrete_map={
                0: PLOT_CONFIG['colors']['primary'],
                1: PLOT_CONFIG['colors']['secondary']
            },
            template=PLOT_CONFIG['template']
        )
        fig.show()

        self.insights['demographics'] = demographics
        return demographics

    def analyze_geography(self) -> Dict[str, Any]:
        """Analyze geographic patterns.

        Computes, per ``Geography`` value, the mean of ``Exited`` (churn rate
        as a fraction), the mean ``Balance`` and the mean ``CreditScore``.
        Also displays a bar chart of the churn rate per country in percent.

        Returns:
            The geography dictionary, also stored under
            ``self.insights['geography']``.
        """
        print("\n🌍 Analyzing Geographic Patterns...")

        # Group once and reuse for all three aggregates.
        by_country = self.df.groupby('Geography')

        geo_insights = {
            'churn_by_country': by_country['Exited'].mean().to_dict(),
            'avg_balance': by_country['Balance'].mean().to_dict(),
            'avg_credit_score': by_country['CreditScore'].mean().to_dict()
        }

        # Visualize geographic churn rates (as percentages)
        churn_by_country = geo_insights['churn_by_country']
        geo_data = pd.DataFrame({
            'Country': list(churn_by_country.keys()),
            'Churn_Rate': [rate * 100 for rate in churn_by_country.values()]
        })

        fig = px.bar(
            geo_data,
            x='Country',
            y='Churn_Rate',
            title='Churn Rate by Country',
            color='Churn_Rate',
            text='Churn_Rate',
            template=PLOT_CONFIG['template']
        )
        fig.show()

        self.insights['geography'] = geo_insights
        return geo_insights

    def build_predictive_model(self) -> Dict[str, Any]:
        """Build and evaluate a predictive model for churn.

        Splits the data with ``TEST_SIZE`` / ``RANDOM_STATE``, fits a pipeline
        of ``StandardScaler`` (numeric columns) + ``OneHotEncoder``
        (categorical columns) + ``RandomForestClassifier``, and evaluates it
        on the held-out rows. Also displays a confusion-matrix heatmap.

        Returns:
            A dictionary, also stored under ``self.insights['model']``, with:

            * ``roc_auc``: ROC AUC computed from the predicted probabilities.
            * ``classification_report``: the report as a nested dictionary.
            * ``feature_importance``: importance per *transformed* column.
              Numeric columns keep their original name; each one-hot column
              is named by the encoder (for example ``Geography_<value>``).
        """
        print("\n🤖 Building Predictive Model...")

        # Prepare features and target
        numeric_features = list(self.NUMERIC_FEATURES)
        categorical_features = list(self.CATEGORICAL_FEATURES)
        X = self.df[numeric_features + categorical_features]
        y = self.df['Exited']

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
        )

        # Create preprocessing pipeline
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numeric_features),
                ('cat', OneHotEncoder(drop='first', sparse_output=False),
                 categorical_features)
            ],
            # Keep plain column names ("Age", not "num__Age") in the output.
            verbose_feature_names_out=False
        )

        # Create and train model
        model = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', RandomForestClassifier(random_state=RANDOM_STATE))
        ])

        model.fit(X_train, y_train)

        # Evaluate model
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:, 1]

        # The forest is trained on the transformed matrix, so it reports one
        # importance per transformed column. Names must therefore come from
        # the fitted preprocessor; pairing them with the raw input column
        # list would mislabel the one-hot columns and drop trailing values.
        transformed_names = model.named_steps['preprocessor'].get_feature_names_out()
        importances = model.named_steps['classifier'].feature_importances_

        model_insights = {
            'roc_auc': float(roc_auc_score(y_test, y_pred_proba)),
            'classification_report': classification_report(y_test, y_pred,
                                                           output_dict=True),
            'feature_importance': {
                str(name): float(score)
                for name, score in zip(transformed_names, importances)
            }
        }

        # Visualize confusion matrix
        plt.figure(figsize=(8, 6))
        sns.heatmap(
            confusion_matrix(y_test, y_pred),
            annot=True,
            fmt='d',
            cmap='Blues'
        )
        plt.title('Confusion Matrix')
        plt.show()

        self.insights['model'] = model_insights
        return model_insights

    @staticmethod
    def _json_default(obj: Any) -> Any:
        """Convert NumPy scalars and arrays to plain Python values for ``json``.

        Raises:
            TypeError: If ``obj`` is not a NumPy scalar or array.
        """
        if isinstance(obj, np.generic):
            return obj.item()
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        raise TypeError(
            f"Object of type {type(obj).__name__} is not JSON serializable"
        )

    def save_insights(self, output_path: str) -> None:
        """Save analysis insights to JSON file.

        Saving is best-effort: serialisation and I/O errors are reported on
        stdout rather than raised.

        Args:
            output_path: Destination path of the JSON file.
        """
        try:
            # Serialise completely before opening the file so a failure
            # cannot leave a truncated, unparseable file behind.
            payload = json.dumps(
                self.insights, indent=4, default=self._json_default
            )
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(payload)
            print(f"\n✅ Insights saved to {output_path}")
        except (OSError, TypeError, ValueError) as e:
            print(f"❌ Error saving insights: {str(e)}")

def main():
    """Run every analysis step in order and write the results to disk.

    Loads ``Bank_Churn.csv``, runs the demographic, geographic and modelling
    steps, then saves the collected insights to
    ``churn_analysis_results.json``. Any exception raised along the way is
    reported on stdout instead of propagating.
    """
    try:
        analyzer = ChurnAnalyzer("Bank_Churn.csv")

        # Analyses run in this fixed order; each stores its own insights.
        pipeline = (
            analyzer.analyze_demographics,
            analyzer.analyze_geography,
            analyzer.build_predictive_model,
        )
        for run_step in pipeline:
            run_step()

        # Persist everything gathered above.
        analyzer.save_insights("churn_analysis_results.json")

    except Exception as exc:
        print(f"❌ Analysis failed: {exc}")


if __name__ == "__main__":
    main()

# :