# Telecom Customer Churn Prediction - Exploratory Data Analysis

## Project Overview
This notebook performs comprehensive exploratory data analysis (EDA) on telecom customer churn data.

### Objectives:
1. Load and explore the dataset
2. Analyze data distributions
3. Identify patterns and correlations
4. Visualize key insights
5. Summarize data quality and key findings to guide preprocessing and modeling

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

# Set visualization style: apply seaborn's 'whitegrid' theme to the plots below
sns.set_style('whitegrid')
# Default figure size as (width, height); cells that pass their own figsize=
# to plt.subplots / plt.figure override this value
plt.rcParams['figure.figsize'] = (12, 6)

# Confirmation that the import/setup cell ran to completion
print("Libraries imported successfully!")

## 1. Load Dataset

Load the telecom churn dataset from the data folder.

In [None]:
# Load data - update data_path to point at your dataset.
# Supported formats: CSV (.csv) and Excel (.xlsx / .xls), for example:
#   data_path = '../data/telecom_churn.csv'
#   data_path = '../data/telecom_churn.xlsx'
data_path = '../data/telecom_churn.csv'  # Change this to your file path


def load_dataset(path):
    """Read the dataset at *path* into a DataFrame.

    The reader is chosen from the file extension (matched case-insensitively):
    ``.csv`` uses ``pd.read_csv`` and ``.xlsx`` / ``.xls`` use ``pd.read_excel``.

    Returns the loaded DataFrame, or ``None`` (after printing a warning) when
    the file does not exist or the extension is not supported. Returning
    ``None`` keeps the ``if df is not None`` guards in the later cells working
    instead of failing with a NameError on an unassigned ``df``.
    """
    extension_probe = path.lower()
    try:
        if extension_probe.endswith('.csv'):
            loaded = pd.read_csv(path)
        elif extension_probe.endswith(('.xlsx', '.xls')):
            loaded = pd.read_excel(path)
        else:
            print(f"⚠ Unsupported file type: {path!r}. Use a .csv, .xlsx or .xls file.")
            return None
    except FileNotFoundError:
        print("⚠ File not found. Please update the data_path variable with your file location.")
        return None

    print("✓ Data loaded successfully!")
    print(f"Dataset shape: {loaded.shape}")
    return loaded


# df is None whenever loading failed; every later cell checks for that.
df = load_dataset(data_path)

## 2. Initial Data Exploration

In [None]:
# Preview the first five records (skipped when the dataset failed to load)
if df is not None:
    print("First 5 rows of the dataset:")
    display(df.head(5))

In [None]:
# Dataset information: dimensions, column names and per-column dtypes
if df is not None:
    row_count, column_count = df.shape

    print("Dataset Information:")
    print(f"\nNumber of rows: {row_count}")
    print(f"Number of columns: {column_count}")
    print(f"\nColumn names: {list(df.columns)}")
    print("\nData types:")
    print(df.dtypes)

In [None]:
# Descriptive statistics as produced by DataFrame.describe()
if df is not None:
    print("Statistical Summary:")
    summary_stats = df.describe()
    display(summary_stats)

In [None]:
# Check for missing values
if df is not None:
    print("Missing Values Analysis:")

    # Per-column count of nulls and the share of rows they represent
    missing = df.isnull().sum()
    missing_percent = (missing / len(df)) * 100
    missing_df = pd.DataFrame({
        'Missing_Count': missing,
        'Percentage': missing_percent
    })

    # Only columns that actually contain nulls are worth reporting
    columns_with_missing = missing_df[missing_df['Missing_Count'] > 0]

    if columns_with_missing.empty:
        # Report the clean result directly instead of printing an
        # "Empty DataFrame" dump first
        print("\n✓ No missing values found!")
    else:
        print(columns_with_missing)

In [None]:
# Count fully duplicated rows and, when any exist, report their share
if df is not None:
    duplicate_flags = df.duplicated()
    duplicates = duplicate_flags.sum()
    print(f"Number of duplicate rows: {duplicates}")

    if duplicates > 0:
        duplicate_share = (duplicates / len(df)) * 100
        print(f"Percentage of duplicates: {duplicate_share:.2f}%")

## 3. Target Variable Analysis

Analyze the distribution of the churn variable (target).

In [None]:
# Target variable distribution
if df is not None:
    # Update 'Churn' with your actual target column name
    target_col = 'Churn'  # Change this if your target column has a different name

    if target_col in df.columns:
        fig, axes = plt.subplots(1, 2, figsize=(14, 5))

        # Count plot. value_counts() orders classes from most to least
        # frequent, so the first color goes to the largest class.
        churn_counts = df[target_col].value_counts()
        axes[0].bar(churn_counts.index.astype(str), churn_counts.values,
                   color=['#2ecc71', '#e74c3c'])
        axes[0].set_title(f'{target_col} Distribution', fontsize=14, fontweight='bold')
        axes[0].set_xlabel(target_col)
        axes[0].set_ylabel('Count')
        axes[0].grid(axis='y', alpha=0.3)
        # Extra headroom so the labels above the tallest bar stay inside the axes
        axes[0].margins(y=0.1)

        # Add value labels. The gap above each bar is given in points, not
        # data units, so the labels sit just above the bars whatever the
        # dataset size (a fixed data-unit offset does not scale).
        for i, v in enumerate(churn_counts.values):
            axes[0].annotate(str(v), xy=(i, v), xytext=(0, 3),
                             textcoords='offset points',
                             ha='center', va='bottom', fontweight='bold')

        # Pie chart of the same counts
        axes[1].pie(churn_counts.values, labels=churn_counts.index, autopct='%1.1f%%',
                   colors=['#2ecc71', '#e74c3c'], startangle=90)
        axes[1].set_title(f'{target_col} Percentage', fontsize=14, fontweight='bold')

        plt.tight_layout()
        plt.show()

        # Text version of the distribution; percentages are relative to all rows
        print(f"\n{target_col} Distribution:")
        for label, count in churn_counts.items():
            percentage = (count / len(df)) * 100
            print(f"  {label}: {count} ({percentage:.2f}%)")
    else:
        print(f"⚠ Column '{target_col}' not found in dataset")

## 4. Numerical Features Analysis

In [None]:
# Identify numerical columns
if df is not None:
    # 'number' matches every numeric dtype (any integer/float width, including
    # pandas' nullable numeric dtypes), not just int64/float64, so columns
    # stored as e.g. int32 or float32 are no longer dropped from the analysis.
    numerical_cols = df.select_dtypes(include='number').columns.tolist()
    print(f"Numerical columns ({len(numerical_cols)}):")
    for col in numerical_cols:
        print(f"  - {col}")

In [None]:
# Histogram grid: one panel per numerical feature, three panels per row
if df is not None and numerical_cols:
    n_cols = 3
    n_rows = -(-len(numerical_cols) // n_cols)  # ceiling division

    # squeeze=False always returns a 2-D grid, so a single ravel() flattens
    # every layout (one row or many) into the same 1-D sequence of axes
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    axes = axes.ravel()

    # Missing values are dropped before binning
    for ax, col in zip(axes, numerical_cols):
        ax.hist(df[col].dropna(), bins=30, color='skyblue', edgecolor='black', alpha=0.7)
        ax.set_title(f'Distribution of {col}', fontweight='bold')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')
        ax.grid(axis='y', alpha=0.3)

    # Switch off the unused panels at the end of the grid
    for ax in axes[len(numerical_cols):]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

## 5. Categorical Features Analysis

In [None]:
# Collect the object-dtype columns and report each one's cardinality
if df is not None:
    object_frame = df.select_dtypes(include=['object'])
    categorical_cols = object_frame.columns.tolist()

    print(f"Categorical columns ({len(categorical_cols)}):")
    for col in categorical_cols:
        distinct_values = df[col].nunique()
        print(f"  - {col}: {distinct_values} unique values")

In [None]:
# Bar-chart grid: value counts of each categorical feature, two panels per row
if df is not None and categorical_cols:
    # The target is plotted in its own section, so leave it out here
    cat_cols_to_plot = [name for name in categorical_cols if name != target_col]

    if cat_cols_to_plot:
        n_cols = 2
        n_rows = -(-len(cat_cols_to_plot) // n_cols)  # ceiling division

        # squeeze=False always returns a 2-D grid; ravel() flattens it
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
        axes = axes.ravel()

        for ax, col in zip(axes, cat_cols_to_plot):
            level_counts = df[col].value_counts()
            level_counts.plot(kind='bar', ax=ax, color='skyblue')
            ax.set_title(f'Distribution of {col}', fontweight='bold')
            ax.set_xlabel(col)
            ax.set_ylabel('Count')
            ax.grid(axis='y', alpha=0.3)
            # Slanted tick labels keep long category names readable
            plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, ha='right')

        # Switch off the unused panels at the end of the grid
        for ax in axes[len(cat_cols_to_plot):]:
            ax.axis('off')

        plt.tight_layout()
        plt.show()

## 6. Correlation Analysis

In [None]:
# Heatmap of the pairwise correlations between numerical features
if df is not None and numerical_cols:
    plt.figure(figsize=(12, 10))
    corr_matrix = df[numerical_cols].corr()

    # Diverging palette centred on zero; each cell is annotated to 2 decimals
    heatmap_options = {
        'annot': True,
        'fmt': '.2f',
        'cmap': 'coolwarm',
        'center': 0,
        'square': True,
        'linewidths': 1,
        'cbar_kws': {"shrink": 0.8},
    }
    sns.heatmap(corr_matrix, **heatmap_options)

    plt.title('Correlation Matrix', fontsize=16, fontweight='bold', pad=20)
    plt.tight_layout()
    plt.show()

## 7. Churn Analysis by Features

Analyze how different features relate to customer churn.

In [None]:
# Grouped bar charts: target class counts within each categorical level
if df is not None and target_col in df.columns:
    cat_cols_to_analyze = [name for name in categorical_cols if name != target_col]

    if cat_cols_to_analyze:
        n_cols = 2
        n_rows = -(-len(cat_cols_to_analyze) // n_cols)  # ceiling division

        # squeeze=False always returns a 2-D grid; ravel() flattens it
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
        axes = axes.ravel()

        for ax, col in zip(axes, cat_cols_to_analyze):
            # Rows: levels of the feature; columns: target classes; cells: row counts
            pair_sizes = df.groupby([col, target_col]).size()
            churn_data = pair_sizes.unstack(fill_value=0)
            churn_data.plot(kind='bar', ax=ax, color=['#2ecc71', '#e74c3c'])
            ax.set_title(f'{col} vs {target_col}', fontweight='bold')
            ax.set_xlabel(col)
            ax.set_ylabel('Count')
            ax.legend(title=target_col)
            ax.grid(axis='y', alpha=0.3)
            plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, ha='right')

        # Switch off the unused panels at the end of the grid
        for ax in axes[len(cat_cols_to_analyze):]:
            ax.axis('off')

        plt.tight_layout()
        plt.show()

In [None]:
# Box plots: spread of each numerical feature split by target class
if df is not None and target_col in df.columns and numerical_cols:
    # A numerically encoded target must not be plotted against itself
    num_cols_to_plot = [name for name in numerical_cols if name != target_col]

    if num_cols_to_plot:
        n_cols = 3
        n_rows = -(-len(num_cols_to_plot) // n_cols)  # ceiling division

        # squeeze=False always returns a 2-D grid; ravel() flattens it
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
        axes = axes.ravel()

        for ax, col in zip(axes, num_cols_to_plot):
            df.boxplot(column=col, by=target_col, ax=ax)
            ax.set_title(f'{col} by {target_col}', fontweight='bold')
            ax.set_xlabel(target_col)
            ax.set_ylabel(col)

        # Switch off the unused panels at the end of the grid
        for ax in axes[len(num_cols_to_plot):]:
            ax.axis('off')

        # Blank out the figure-level title so only the per-panel titles remain
        plt.suptitle('')
        plt.tight_layout()
        plt.show()

## 8. Key Insights Summary

Document key findings from the EDA.

In [None]:
# Text recap of dataset size, target balance and data-quality checks
if df is not None:
    rule = "=" * 70
    print(rule)
    print(" " * 20 + "KEY INSIGHTS SUMMARY")
    print(rule)

    print("\n1. Dataset Overview:")
    print(f"   - Total customers: {len(df):,}")
    print(f"   - Total features: {df.shape[1]}")
    print(f"   - Numerical features: {len(numerical_cols)}")
    print(f"   - Categorical features: {len(categorical_cols)}")

    if target_col in df.columns:
        print("\n2. Churn Distribution:")
        churn_counts = df[target_col].value_counts()
        for label, count in churn_counts.items():
            # Share of all rows that fall in this class
            percentage = (count / len(df)) * 100
            print(f"   - {label}: {count:,} ({percentage:.2f}%)")

    print("\n3. Data Quality:")
    missing_total = df.isnull().sum().sum()
    print(f"   - Missing values: {missing_total}")
    print(f"   - Duplicate rows: {df.duplicated().sum()}")

    print("\n" + rule)
    print("\n✓ EDA completed! Ready for preprocessing and modeling.")

## Next Steps

1. **Data Preprocessing**: Clean and prepare data for modeling
2. **Feature Engineering**: Create new features if needed
3. **Model Building**: Train multiple ML models
4. **Model Evaluation**: Compare model performance
5. **Deployment**: Create Streamlit app for predictions