In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas_profiling import ProfileReport

ModuleNotFoundError: No module named 'pandas_profiling'

In [11]:
# Load the training split; `df` is the frame the cells below operate on.
df = pd.read_csv(r"../data/raw/splits/train.csv")
# Build an HTML profiling report of the raw data and write it under ../reports/.
# NOTE(review): this needs `ProfileReport` from the imports cell. The NameError
# recorded for this cell follows from that import failing (ModuleNotFoundError),
# so the profiling package must be importable before this cell can run.
profile = ProfileReport(df, title="Customer Churn Data Profiling", explorative=True)
profile.to_file(r"../reports/churn_data_report.html")
# Last expression of the cell: display the first rows.
df.head()

NameError: name 'ProfileReport' is not defined

In [None]:
# Summary of the training frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Number of missing (NaN/None) entries in each column.
df.isna().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# How many rows fall into each Churn class.
df["Churn"].value_counts()

In [None]:
def fix_total_charges(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` whose TotalCharges column is numeric with no NaN.

    Values that cannot be parsed as numbers (e.g. empty or blank strings)
    are coerced to NaN and then replaced with 0. The input frame is not
    modified; callers must use the returned frame.

    Args:
        df: Frame containing a 'TotalCharges' column.

    Returns:
        A new DataFrame identical to ``df`` except that 'TotalCharges'
        is numeric and has its NaN values filled with 0.

    Raises:
        KeyError: If ``df`` has no 'TotalCharges' column.
    """
    # Force convert to numeric, unparseable entries become NaN
    total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')

    # Report how many entries are NaN after coercion (this also counts
    # values that were already missing before the conversion)
    num_missing = total_charges.isnull().sum()
    print(f"⚠️ Found {num_missing} non-numeric values in TotalCharges. Filling with 0.")

    # Fill with 0 (assuming tenure=0 implies no charges yet). `assign`
    # builds a new frame, so the caller's original data stays untouched.
    return df.assign(TotalCharges=total_charges.fillna(0))

# Rebind `df` to the returned frame so the cells below work with a numeric TotalCharges.
df = fix_total_charges(df)

In [None]:
def plot_target_distribution(df, target_col='Churn'):
    """
    Draw a count plot of ``target_col`` and label every bar with its share of rows.

    Args:
        df: DataFrame holding the target column.
        target_col: Name of the column to count (defaults to 'Churn').

    The percentage on each bar is the bar height divided by ``len(df)``.
    The figure is shown immediately; nothing is returned.
    """
    _, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(x=target_col, data=df, palette='viridis', ax=ax)

    # Annotate each bar, centred horizontally and sitting just above its top edge
    n_rows = len(df)
    for bar in ax.patches:
        bar_height = bar.get_height()
        label = f'{100 * bar_height / n_rows:.1f}%'
        center_x = bar.get_x() + bar.get_width() / 2
        ax.annotate(label, (center_x, bar_height), ha='center', va='bottom')

    ax.set_title(f'Distribution of {target_col}')
    plt.show()

# Plot the distribution of the default target column ('Churn').
plot_target_distribution(df)

In [None]:
def analyze_numerical_features(df, numerical_cols, target='Churn'):
    """
    Plot each numerical column against the target, one figure row per column.

    Left panel: histogram with a KDE overlay, coloured by ``target``.
    Right panel: boxplot of the column grouped by ``target``.

    Args:
        df: DataFrame containing ``numerical_cols`` and ``target``.
        numerical_cols: Sequence of column names to plot (at least one).
        target: Name of the column used for hue / grouping (defaults to 'Churn').

    Raises:
        ValueError: If ``numerical_cols`` is empty.

    The figure is shown immediately; nothing is returned.
    """
    n_features = len(numerical_cols)
    if n_features == 0:
        # Fail early with a clear message instead of an opaque matplotlib error
        raise ValueError("numerical_cols must contain at least one column name")

    # squeeze=False keeps `axes` two-dimensional even for a single column;
    # without it plt.subplots(1, 2) returns a 1-D array and axes[i, 0] fails.
    _, axes = plt.subplots(
        n_features, 2, figsize=(14, 5 * n_features), squeeze=False
    )

    for i, col in enumerate(numerical_cols):
        # Plot 1: Distribution (Histogram/KDE)
        sns.histplot(data=df, x=col, hue=target, kde=True, element="step", ax=axes[i, 0])
        axes[i, 0].set_title(f'Distribution of {col} by {target}')

        # Plot 2: Boxplot (To see outliers and median differences)
        sns.boxplot(data=df, x=target, y=col, ax=axes[i, 1])
        axes[i, 1].set_title(f'{col} vs {target}')

    plt.tight_layout()
    plt.show()

# Columns treated as numerical features; each gets a histogram and a boxplot against Churn.
num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
analyze_numerical_features(df, num_cols)

In [None]:
def analyze_categorical_features(df, categorical_cols, target='Churn'):
    """
    Draw one count plot per categorical column, split by ``target``, on a 3-wide grid.

    Args:
        df: DataFrame containing ``categorical_cols`` and ``target``.
        categorical_cols: Sequence of column names, one subplot each.
        target: Name of the column used as hue (defaults to 'Churn').

    The figure is shown immediately; nothing is returned.
    """
    grid_width = 3
    # Ceiling division: enough rows to place every column, three plots per row
    grid_height = -(-len(categorical_cols) // grid_width)

    plt.figure(figsize=(20, 5 * grid_height))

    # Subplot positions are 1-based, so enumerate from 1
    for position, feature in enumerate(categorical_cols, start=1):
        ax = plt.subplot(grid_height, grid_width, position)

        # Bars per category, one colour per target class
        sns.countplot(x=feature, hue=target, data=df, palette='pastel', ax=ax)
        ax.set_title(f'{feature} Distribution')
        plt.xticks(rotation=45)
        ax.legend(title=target, loc='upper right')

    plt.tight_layout()
    plt.show()

# Columns treated as categorical features (the ID column and the numerical
# columns are left out); each one gets its own count plot split by Churn.
cat_cols = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 
    'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 
    'Contract', 'PaperlessBilling', 'PaymentMethod'
]

analyze_categorical_features(df, cat_cols)