In [10]:
# Importing useful libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Tuple

# Configure visualization style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)


def load_data(filepath: str) -> pd.DataFrame:
    """ Load the Telco Customer Churn dataset form a CSV file.

    Args:
        filepath (str): Path to the CSV file

    Returns:
        pd.DataFrame: Loaded dataframe
    """
    df = pd.read_csv(filepath)
    return df


def checking_missing_values(df: pd.DataFrame) -> pd.Series:
    """Check for missing values in the dataframe.

    Args:
        df (pd.DataFrame): Input dataframe.

    Returns:
        pd.Series: Number of missing values per column.
    """
    missing = df.isnull().sum()
    return missing[missing > 0]


def data_overview(df: pd.DataFrame) -> None:
    """Print general information about the dataset

    Args:
        df (pd.DataFrame): Input dataframe;
    """
    print("----- Dataset Overview -----")
    print(f"Shape: {df.shape}")
    print("\nData types:")
    print(df.dtypes)
    print("\nSample data:")
    print(df.head())
    print("\nSummaty statistics:")
    print(df.describe(include="all"))


def plot_churn_distribution(df: pd.DataFrame) -> None:
    """Plot the distribution of churned vs non-churned customers.

    Args:
        df (pd.DataFrame): Input dataframe.
    """
    sns.countplot(x='churn', data=df, palette='Set2')
    plt.title("Customer Churn Distribution")
    plt.xlabel("Churn")
    plt.ylabel("Number of Customers")
    plt.show()


def plot_numeric_feature_distribution(df: pd.DataFrame, feature: str, hue: str = "churn") -> None:
    """Plt distribution of a numeric feature group by churn.

    Args:
        df (pd.DataFrame): Input dataframe.
        feature (str): Numeric feature to plot
        hue (str, optional): Categorical column to group (by Defaults to "churn").
    """
    sns.histplot(data=df, x=feature, hue=hue, multiple="stack", kde=True)
    plt.title(f'Distrution of {feature} grouped by {hue}')
    plt.show()


def plot_categorical_feature_distribution(df: pd.DataFrame, feature: str, hue: str = 'Churn') -> None:
    """Plot distribution of a categorical feature grouped by churn.

    Args:
        df (pd.DataFrame): Input dataframe
        feature (str): Categorical feature to plot
        hue (str, optional): Categorical column. (Defaults to 'Churn').
    """
    sns.countplot(x=feature, hue=hue, data=df, palette="Set1")
    plt.title(f'Distribution of {feature} grouped by {hue}')
    plt.xticks(rotation=45)
    plt.show()

In [11]:
# Main Execution
if __name__ == "__main__":
    data_path: str = "../data/raw/Telco-Customer-Churn.csv"

    # Load data
    df: pd.DataFrame = load_data(data_path)
 
    # Initial overview
    data_overview(df)

    # Check for missing values
    missing_vals: pd.Series = checking_missing_values(df)
    if missing_vals.empty:
        print("\nMissing values:")
        print(missing_vals)

    # Plot numeric features distribution
    numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
    for feature in numeric_features:
        # TotalCharge has missing spaces that might convert it to object, so fix it
        if feature == "TotalCharges" and df[feature].dtype == 'object':
            df[feature] = pd.to_numberic(df[feature], errors='coerce')
        plot_numeric_feature_distribution(df, feature)

    # Plot some categorical features
    categorical_features = ['Contract', 'PaymentMethod', 'InternetSErvice']
    for feature in categorical_features:
        plot_categorical_feature_distribution(df, feature)
    

----- Dataset Overview -----
Shape: (7043, 21)

Data types:
customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

Sample data:


   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

ValueError: Could not interpret value `churn` for `hue`. An entry with this name does not appear in `data`.