# Customer Churn Prediction

This project will be about developing a ML model that predicts the customer churn of a company. I will use the Telco Customer Churn dataset. 

## Library imports.

In [26]:
# Main libraries for data manipulation.
import pandas as pd
import numpy as np          


## Dataset load.

In [4]:
df = pd.read_csv('../data/TelcoCustomerChurn.csv')
print(f'Dataframe created succesfully! \nIt has {df.shape[0]} observartions and {df.shape[1]} features.')
# Displaying the first 5 rows.
df.head()  

Dataframe created succesfully! 
It has 7043 observartions and 21 features.


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


## Preprocessing

I will start by looking for some information about the dataset, such as null values, outliers, features... Based on that info i will make changes or not.

### First Inspection

First, i want to know the basic structure of the dataset.

In [6]:
def basic_info(df):
    """Displaying basic info about the dataset"""
    print("\n BASIC INFO ABOUT THE DATASET")
    print("-" * 40)
    print(f"Dimensions: {df.shape[0]} rows x {df.shape[1]} columns")
    print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
    
    print("\n  DATA TYPES:")
    print(df.dtypes.value_counts())
    
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    
    print(f'\n COLUMNS:')
    print(f'Object columns: {categorical_cols}')
    print(f'Numerical columns: {numeric_cols}')
    
    print("\n FIRST 5 ROWS:")
    return df.head()

In [7]:
basic_info(df)


 BASIC INFO ABOUT THE DATASET
----------------------------------------
Dimensions: 7043 rows x 21 columns
Memory usage: 6.82 MB

  DATA TYPES:
object     18
int64       2
float64     1
Name: count, dtype: int64

 COLUMNS:
Object columns: ['customerID', 'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'TotalCharges', 'Churn']
Numerical columns: ['SeniorCitizen', 'tenure', 'MonthlyCharges']

 FIRST 5 ROWS:


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### Missing data detection & manipulation

I will continue by checking if there are any missing data on the dataset.

In [8]:
def analyze_missing_data(df):
    """Detailed analysis about null values"""
    print("\n MISSING DATA ANALYSIS")
    print("-" * 40)
    
    # Searching for missing data, count & percentage.
    missing_data = df.isnull().sum()
    missing_percent = (missing_data / len(df)) * 100
    
    # Creating a df with the missing data info.
    missing_df = pd.DataFrame({
        'Column': missing_data.index,
        'Missing_Values': missing_data.values,
        'Percentage': missing_percent.values
    })
    
    # Droping non missing values & sorting values.
    missing_df = missing_df[missing_df['Missing_Values'] > 0].sort_values('Missing_Values', ascending=False)
    
    # Showing the missing data, if it exists.
    if len(missing_df) > 0:
        print(missing_df)
    else:
        print("There are not missing values on the dataset!")
    
    return missing_df

In [9]:
missing_analysis = analyze_missing_data(df)


 MISSING DATA ANALYSIS
----------------------------------------
There are not missing values on the dataset!


### Detecting common data issues.

Identifying common data issues of datasets, like string spaces, duplicates, columns with only one value, impossible values...

In [10]:
def detect_data_issues(df):
    """Detects common data on the dataset."""
    print("\n COMMON ISSUES DETECTING")
    print("-" * 40)
    
    issues = []
    
    # Detecting ' ' values that should be NaN.
    for col in df.select_dtypes(include=['object']).columns:
        if df[col].eq(' ').any():
            count = df[col].eq(' ').sum()
            issues.append(f"'{col}': {count} values are empty spaces.")
            print(f"{col}: {count} values are empty spaces.")
    
    # Detecting duplicates.
    duplicates = df.duplicated().sum()
    if duplicates > 0:
        issues.append(f"Duplicated observations (rows): {duplicates}")
        print(f"{duplicates} rows are duplicated.")
    
    # Detecting columns with only one value.
    for col in df.columns:
        if df[col].nunique() == 1:
            issues.append(f"'{col}': only has an unique value.")
            print(f"{col}: only has an unique value.")
    
    if not issues:
        print("No common issues were detected on the dataset!")
    
    return issues

In [11]:
data_issues = detect_data_issues(df)


 COMMON ISSUES DETECTING
----------------------------------------
TotalCharges: 11 values are empty spaces.


### Data Cleansing

With the issues detected, I will do a little preprocess to take out some small problems.

In [12]:
def clean_data(df):
    """ Cleansing & preprocessing the dataset."""
    print(f"\n DATA CLEANING")
    print(f"-"*40)
    
    # Creating a copy of the df to preserve the original one.
    df_clean = df.copy()
    
    # Converting empty spaces to NaN.
    df_clean = df_clean.replace(' ', np.nan)
    df_clean = df_clean.replace('', np.nan)
    
    # Converting some object types to number.
    if 'TotalCharges' in df_clean.columns:
        df_clean['TotalCharges'] = pd.to_numeric(df_clean['TotalCharges'], errors='coerce')
        nan_count = df_clean['TotalCharges'].isna().sum()
        if nan_count > 0:
            print(f'TotalCharges: {nan_count} values converted to NaN.')
        print(f' {'TotalCharges'} converted to number!')
    
    # Handling other numerical columns that may have problems.
    for col in df_clean.columns:
        if df_clean[col].dtype == 'object':
            # Trying to convert to numerical.
            numeric_series = pd.to_numeric(df_clean[col], errors='coerce')
            # If the conversion was succesful to the mayority of values, then check the NaN.
            if numeric_series.notna().sum() > len(df_clean) * 0.5:
                nan_before = df_clean[col].isna().sum()
                df_clean[col] = numeric_series
                nan_after = df_clean[col].isna().sum()
                if nan_after > nan_before:
                    print(f"⚠️  {col}: {nan_after - nan_before} additional values converted to NaN.")
    
    # Removing duplicates values.
    initial_rows = len(df_clean)
    df_clean = df_clean.drop_duplicates()
    removed_duplicates = initial_rows - len(df_clean)
    if removed_duplicates > 0:
        print(f"{removed_duplicates} duplicated values have been removed!.")
    
    # Checking if there are binary categorical columns.
    binary_cols = []
    for col in df_clean.select_dtypes(include=['object']).columns:
        if df_clean[col].nunique() == 2:
            binary_cols.append(col)
    
    # If there are binary cols, changing to numerical.
    if binary_cols:
        print(f"Binary variables detected: {binary_cols}")
        print('Converting the binary colums to numerical...')
        for col in binary_cols:
            df_clean[col] = df_clean[col].map({df_clean[col].unique()[0] : 0,
                                               df_clean[col].unique()[1] : 1})
            print(f'{col} feature converted to numerical succesfully!')
        print(f'\n All binary columns have been converted succesfully!')
        
    # Showing a resume of NaN values after cleansing.
    total_nans = df_clean.isna().sum().sum()
    if total_nans > 0:
        print(f'Total NaN values after cleansing: {total_nans}')
    else:
        print(f'No NaN values exist after the cleansing!')
        
    return df_clean

In [13]:
df_clean = clean_data(df)


 DATA CLEANING
----------------------------------------
TotalCharges: 11 values converted to NaN.
 TotalCharges converted to number!
Binary variables detected: ['gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
Converting the binary colums to numerical...
gender feature converted to numerical succesfully!
Partner feature converted to numerical succesfully!
Dependents feature converted to numerical succesfully!
PhoneService feature converted to numerical succesfully!
PaperlessBilling feature converted to numerical succesfully!
Churn feature converted to numerical succesfully!

 All binary columns have been converted succesfully!
Total NaN values after cleansing: 11


I know that we have NaN values, I'll handle it further on.

### Univariate Analysis

Looking for a general descriptive analysis.

In [14]:
def univariate_analysis(df):
    """General Univariate Analysis"""
    print(f'\n UNIVARIATE ANALYSIS')
    print(f"-" * 40)
    
    # Selecting the features by their type, as it was done before.
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    
    print(f'Number of Numerical features: {len(numeric_cols)}')
    print(f'Numerical features: {numeric_cols}')
    print(f'Number of Categorical features: {len(categorical_cols)}')
    print(f'Categorical features: {categorical_cols}')
    
    # Mostramos estadísticas descriptivas.
    if numeric_cols:
        print("\n DESCRIPTIVE STATS - NUMERICAL FEATURES:")
        print(df[numeric_cols].describe().round(2))
    
    # Frecuencias para variables categóricas.
    print(f"\n CATEGORICAL FEATURES DISTRIBUTION:")
    for col in categorical_cols[:5]: # Un número corto para no saturar.
        print(f"\n{col}")
        print(df[col].value_counts())
    return numeric_cols, categorical_cols

In [15]:
numeric_cols, categorical_cols = univariate_analysis(df_clean)


 UNIVARIATE ANALYSIS
----------------------------------------
Number of Numerical features: 10
Numerical features: ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'PaperlessBilling', 'MonthlyCharges', 'TotalCharges', 'Churn']
Number of Categorical features: 11
Categorical features: ['customerID', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod']

 DESCRIPTIVE STATS - NUMERICAL FEATURES:
       gender  SeniorCitizen  Partner  Dependents   tenure  PhoneService  \
count  7043.0        7043.00  7043.00     7043.00  7043.00        7043.0   
mean      0.5           0.16     0.52        0.30    32.37           0.9   
std       0.5           0.37     0.50        0.46    24.56           0.3   
min       0.0           0.00     0.00        0.00     0.00           0.0   
25%       0.0           0.00     0.00        0.00     9.00           1.0   
50% 

Some distributions are interesting. It will be useful later on.

### Target feature analysis

Doing a short research about the target, the feature "Churn".

In [16]:
def analyze_target_variable(df, target_col='Churn'):
    """Detailed analysis of the target."""
    print(f"\n TARGET ANALYSIS: {target_col}")
    print("-" * 40)
    
    # Checking that it exists.
    if target_col not in df.columns:
        print(f"Columna {target_col} no encontrada.")
        return None
    
    # Target distribution, count and percentage.
    target_counts = df[target_col].value_counts()
    target_props = df[target_col].value_counts(normalize=True) * 100
    
    print(f"{target_col} distribution:")
    for value, count in target_counts.items():
        prop = target_props[value]
        print(f" {value} : {count} ({prop:.1f}$)")
    
    # Detecting unbalance.
    minority_clase_prop = target_props.min()
    if minority_clase_prop < 30:
        print(f"Unbalanced dataset: Minority class = {minority_clase_prop:.1f}%")
    else:
        print(f"Dataset relatively balanced!")
    
    return target_counts, target_props


In [17]:
# In this case, we use the original dataset to see the original values.
target_analysis = analyze_target_variable(df)


 TARGET ANALYSIS: Churn
----------------------------------------
Churn distribution:
 No : 5174 (73.5$)
 Yes : 1869 (26.5$)
Unbalanced dataset: Minority class = 26.5%


Now I know that it exists a data unbalancing issue, that we will have to work on it further on.

### Correlation Analysis

Learning if there is a correlation between the independent features and the target.

In [18]:
def correlation_analysis(df, target_col='Churn'):
    """Correlation analysis"""
    print(f'\n CORRELATION ANALYSIS')
    print(f'-'*40)
    
    # Preparing the data for the correlation.
    df_corr = df.copy()
        
    # Correlation only between numerical variables.
    numeric_df = df_corr.select_dtypes(include=[np.number])
    
    # If there are numerical columns, continue.
    if len(numeric_df.columns) > 1:
        correlations = numeric_df.corr()
        
        # Strong correlations with the target (if it exists and its numerical).
        if target_col in correlations.columns:
            target_corr = correlations[target_col].drop(target_col).sort_values(key=abs, ascending=False)
            print(f"\n Correlaciones más fuertes con {target_col}:")
            print(target_corr.head(10).round(3))
        
        return correlations 
    
    return None 

In [19]:
correlations = correlation_analysis(df_clean)


 CORRELATION ANALYSIS
----------------------------------------

 Correlaciones más fuertes con Churn:
tenure             -0.352
TotalCharges       -0.199
MonthlyCharges      0.193
PaperlessBilling   -0.192
Dependents         -0.164
SeniorCitizen       0.151
Partner             0.150
PhoneService        0.012
gender             -0.009
Name: Churn, dtype: float64


### Outlier detection

Looking for non normal data on the dataset by using interquartilical range (IQR).

In [20]:
def detect_outliers(df, numeric_cols):
    """Detects outliers by using IQR."""
    print(f"\n OUTLIER DETECTION")
    print("-" * 40)
    
    outlier_summary = {}
    
    
    for col in numeric_cols:
        # Veryfing that the column exists and its data is valid.
        if col not in df.columns:
            continue
        # Removing NaN for the outlier calc.
        col_data = df[col].dropna()
        if len(col_data) == 0:
            print(f'{col}: There are no valid data to analyze')
            continue
        
        # Setting up Quantiles & limits.
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        
        # Avoiding division by zero.
        if IQR == 0:
            print(f'{col}: It does not have variability (IQR=0)')
            outlier_summary[col] = {
                'count': 0,
                'percentage': 0.0,
                'lower_bound': Q1,
                'upper_bound':Q3
            }
            continue
        lower_bound = Q1 - 1.5 * IQR 
        upper_bound = Q3 + 1.5 * IQR
        
        # Detecting outliers data in a safe way.
        outlier_mask = (col_data < lower_bound) | (col_data > upper_bound)
        outlier_count = outlier_mask.sum()
        outlier_percent = (outlier_count / len(col_data)) * 100
        
        outlier_summary[col] = {
            'count' : outlier_count,
            'percentaje' : outlier_percent,
            'lower_bound' : lower_bound,
            'upper_bound' : upper_bound
        }
        
        # Displaying the info.
        if outlier_count > 0:
            print(f"{col} : {outlier_count} outliers ({outlier_percent:.2f}%)")
        else:
            print(f'{col}: No outliers were detected!')
    
    return outlier_summary

In [21]:
outliers = detect_outliers(df_clean, numeric_cols)


 OUTLIER DETECTION
----------------------------------------
gender: No outliers were detected!
SeniorCitizen: It does not have variability (IQR=0)
Partner: No outliers were detected!
Dependents: No outliers were detected!
tenure: No outliers were detected!
PhoneService: It does not have variability (IQR=0)
PaperlessBilling: No outliers were detected!
MonthlyCharges: No outliers were detected!
TotalCharges: No outliers were detected!
Churn: No outliers were detected!


Looks like there aren't outliers on the dataset.

### Final Summary

The next function will show a short summary of the function developed before.

In [22]:
def summary(df, df_clean, missing_analysis, data_issues):
    """Resumen ejecutivo del análisis"""
    print(f'\n SUMMARY')
    print('-' * 40)
    
    # Dataset info.
    print(f'DATASET')
    print(f'    * Dimensions: {df.shape[0]} rows x {df.shape[1]} columns.')
    print(f'    * Numerical features: {len(df_clean.select_dtypes(include=[np.number]).columns)}')
    print(f'    * Categorical features: {len(df_clean.select_dtypes(include=['object']).columns)}')
    
    # Data quality.
    if len(missing_analysis) == 0:
        print(f'    + Empty values non existent!')
    else:
        print(f'    - {len(missing_analysis)} columns with empty values.')
        
    if len(data_issues) == 0:
        print(f"    + No issues were detected!")
    else:
        print(f'    - {len(data_issues)} issue was detected.')
    
    # Distribution of the target
    if 'Churn' in df_clean.columns:
        churn_rate = (df_clean['Churn'] == 'Yes').mean() * 100
        print(f'\n TARGET:')
        print(f'    * Churn rate: {churn_rate:.1f}%')
    
    if churn_rate < 30:
        print(f'    - Dataset unbalanced! - Consider applying balance techniques (SMOTE, Sinthetic data...)')
    else:
        print(f'    + Dataset balanced!')

In [23]:
summary(df, df_clean, missing_analysis, data_issues)


 SUMMARY
----------------------------------------
DATASET
    * Dimensions: 7043 rows x 21 columns.
    * Numerical features: 10
    * Categorical features: 11
    + Empty values non existent!
    - 1 issue was detected.

 TARGET:
    * Churn rate: 0.0%
    - Dataset unbalanced! - Consider applying balance techniques (SMOTE, Sinthetic data...)


### Saving dataframe

In [24]:
def save_cleaned_data(df_clean, filename='../data/telco_churn_cleaned.csv'):
    """Saving the clean dataset."""
    df_clean.to_csv(filename, index=False)
    print(f'\n Dataset saved succesfully! It was saved on:\n {filename}')
    return filename

In [25]:
save_cleaned_data(df_clean)


 Dataset saved succesfully! It was saved on:
 ../data/telco_churn_cleaned.csv


'../data/telco_churn_cleaned.csv'

Once the EDA (Exploratory Data Analysis) is done and I have all the information needed to make decisions, all the problems detected before will be fixed.

## EDA & Visualizations

As I said before, I'm doing the Exploratory Data Analysis to make some changes on the issues found on the dataset.

In [27]:
# Importing data visualization libraries.
import matplotlib.pyplot as plt 
import seaborn as sns  
from matplotlib.patches import Rectangle
import warnings
warnings.filterwarnings('ignore')

In [30]:
# Visual configuration.
plt.style.use('default')
sns.set_palette('husl')

# Colors configuration, they were taken from internet.
COLORS = {
    'primary': '#2E86AB',      # Corporative Blue
    'secondary': '#A23B72',    # Pink
    'accent': '#F18F01',       # Orange
    'success': '#C73E1D',      # Red
    'background': '#F5F5F5',   # Light gray
    'churn_yes': '#E74C3C',    # Red (Churn)
    'churn_no': '#27AE60'      # Green (No Churn)
}

# Global configuration
plt.rcParams.update({
    'figure.figsize': (15,8),
    'font.size': 11,
    'axes.titlesize': 14,
    'axes.labelsize':12,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 10,
    'figure.titlesize': 16
})

print(f'Configuration set succesfully!')

Configuration set succesfully!


### Overview

First, I'll start by showing some key metrics that will display important data.

In [39]:
def overview_dashboard(df):
    """Overview dashboard with key metrics."""
    print(f'Creating overview dashboard...')
    
    # Characteristics of the plot.
    fig = plt.figure(figsize=(20,12))
    gs = fig.add_gridspec(3,4, hspace=0.3, wspace=0.3)
    
    # Main title.
    fig.suptitle('CUSTOMER CHURN - OVERVIEW DASHBOARD - EXECUTIVE DASHBOARD',
                 fontsize=24, fontweight='bold', y=0.95)
    
    # 1. KPIs. Cards display.
    
    # Setting up KPIs.
    churn_rate = [df['Churn']=='Yes'].mean()*100
    avg_month_ch = df['MonthlyCharges'].mean()
    avg_tenure = df['tenure'].mean()
    
    # Exceptions
    '''
    (df['Churn']== 'Yes').mean()*100:.1f%
    df['MonthlyCharges'].mean():.0f%
    df['tenure'].mean():.1f%
    '''
    # Colors with KPIS.
    kpis = [
        ('Total Customers', len(df), '#2E86AB'),
        ('Churn Rate', f'{churn_rate}%', '#E74C3C'),
        ('Average Monthly Charges', f"${avg_month_ch}",'#F18F01'),
        ('Average Tenure', f'{avg_tenure} months', '#27AE60')
    ]
    
    # Displaying the KPIs.
    for i, (title, value, color) in enumerate(kpis):
        ax = fig.add_subplot(gs[0, i])
        ax.text(0.5, 0.7, str(value), ha='center', va='center',
                fontsize=32, fontweight='bold', color=color)
        ax.text(0.5, 0.3, title, ha='center', va='center',
                fontsize=14, color='gray')
        ax.set_xlim(0,1)
        ax.set_ylim(0,1)
        ax.axis('off')
        
        # Adding border.
        rect = Rectangle((0.05, 0.05), 0.9, 0.9, linewidth=2, edgecolor=color, facecolor='none')
        ax.add_patch(rect)
    
    # 2. Churn Distribution. Pie graph.
    ax1 = fig.add_subplot(gs[1, :2])
    churn_counts = df['Churn'].value_counts()
    colors = [COLORS['churn_no'], COLORS['churn_yes']]
    wedges, text, autotexts = ax1.pie(churn_counts.values, labels=churn_counts.index, colors = colors, autopct='%1.1f%%',startangle=90, textprops={'fontsize': 12})
    ax1.set_title('Customer Churn Distribution', fontsize=16, fontweight='bold', pad=20)
    
    # 3. Churn by Contract type. Bar graph.
    ax2 = fig.add_subplot(gs[2, :2])
    contract_churn = pd.crosstab(df['Contract'], df['Churn'], normalize='index') * 100
    contract_churn.plot(kind='bar', ax=ax2, color=[COLORS['churn_no'], COLORS['churn_yes']])
    ax2.set_title('Churn Rate by Contract Type', fontsize=16, fontweight='bold')
    ax2.set_ylabel('Churn Rate %')
    ax2.set_xlabel('Contract Type')
    ax2.legend(['No Churn','Churn'])
    ax2.tick_params(axis='x', rotation=45)
    
    # 4. Monthly Charges distribution. Histogram.
    ax3 = fig.add_subplot(gs[2,:2])
    for churn in ['No', 'Yes']:
        data = df[df['Churn'] == churn]['MonthlyCharges']
        color = COLORS['churn_no'] if churn == 'No' else COLORS['churn_yes']
        ax3.hist(data, alpha=0.7, label=f'Churn {churn}', bins=30, color=color)
    ax3.set_title('Montly Charges Distribution by Churn', fontsize=16, fontweight='bold')
    ax3.set_xlabel('Montly Charges %')
    ax3.set_ylabel('Frequency')
    ax3.legend()
    
    # 5. Tenure vs Monthly Charges. Scatter plot.
    ax4 = fig.add_subplot(gs[2, 2:])
    for churn in ['No', 'Yes']:
        data = df[df['Churn'] == churn]
        color = COLORS['churn_no'] if churn == 'No' else COLORS['churn_yes']
        ax4.scatter(data['tenure'], data['MonthlyCharges'], alpha=0.6, label=f'Churn: {churn}', color=color, s=20)
    ax4.set_title('Tenure vs Monthly Charges', fontsize=16, fontweight='bold')
    ax4.set_xlabel('Tenure (months)')
    ax4.set_ylabel('Monthly Charges $')
    ax4.legend()
    
    # Showing the plots.
    plt.tight_layout()
    plt.show()
        
    

In [40]:
create_overview_dashboard = overview_dashboard(df_clean)


Creating overview dashboard...


AttributeError: 'list' object has no attribute 'mean'

<Figure size 2000x1200 with 0 Axes>