# 01 Data Exploration

Auto-generated notebook for MLOps Project.

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Make the project's `src` package importable by appending the parent of the
# current working directory to the module search path.
# NOTE(review): assumes the kernel's working directory sits one level below the
# project root (e.g. a notebooks/ folder) — confirm for your environment.
sys.path.append(os.path.abspath('..'))

# Project-local helpers; these imports resolve only after the sys.path tweak above.
# NOTE(review): neither `config` nor `setup_logger` is referenced in the cells
# visible in this notebook — confirm whether they are still needed.
from src.utils.config import config
from src.utils.logger import setup_logger

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's "whitegrid" style to the plots that follow.
sns.set_style("whitegrid")


## Load Data

In [None]:

# Load the dataset through the project's loader. matplotlib.pyplot (plt) and
# seaborn (sns) are already imported in the first code cell, which must run
# before this one anyway (it puts `src` on sys.path), so they are not
# re-imported here.
from src.data.load_data import load_data

df = load_data()
print(f"Dataset Shape: {df.shape}")

# Last expression of the cell: rendered as a rich table preview.
df.head()


## 1. Missing Value Analysis

In [None]:

# Null count per column, restricted to the columns that actually contain nulls.
missing_values = df.isna().sum().loc[lambda counts: counts > 0]

if missing_values.empty:
    print("No missing values found.")
else:
    # One bar per affected column; bar height is the number of missing entries.
    plt.figure(figsize=(10, 5))
    sns.barplot(x=missing_values.index, y=missing_values.values)
    plt.title("Missing Values per Column")
    plt.xticks(rotation=45)
    plt.show()


## 2. Target Distribution

In [None]:

# Bar chart of the number of rows in each churn class.
plt.figure(figsize=(8, 5))
ax = sns.countplot(data=df, x='churn')
ax.set_title("Churn Distribution")
ax.set_xlabel("Churn (0=No, 1=Yes)")
ax.set_ylabel("Count")
plt.show()

# The same split expressed as fractions of the whole dataset.
churn_share = df['churn'].value_counts(normalize=True)
print(churn_share)


## 3. Correlation Analysis

Top features correlated with Churn:

In [None]:

# Correlate numeric (and boolean) columns only. DataFrame.corr() raises on
# string/object columns in pandas >= 2.0, so the non-numeric columns are
# filtered out explicitly instead of relying on a version-dependent default.
numeric_df = df.select_dtypes(include=['number', 'bool'])
corr = numeric_df.corr()

# Keep just the column of correlations with the target, strongest positive
# first. 'churn' itself appears in the result with a correlation of 1.
target_corr = corr[['churn']].sort_values(by='churn', ascending=False)

# Fixed [-1, 1] colour range keeps the diverging colormap centred on zero.
plt.figure(figsize=(12, 10))
sns.heatmap(target_corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title("Feature Correlation with Churn")
plt.show()


## 4. Numerical Feature Distributions

In [None]:

numerical_cols = ['TotalCharges', 'MonthlyCharges', 'tenure']
# Keep only the candidates that are actually columns of df.
present_cols = [name for name in numerical_cols if name in df.columns]
n_rows = len(present_cols)

# One grid row per feature: histogram on the left, boxplot on the right.
plt.figure(figsize=(15, 4 * n_rows))
for row, feature in enumerate(present_cols):
    left_panel = 2 * row + 1

    # Left panel: distribution of the feature, split by churn class.
    plt.subplot(n_rows, 2, left_panel)
    sns.histplot(x=feature, hue='churn', data=df, element="step", kde=True)
    plt.title(f"Distribution of {feature} by Churn")

    # Right panel: the same feature summarised as one box per churn class.
    plt.subplot(n_rows, 2, left_panel + 1)
    sns.boxplot(data=df, x='churn', y=feature)
    plt.title(f"Boxplot of {feature} by Churn")

plt.tight_layout()
plt.show()


## 5. Categorical Feature Analysis


We analyze how categorical features like `Contract`, `PaymentMethod`, and `InternetService` relate to Churn. 
We use **normalized stacked bar charts** to compare the *proportion* of churners across categories.


In [None]:

cat_cols = ['Contract', 'PaymentMethod', 'InternetService', 'TechSupport']
# Drop any candidate that is not a column of df.
cat_cols = [name for name in cat_cols if name in df.columns]

plt.figure(figsize=(15, 10))

# 2x2 grid, one panel per categorical feature.
for position, col in enumerate(cat_cols, start=1):
    ax = plt.subplot(2, 2, position)

    # Row-normalised crosstab: within each category the churn shares sum to 1.
    crosstab = pd.crosstab(df[col], df['churn'], normalize='index')

    # Stacked bars drawn on this panel's axes.
    crosstab.plot.bar(stacked=True, ax=ax, color=['#3498db', '#e74c3c'])

    ax.set_title(f"Churn Rate by {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Proportion")
    ax.legend(title='Churn', labels=['No', 'Yes'], loc='upper right')
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()



**Observations:**
- **Month-to-month contracts** likely have much higher churn than one- or two-year contracts.
- **Electronic check** payment method often correlates with higher churn.
- Customers with **Fiber optic** internet may have higher churn due to price or competition.


## 6. Feature Interactions

Do long-tenured customers pay more? Does paying more increase churn risk for them?

In [None]:

# One colour per churn class, passed to seaborn as the hue palette.
churn_palette = ['#3498db', '#e74c3c']

# Semi-transparent points so dense regions of the scatter stay readable.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    x='tenure',
    y='MonthlyCharges',
    hue='churn',
    data=df,
    palette=churn_palette,
    alpha=0.5,
)
plt.title("Tenure vs Monthly Charges (colored by Churn)")
plt.show()


## 7. Cohort Analysis (Tenure Bins)

In [None]:

def tenure_cohort(t):
    """Map a single tenure value to a coarse cohort label.

    Upper bounds are inclusive:
        t <= 12 -> '0-1 Year'
        t <= 24 -> '1-2 Years'
        t <= 48 -> '2-4 Years'
        else    -> '4+ Years'

    NOTE(review): the labels suggest tenure is measured in months — confirm
    against the dataset.

    Missing values (None / NaN / pd.NA) return None. Every `<=` comparison
    with NaN is False, so without this guard a missing tenure would fall
    through to the final branch and be mislabelled as '4+ Years'.
    """
    if pd.isna(t):
        return None

    for upper_bound, label in ((12, '0-1 Year'), (24, '1-2 Years'), (48, '2-4 Years')):
        if t <= upper_bound:
            return label
    return '4+ Years'

if 'tenure' in df.columns:
    # Fixed left-to-right bar order, shortest tenure first.
    cohort_order = ['0-1 Year', '1-2 Years', '2-4 Years', '4+ Years']

    # Adds (or overwrites) a TenureCohort column on df with one label per row.
    df['TenureCohort'] = df['tenure'].apply(tenure_cohort)

    # barplot aggregates 'churn' within each cohort (seaborn's default
    # estimator is the mean); ci=None turns the error bars off.
    plt.figure(figsize=(8, 5))
    sns.barplot(
        data=df,
        x='TenureCohort',
        y='churn',
        order=cohort_order,
        ci=None,
        palette='viridis',
    )
    plt.title("Churn Rate by Tenure Cohort")
    plt.ylabel("Churn Rate")
    plt.show()


## 8. Outlier Detection

We use a boxplot to identify outliers in `TotalCharges`.

In [None]:

# Nothing is drawn when the column is absent from df.
if 'TotalCharges' in df.columns:
    total_charges = df['TotalCharges']

    # Single horizontal boxplot of the column's values.
    plt.figure(figsize=(10, 4))
    sns.boxplot(x=total_charges)
    plt.title("TotalCharges Boxplot")
    plt.show()
