In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from evidently import Report
from evidently.metrics import *
from evidently.presets import *
import scipy.stats as ss
import os 
from scipy.stats import pointbiserialr

In [None]:
def save_figure(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """
    Write the currently active matplotlib figure into ../reports/figures.

    Parameters:
    - fig_id (str): Base file name; the extension is appended automatically.
    - tight_layout (bool): When True, plt.tight_layout() is called before saving.
    - fig_extension (str): Output format, also used as the file suffix
      (e.g. 'png', 'jpg', 'svg', 'pdf').
    - resolution (int): DPI handed to plt.savefig.
    """
    # The target folder is resolved relative to the working directory
    # and is created on first use.
    figures_dir = os.path.join("..", "reports", "figures")
    os.makedirs(figures_dir, exist_ok=True)

    destination = os.path.join(figures_dir, f"{fig_id}.{fig_extension}")
    print(f"Saving figure {fig_id} to {destination}...")

    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)

In [None]:
# Feature columns treated as categorical throughout the notebook.
categorical_cols = [
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]

# Feature columns treated as numerical.
numerical_cols = [
    'tenure',
    'MonthlyCharges',
    'TotalCharges',
]

# Name of the label column.
target = 'Churn'


In [None]:
# Load the training split (path is relative to the working directory)
# and preview the first rows.
df = pd.read_csv(r"../data/raw/splits/train.csv")
df.head()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Row count per value of the Churn column.
df.Churn.value_counts()

In [None]:
def fix_total_charges(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` whose TotalCharges column is numeric and NaN-free.

    Values that cannot be parsed as numbers (for example empty strings) are
    coerced to NaN and then, together with any NaN already present, replaced
    with 0. The input frame is not modified, so re-running the calling cell
    always starts from the same data.

    Parameters:
    - df (pd.DataFrame): Frame containing a 'TotalCharges' column.

    Returns:
    - pd.DataFrame: New frame with the cleaned 'TotalCharges' column; all other
      columns are copied unchanged.

    Raises:
    - KeyError: If 'TotalCharges' is not a column of ``df``.
    """
    # Work on a copy so the caller's frame is never mutated behind its back.
    cleaned = df.copy()

    # Force convert to numeric, unparsable entries become NaN
    total_charges = pd.to_numeric(cleaned['TotalCharges'], errors='coerce')

    # Report how many entries are missing after the conversion
    num_missing = total_charges.isnull().sum()
    print(f"⚠️ Found {num_missing} non-numeric values in TotalCharges. Filling with 0.")

    # Fill with 0 (assuming tenure=0 implies no charges yet)
    cleaned['TotalCharges'] = total_charges.fillna(0)
    return cleaned

# Clean TotalCharges on the loaded frame; df refers to the cleaned result from here on.
df = fix_total_charges(df)

In [None]:
def plot_target_distribution(df, target_col='Churn'):
    """
    Draw a count plot of ``target_col`` with every bar labelled by its share of rows.

    Parameters:
    - df (pd.DataFrame): Frame containing ``target_col``.
    - target_col (str): Column whose value counts are plotted.

    Returns:
    - None. The figure is displayed with plt.show().
    """
    plt.figure(figsize=(6, 4))
    # seaborn >= 0.13 deprecates `palette` without `hue`. Mapping hue to the same
    # variable keeps one colour per class, and legend=False suppresses the
    # redundant legend (the `legend` argument requires seaborn >= 0.13).
    ax = sns.countplot(
        x=target_col, hue=target_col, data=df, palette='viridis', legend=False
    )

    # Add percentages on bars (share of all rows in df)
    total = len(df)
    for p in ax.patches:
        percentage = f'{100 * p.get_height() / total:.1f}%'
        x = p.get_x() + p.get_width() / 2
        y = p.get_height()
        ax.annotate(percentage, (x, y), ha='center', va='bottom')

    plt.title(f'Distribution of {target_col}')
    plt.show()

# Plot the distribution of the default target column ('Churn').
plot_target_distribution(df)

In [None]:
def analyze_numerical_features(df, numerical_cols, target='Churn'):
    """
    Plot, for every numerical column, its distribution and a boxplot split by target.

    Each column gets one row of two panels: a histogram with KDE coloured by
    ``target`` on the left and a boxplot of the column per ``target`` value on
    the right.

    Parameters:
    - df (pd.DataFrame): Frame containing the listed columns and ``target``.
    - numerical_cols (list[str]): Columns to plot; must not be empty.
    - target (str): Column used for the hue / grouping.

    Returns:
    - None. The figure is displayed with plt.show().

    Raises:
    - ValueError: If ``numerical_cols`` is empty.
    """
    n_features = len(numerical_cols)
    if n_features == 0:
        raise ValueError("numerical_cols must contain at least one column name")

    # squeeze=False keeps `axes` two-dimensional even for a single feature;
    # without it a one-row grid collapses to 1-D and axes[i, 0] raises IndexError.
    fig, axes = plt.subplots(
        n_features, 2, figsize=(14, 5 * n_features), squeeze=False
    )

    for i, col in enumerate(numerical_cols):
        # Plot 1: Distribution (Histogram/KDE)
        sns.histplot(data=df, x=col, hue=target, kde=True, element="step", ax=axes[i, 0])
        axes[i, 0].set_title(f'Distribution of {col} by {target}')

        # Plot 2: Boxplot (To see outliers and median differences)
        sns.boxplot(data=df, x=target, y=col, ax=axes[i, 1])
        axes[i, 1].set_title(f'{col} vs {target}')

    fig.tight_layout()
    plt.show()

# Reuse the numerical feature list declared with the other column groups
# instead of repeating the literal, so the two cannot drift apart.
num_cols = list(numerical_cols)
analyze_numerical_features(df, num_cols)

In [None]:
def analyze_categorical_features(df, categorical_cols, target='Churn'):
    """
    Draw one count plot per categorical column, with bars split by ``target``.

    The plots are laid out on a grid three panels wide, with as many rows as
    needed, and the finished figure is displayed with plt.show().

    Parameters:
    - df (pd.DataFrame): Frame containing the listed columns and ``target``.
    - categorical_cols (list[str]): Columns to plot.
    - target (str): Column used as hue and as the legend title.
    """
    grid_width = 3
    # Ceiling division: just enough rows to hold one panel per column.
    grid_height = -(-len(categorical_cols) // grid_width)

    plt.figure(figsize=(20, 5 * grid_height))

    for position, feature in enumerate(categorical_cols, start=1):
        plt.subplot(grid_height, grid_width, position)

        # Counts of each category, one bar per target value
        sns.countplot(x=feature, hue=target, data=df, palette='pastel')
        plt.title(f'{feature} Distribution')
        plt.xticks(rotation=45)
        plt.legend(title=target, loc='upper right')

    plt.tight_layout()
    plt.show()

# Reuse the categorical feature list declared with the other column groups
# (it already excludes the ID and the numerical columns) instead of repeating it.
cat_cols = list(categorical_cols)

analyze_categorical_features(df, cat_cols)

In [None]:
def cramers_v(confusion_matrix):
    """
    Bias-corrected Cramér's V for a two-way contingency table.

    The chi-square statistic comes from scipy's chi2_contingency with its
    default settings (which apply Yates' continuity correction to tables with
    one degree of freedom, e.g. 2x2).

    Parameters:
    - confusion_matrix (pd.DataFrame or 2-D np.ndarray): Table of observed
      counts, e.g. the result of pd.crosstab.

    Returns:
    - float: Association strength, 0 meaning no association. Degenerate tables
      (a single row or column, or at most one observation) carry no
      association information and yield 0.0 instead of NaN.

    Raises:
    - ValueError: Propagated from chi2_contingency, e.g. when an expected
      frequency is zero.
    """
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    r, k = confusion_matrix.shape

    # With n <= 1 the (n - 1) terms below would divide by zero.
    if n <= 1:
        return 0.0

    phi2 = chi2 / n
    # Bias correction of phi^2 and of the table dimensions
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    r_corr = r - ((r - 1) ** 2) / (n - 1)
    k_corr = k - ((k - 1) ** 2) / (n - 1)

    denominator = min(k_corr - 1, r_corr - 1)
    # A single-row or single-column table makes the denominator 0 (0/0 -> NaN).
    if denominator <= 0:
        return 0.0

    return float(np.sqrt(phi2corr / denominator))


# Cramér's V between every categorical feature and the target column.
cramer_results = {
    feature: cramers_v(pd.crosstab(df[feature], df[target]))
    for feature in categorical_cols
}

# One row per feature, strongest association first.
(
    pd.DataFrame(cramer_results, index=['CramerV'])
    .T
    .sort_values(by='CramerV', ascending=False)
)

In [None]:
# 1. Encode the target as 0/1 (values other than 'Yes'/'No' become NaN)
df['Churn_binary'] = df['Churn'].map({'Yes': 1, 'No': 0})

# 2. Point-biserial correlation of every numerical feature with the binary target
biserial_results = {}

for col in numerical_cols:
    # Drop rows with a missing value in either column just for this calculation
    valid_data = df[[col, 'Churn_binary']].dropna()

    # Only the coefficient is stored; the p-value is not used further
    corr, p_value = pointbiserialr(valid_data[col], valid_data['Churn_binary'])
    biserial_results[col] = corr

# 3. Build a tidy frame (one row per feature) without in-place mutation
biserial_df = (
    pd.DataFrame.from_dict(biserial_results, orient='index', columns=['Point_Biserial_Corr'])
    .reset_index()
    .rename(columns={'index': 'Feature'})
)

# Sort by absolute correlation (magnitude matters most)
biserial_df['Abs_Corr'] = biserial_df['Point_Biserial_Corr'].abs()
biserial_df = biserial_df.sort_values(by='Abs_Corr', ascending=False)

# 4. Visualize
plt.figure(figsize=(8, 4))
# seaborn >= 0.13 deprecates `palette` without `hue`. Mapping hue to the same
# variable as y keeps one colour per feature, and legend=False hides the
# redundant legend (the `legend` argument requires seaborn >= 0.13).
sns.barplot(
    x='Point_Biserial_Corr', y='Feature', hue='Feature',
    data=biserial_df, palette='coolwarm', legend=False
)
plt.title("Correlation: Numerical Features vs Churn")
plt.xlabel("Correlation Coefficient (-1 to 1)")
plt.axvline(x=0, color='black', linewidth=1)

# Persist the chart via save_figure before showing it
save_figure("numerical_correlation")
plt.show()