# In [None]:
# 01_data_exploration.ipynb
# -------------------------------------
# Purpose: Perform initial exploratory data analysis (EDA) on the loan_data.csv file.
# Steps:
# 1. Load raw dataset and preview its structure and summary statistics
# 2. Check for missing values and duplicate rows
# 3. Visualize univariate distributions
# 4. Visualize bivariate relationships
# 5. Save de-duplicated data for further feature engineering
# -------------------------------------

# Import required libraries
from __future__ import annotations

from pathlib import Path
from typing import Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Configure the shared look of every figure produced below
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'axes.labelsize': 12,
    'axes.titlesize': 14,
})

# ---------------------------
# 1. Load Dataset
# ---------------------------

def load_data(filepath: str) -> pd.DataFrame:
    """
    Read a CSV file into a pandas DataFrame.

    Args:
        filepath (str): Location of the CSV file to read.
    Returns:
        pd.DataFrame: The file's contents, parsed with the default
            ``pd.read_csv`` settings.
    """
    return pd.read_csv(filepath)

# Load raw loan data
data_path: str = "../data/raw/loan_data.csv"
df: pd.DataFrame = load_data(data_path)

# Preview dataset: first rows, column/dtype summary, descriptive statistics
display(df.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() would only add a stray "None" to the output.
df.info()
display(df.describe())

# ---------------------------
# 2. Basic Data Checks
# ---------------------------

def check_missing_values(data: pd.DataFrame) -> pd.Series:
    """
    Count the missing entries in every column.

    Args:
        data (pd.DataFrame): DataFrame to inspect.
    Returns:
        pd.Series: Number of missing values, indexed by column name.
    """
    # Boolean mask of missing cells; summing it column-wise yields the counts.
    null_mask = data.isna()
    return null_mask.sum()

def check_duplicates(data: pd.DataFrame) -> int:
    """
    Count the rows that are exact duplicates of an earlier row.

    Args:
        data (pd.DataFrame): Input DataFrame.
    Returns:
        int: Number of duplicate rows (the first occurrence of each row is
            not counted).
    """
    # Series.sum() yields a NumPy integer; convert it so the result really is
    # the built-in ``int`` promised by the signature.
    return int(data.duplicated().sum())

# Report missing values per column
missing_values: pd.Series = check_missing_values(df)
print("Missing Values:\n", missing_values)

# Report how many rows are exact duplicates
duplicate_count: int = check_duplicates(df)
print("\nNumber of Duplicate Rows:", duplicate_count)

# ---------------------------
# 3. Univariate Analysis
# ---------------------------

def plot_numerical_distribution(data: pd.DataFrame, columns: list[str]) -> None:
    """
    Plot histograms for numerical features.

    When ``columns`` is empty nothing is drawn; a short notice is printed
    instead.

    Args:
        data (pd.DataFrame): Input DataFrame.
        columns (list[str]): List of numerical column names.
    """
    # DataFrame.hist raises ValueError when there is nothing to plot, which
    # would abort the run for a dataset without numeric columns.
    # len() is used rather than truthiness so array-like inputs still work.
    if len(columns) == 0:
        print("No numerical columns to plot.")
        return

    # One subplot per column, all on a single figure.
    data[columns].hist(bins=30, figsize=(15, 10), edgecolor='black')
    plt.suptitle('Distribution of Numerical Features', fontsize=16)
    plt.show()

def plot_categorical_distribution(data: pd.DataFrame, column: str) -> None:
    """
    Draw a count plot showing how often each value of one column occurs.

    Args:
        data (pd.DataFrame): DataFrame that holds the column.
        column (str): Name of the categorical column to plot.
    """
    # NOTE(review): seaborn 0.13+ warns when `palette` is passed without
    # `hue` — confirm the target seaborn version before changing this call.
    axis = sns.countplot(data=data, x=column, palette='Set2')
    axis.set_title(f"Distribution of {column}", fontsize=14)
    # Tilt the category labels on the x axis.
    plt.xticks(rotation=45)
    plt.show()

# Split the columns by dtype: numeric ones get histograms, all others count plots
numeric_cols: list[str] = df.select_dtypes(include="number").columns.tolist()
categorical_cols: list[str] = df.select_dtypes(exclude="number").columns.tolist()

# Histograms for every numeric feature on one figure
plot_numerical_distribution(df, numeric_cols)

# One count plot per non-numeric feature
for col in categorical_cols:
    plot_categorical_distribution(df, col)

# ---------------------------
# 4. Bivariate Analysis
# ---------------------------

def plot_correlation_matrix(data: pd.DataFrame) -> None:
    """
    Plot correlation heatmap for numerical features.

    When ``data`` has no numerical columns nothing is drawn; a short notice
    is printed instead.

    Args:
        data (pd.DataFrame): Input DataFrame.
    """
    # Non-numeric columns are ignored rather than raising.
    corr_matrix = data.corr(numeric_only=True)

    # Without numeric columns the matrix is empty: there is nothing to draw,
    # and the heatmap call is expected to fail on it.
    if corr_matrix.empty:
        print("No numerical columns to correlate.")
        return

    plt.figure(figsize=(12, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
    plt.title("Correlation Matrix", fontsize=16)
    plt.show()

# Heatmap of pairwise correlations between the numeric columns
plot_correlation_matrix(df)

# Example: credit score by loan status, drawn only when both columns exist
if {'credit_score', 'loan_status'}.issubset(df.columns):
    sns.boxplot(data=df, x='loan_status', y='credit_score', palette='Set3')
    plt.title("Credit Score by Loan Status")
    plt.show()

# ---------------------------
# 5. Save Cleaned Data (Optional)
# ---------------------------

def save_processed_data(data: pd.DataFrame, filepath: str) -> None:
    """
    Save DataFrame to CSV, creating the destination directory if needed.

    The index is not written. A confirmation message is printed once the
    file has been saved.

    Args:
        data (pd.DataFrame): Data to save.
        filepath (str): Destination CSV path.
    """
    # to_csv does not create missing directories, so make sure the parent
    # directory exists first (for example on a fresh checkout).
    Path(filepath).parent.mkdir(parents=True, exist_ok=True)
    data.to_csv(filepath, index=False)
    print(f"Processed data saved at {filepath}")

# Drop exact duplicate rows (keeping the first occurrence) and persist the result
processed_path: str = "../data/processed/credit_data_cleaned.csv"
df_cleaned = df.drop_duplicates(keep='first')
save_processed_data(df_cleaned, processed_path)
