In [40]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

## Load Data

In [41]:
def load_data(data_path="../data/"):
    """Load the credit scoring dataset.

    Scans ``data_path`` for files whose names end in ``train.csv`` and reads
    the alphabetically first match with ``pd.read_csv``.

    Parameters:
    data_path (str): Directory to search. Defaults to "../data/".

    Returns:
    pandas.DataFrame: Contents of the selected CSV file.

    Raises:
    FileNotFoundError: If ``data_path`` does not exist, or if it contains no
        file whose name ends in ``train.csv``.
    """
    # Check if the data directory exists
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"Data directory '{data_path}' not found.")

    # os.listdir returns entries in arbitrary order; sorting makes the chosen
    # file reproducible when more than one name matches.
    csv_files = sorted(f for f in os.listdir(data_path) if f.endswith('train.csv'))

    if not csv_files:
        raise FileNotFoundError(
            f"No files ending in 'train.csv' found in '{data_path}'."
        )

    # Load the first matching file (in sorted order)
    file_path = os.path.join(data_path, csv_files[0])
    print(f"Loading data from: {file_path}")

    return pd.read_csv(file_path)


In [42]:
# Load the dataset from load_data's default directory ("../data/") into the
# notebook-wide frame used by every later cell.
df = load_data()





## Overview

In [43]:
# Dimensions of the loaded frame as (rows, columns).
df.shape



In [44]:
# Preview the first five rows.
df.head()



In [45]:
# Per-column dtype and non-null count, plus overall memory usage.
df.info()



In [46]:
# Row count taken from the first entry of the frame's shape.
print(f"Number of rows : {df.shape[0]}")



In [47]:
# Total footprint converted from bytes to mebibytes; deep=True also measures
# the contents of object columns rather than just their pointers.
total_memory = df.memory_usage(deep=True).sum() / (1024 * 1024)
print(f"Total data memory usage: {total_memory:.2f} MB")



In [48]:
# Show the column Index (names and dtype) of the frame.
print(f"List all column names :\n\n {df.columns}")



In [49]:
# Report how many columns the frame has.
print(f"Number of columns : {len(df.columns)}")



## Missing Data

In [51]:
from lib.clean_data import report_missing_data

In [52]:
# Run the project helper imported from lib.clean_data on the loaded frame.
# NOTE(review): the helper's output format is defined in that module and is
# not visible from this notebook.
report_missing_data(df)





## Duplicate Values

In [None]:
# Rows that exactly repeat an earlier row (the first occurrence is not flagged).
duplicates = df.loc[df.duplicated()]
print(duplicates)



## Data Structure & Types

#### Data Types 
- Check data types of each column

In [None]:
# dtype pandas inferred for each column.
df.dtypes



In [None]:
# Distinct values of the Month column, in order of first appearance.
unique_months = df['Month'].unique()
print(unique_months)



#### Categorical vs. Numerical

In [None]:
# Split column labels by storage dtype: object columns are treated as
# categorical, int64/float64 columns as numerical. Numeric-looking values that
# were parsed as strings therefore land in the categorical group.
categorical_columns = df.select_dtypes(include=['object']).columns
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
print("Categorical Columns:", categorical_columns)
print("Numerical Columns:", numerical_columns)



In [None]:
# Integer columns with fewer than 100 distinct values count as discrete.
discrete_vars = [
    name
    for name in numerical_columns
    if df[name].dtype == 'int64' and df[name].nunique() < 100
]
# Everything else is continuous: float columns, or any column with 100 or more
# distinct values.
continuous_vars = [
    name
    for name in numerical_columns
    if df[name].dtype == 'float64' or df[name].nunique() >= 100
]
print("Discrete Variables:", discrete_vars)
print("Continuous Variables:", continuous_vars)



In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numerical columns.
df.describe()



In [None]:
def categorical_summary_stats(df, categorical_columns=None):
    """
    Generate summary statistics for categorical variables in a dataframe.

    Parameters:
    df (pandas.DataFrame): The dataframe to analyze
    categorical_columns (list, optional): List of categorical column names to analyze.
                                         Names that are not columns of ``df`` are skipped.
                                         If None, object, category and boolean columns are
                                         used, plus numeric columns with fewer than 10
                                         distinct non-null values.

    Returns:
    dict: Dictionary where keys are column names and values are dictionaries with:
          - 'unique_values': number of distinct values (a missing value counts as one)
          - 'missing_values' / 'missing_percentage': count and share of missing entries
          - 'mode': {'value', 'count', 'percentage'} of the most frequent non-null value;
            value is None and count is 0 when the column has no non-null values
          - 'top_categories': up to five most frequent values with count and percentage
          - 'all_categories': every distinct value rendered with str()
          Percentages are relative to the total number of rows and rounded to 2 decimals.
    """
    # If no categorical columns specified, try to identify them
    if categorical_columns is None:
        # Select object, category, and boolean dtypes
        categorical_columns = df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

        # Also include numeric columns with low cardinality (fewer than 10 unique values)
        categorical_columns.extend(
            col
            for col in df.select_dtypes(include=['number']).columns
            if df[col].nunique() < 10
        )

    total_rows = len(df)
    summary = {}

    for col in categorical_columns:
        if col not in df.columns:
            continue

        column = df[col]

        # Basic stats
        value_counts = column.value_counts()
        unique_values = column.unique()
        missing_values = column.isna().sum()

        # Most frequent value. mode() ignores missing values, so it is empty
        # for a column that holds nothing but NaN; guard on the mode result
        # itself rather than on the column length.
        modes = column.mode()
        mode_value = modes.iloc[0] if not modes.empty else None
        mode_count = value_counts.iloc[0] if not value_counts.empty else 0
        mode_percentage = (mode_count / total_rows) * 100 if total_rows > 0 else 0

        # Top 5 categories with counts and percentages. value_counts is empty
        # whenever the frame has no rows, so the division below is safe.
        top_categories = [
            {
                'value': value,
                'count': count,
                'percentage': round((count / total_rows) * 100, 2)
            }
            for value, count in value_counts.head(5).items()
        ]

        # Compile column summary
        summary[col] = {
            'unique_values': len(unique_values),
            'missing_values': missing_values,
            'missing_percentage': round((missing_values / total_rows) * 100, 2) if total_rows > 0 else 0,
            'mode': {
                'value': mode_value,
                'count': mode_count,
                'percentage': round(mode_percentage, 2)
            },
            'top_categories': top_categories,
            'all_categories': [str(v) for v in unique_values]
        }

    return summary

In [None]:
def categorical_summary_stats_result(col_name):
    """Print a short categorical summary for one column of the notebook-level ``df``.

    Prints the number of unique values, the most common value with its share
    of rows, and the top categories with counts and percentages, as computed
    by ``categorical_summary_stats``.

    Parameters:
    col_name (str): Name of the column of ``df`` to summarize.

    Raises:
    KeyError: If ``col_name`` is not a column of ``df`` (the summary helper
        skips unknown names, so the lookup below fails).
    """
    # Summarize only the requested column instead of recomputing statistics for
    # every categorical column of the frame on each call.
    col_stats = categorical_summary_stats(df, [col_name])[col_name]
    print(f"Summary for {col_name}:")
    print(f"- Unique values: {col_stats['unique_values']}")
    print(f"- Most common value: {col_stats['mode']['value']} (occurs {col_stats['mode']['percentage']}% of the time)")
    print("- Top categories:")
    for cat in col_stats['top_categories']:
        print(f"  • {cat['value']}: {cat['count']} ({cat['percentage']}%)")

In [None]:
# Columns to summarize one by one. NOTE: this rebinds `categorical_columns`,
# which an earlier cell assigned from select_dtypes(include=['object']).
categorical_columns = [
    "Occupation", 
    "Type_of_Loan", 
    "Credit_Mix", 
    "Payment_of_Min_Amount", 
    "Payment_Behaviour", 
    "Credit_Score"
]

# Print each column's summary followed by blank lines as a separator.
for col in categorical_columns:
    categorical_summary_stats_result(col)
    print("\n")




#### Mixed Data Types

In [None]:
# Report every column whose values are not all of a single Python type.
# A float NaN stored among strings counts as a second type here.
for col in df.columns:
    unique_types = set(df[col].map(type))
    if len(unique_types) <= 1:
        continue
    print(f"Column '{col}' has mixed types: {unique_types}")



#### Handle Special Columns

In [None]:
# Lift pandas' column display limit for the rest of the session so wide
# frames render without truncation, then preview the first rows.
pd.set_option('display.max_columns', None)  # Show all columns
df.head()



In [None]:
# Preview the first five values of the Type_of_Loan column.
df.Type_of_Loan.head()



In [None]:
# Column labels of the frame.
df.columns



## Outliers

In [None]:
# Summary statistics of the numerical columns; min/max far from the quartiles
# hint at outliers.
df.describe()



In [None]:
# Bind `stats` to the public scipy.stats package. `scipy.stats.stats` is a
# deprecated private-module alias and should not be imported directly.
from scipy import stats

In [None]:
# Standardize the salary column. nan_policy='omit' leaves missing entries out
# of the mean/std so that any NaN in the column does not turn every score
# into NaN.
z_scores = stats.zscore(df['Monthly_Inhand_Salary'], nan_policy='omit')
# Count the rows lying more than three standard deviations from the mean,
# using the computed `z_scores` directly. NaN scores compare as False and
# are therefore not counted.
(np.abs(z_scores) > 3).sum()



