In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy import stats
import re

#### Load Data

In [5]:
def load_data(data_path="../data/"):
    """Load the credit scoring dataset."""
    # Check if the data directory exists
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"Data directory '{data_path}' not found.")
    
    # Look for CSV files in the data directory
    csv_files = [f for f in os.listdir(data_path) if f.endswith('train.csv')]
    
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in '{data_path}'.")
    
    # Load the first CSV file found
    file_path = os.path.join(data_path, csv_files[0])
    print(f"Loading data from: {file_path}")
    
    return pd.read_csv(file_path)


In [None]:
df = load_data()

## Overview

In [None]:
df.head()

Loading data from: ../data/train.csv


  return pd.read_csv(file_path)


Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,_,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 23 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Age                       100000 non-null  object 
 1   Occupation                100000 non-null  object 
 2   Annual_Income             100000 non-null  object 
 3   Monthly_Inhand_Salary     84998 non-null   float64
 4   Num_Bank_Accounts         100000 non-null  int64  
 5   Num_Credit_Card           100000 non-null  int64  
 6   Interest_Rate             100000 non-null  int64  
 7   Num_of_Loan               100000 non-null  object 
 8   Type_of_Loan              88592 non-null   object 
 9   Delay_from_due_date       100000 non-null  int64  
 10  Num_of_Delayed_Payment    92998 non-null   object 
 11  Changed_Credit_Limit      100000 non-null  object 
 12  Num_Credit_Inquiries      98035 non-null   float64
 13  Credit_Mix                100000 non-null  ob

In [9]:
print(f"Number of rows : {len(df)}")

Number of rows : 100000


In [15]:
total_memory = df.memory_usage(deep=True).sum() / (1024**2)
print(f"Total data memory usage: {total_memory:.2f} MB")

Total data memory usage: 120.56 MB


In [18]:
print(f"List all column names :\n\n {df.columns}")

List all column names :

 Index(['ID', 'Customer_ID', 'Month', 'Name', 'Age', 'SSN', 'Occupation',
       'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
       'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Type_of_Loan',
       'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Credit_History_Age',
       'Payment_of_Min_Amount', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance',
       'Credit_Score'],
      dtype='object')


In [24]:
print(f"Number of columns :‌ {len(df.columns)}")

Number of columns :‌ 23


## Data Structure & Types

#### Data Types 
- Check data types of each column

In [50]:
df.dtypes

Age                          object
Occupation                   object
Annual_Income                object
Monthly_Inhand_Salary       float64
Num_Bank_Accounts             int64
Num_Credit_Card               int64
Interest_Rate                 int64
Num_of_Loan                  object
Type_of_Loan                 object
Delay_from_due_date           int64
Num_of_Delayed_Payment       object
Changed_Credit_Limit         object
Num_Credit_Inquiries        float64
Credit_Mix                   object
Outstanding_Debt             object
Credit_Utilization_Ratio    float64
Credit_History_Age           object
Payment_of_Min_Amount        object
Total_EMI_per_month         float64
Amount_invested_monthly      object
Payment_Behaviour            object
Monthly_Balance              object
Credit_Score                 object
dtype: object

#### Categorical vs. Numerical

In [48]:
# summary statistics for numerical
df.describe()

Unnamed: 0,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Delay_from_due_date,Num_Credit_Inquiries,Credit_Utilization_Ratio,Total_EMI_per_month
count,84998.0,100000.0,100000.0,100000.0,100000.0,98035.0,100000.0,100000.0
mean,4194.17085,17.09128,22.47443,72.46604,21.06878,27.754251,32.285173,1403.118217
std,3183.686167,117.404834,129.05741,466.422621,14.860104,193.177339,5.116875,8306.04127
min,303.645417,-1.0,0.0,1.0,-5.0,0.0,20.0,0.0
25%,1625.568229,3.0,4.0,8.0,10.0,3.0,28.052567,30.30666
50%,3093.745,6.0,5.0,13.0,18.0,6.0,32.305784,69.249473
75%,5957.448333,7.0,7.0,20.0,28.0,9.0,36.496663,161.224249
max,15204.633333,1798.0,1499.0,5797.0,67.0,2597.0,50.0,82331.0


In [35]:
def categorical_summary_stats(df, categorical_columns=None):
    """
    Generate summary statistics for categorical variables in a dataframe.
    
    Parameters:
    df (pandas.DataFrame): The dataframe to analyze
    categorical_columns (list, optional): List of categorical column names to analyze.
                                         If None, will try to identify categorical columns.
    
    Returns:
    dict: Dictionary where keys are column names and values are dictionaries of summary statistics
    """    
    # If no categorical columns specified, try to identify them
    if categorical_columns is None:
        # Select object, category, and boolean dtypes
        categorical_columns = df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
        
        # Also include numeric columns with low cardinality (fewer than 10 unique values)
        for col in df.select_dtypes(include=['number']).columns:
            if df[col].nunique() < 10:
                categorical_columns.append(col)
    
    summary = {}
    
    for col in categorical_columns:
        if col not in df.columns:
            continue
            
        # Basic stats
        value_counts = df[col].value_counts()
        unique_values = df[col].unique()
        missing_values = df[col].isna().sum()
        
        # Get mode (most frequent value)
        mode_value = df[col].mode()[0] if not df[col].empty else None
        mode_count = value_counts.iloc[0] if not value_counts.empty else 0
        mode_percentage = (mode_count / len(df)) * 100 if len(df) > 0 else 0
        
        # Get top 5 categories with counts and percentages
        top_categories = []
        for value, count in value_counts.head(5).items():
            percentage = (count / len(df)) * 100
            top_categories.append({
                'value': value,
                'count': count,
                'percentage': round(percentage, 2)
            })
        
        # Compile column summary
        summary[col] = {
            'unique_values': len(unique_values),
            'missing_values': missing_values,
            'missing_percentage': round((missing_values / len(df)) * 100, 2) if len(df) > 0 else 0,
            'mode': {
                'value': mode_value,
                'count': mode_count,
                'percentage': round(mode_percentage, 2)
            },
            'top_categories': top_categories,
            'all_categories': [str(v) for v in unique_values]
        }
    
    return summary

In [49]:
categorical_columns = [
    "Occupation", 
    "Type_of_Loan", 
    "Credit_Mix", 
    "Payment_of_Min_Amount", 
    "Payment_Behaviour", 
    "Credit_Score"
]

cat_stats = categorical_summary_stats(df)

# Print summary for one column
col_name = categorical_columns[0]

stats = cat_stats[col_name]
print(f"Summary for {col_name}:")
print(f"- Unique values: {stats['unique_values']}")
print(f"- Most common value: {stats['mode']['value']} (occurs {stats['mode']['percentage']}% of the time)")
print(f"- Top categories:")
for cat in stats['top_categories']:
    print(f"  • {cat['value']}: {cat['count']} ({cat['percentage']}%)")

Summary for Occupation:
- Unique values: 16
- Most common value: _______ (occurs 7.06% of the time)
- Top categories:
  • _______: 7062 (7.06%)
  • Lawyer: 6575 (6.58%)
  • Architect: 6355 (6.35%)
  • Engineer: 6350 (6.35%)
  • Scientist: 6299 (6.3%)
