In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy import stats
import re

#### Load Data

In [5]:
def load_data(data_path="../data/"):
    """Load the credit scoring dataset."""
    # Check if the data directory exists
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"Data directory '{data_path}' not found.")
    
    # Look for CSV files in the data directory
    csv_files = [f for f in os.listdir(data_path) if f.endswith('train.csv')]
    
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in '{data_path}'.")
    
    # Load the first CSV file found
    file_path = os.path.join(data_path, csv_files[0])
    print(f"Loading data from: {file_path}")
    
    return pd.read_csv(file_path)


In [None]:
df = load_data()

#### Overview

In [None]:
df.head()

Loading data from: ../data/train.csv


  return pd.read_csv(file_path)


Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,_,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good


In [9]:
print(f"Number of rows : {len(df)}")

Number of rows : 100000


In [15]:
total_memory = df.memory_usage(deep=True).sum() / (1024**2)
print(f"Total data memory usage: {total_memory:.2f} MB")

Total data memory usage: 120.56 MB


In [18]:
print(f"List all column names :\n\n {df.columns}")

List all column names :

 Index(['ID', 'Customer_ID', 'Month', 'Name', 'Age', 'SSN', 'Occupation',
       'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
       'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Type_of_Loan',
       'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Credit_History_Age',
       'Payment_of_Min_Amount', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance',
       'Credit_Score'],
      dtype='object')


In [22]:
# dropping unnecessary columns for modelling
columns_to_drop = ["ID", "Customer_ID", "Month", "Name", "SSN"]
df = df.drop(columns=columns_to_drop)

In [24]:
print(f"Number of columns :‌ {len(df.columns)}")

Number of columns :‌ 23


#### Data Structure & Types

In [25]:
df.dtypes

Age                          object
Occupation                   object
Annual_Income                object
Monthly_Inhand_Salary       float64
Num_Bank_Accounts             int64
Num_Credit_Card               int64
Interest_Rate                 int64
Num_of_Loan                  object
Type_of_Loan                 object
Delay_from_due_date           int64
Num_of_Delayed_Payment       object
Changed_Credit_Limit         object
Num_Credit_Inquiries        float64
Credit_Mix                   object
Outstanding_Debt             object
Credit_Utilization_Ratio    float64
Credit_History_Age           object
Payment_of_Min_Amount        object
Total_EMI_per_month         float64
Amount_invested_monthly      object
Payment_Behaviour            object
Monthly_Balance              object
Credit_Score                 object
dtype: object

In [28]:
def convert_data_types(df):
    """
    Convert incorrect data types in the credit score dataset to appropriate types.
    
    Parameters:
    df (pandas.DataFrame): The input dataframe with incorrect data types
    
    Returns:
    pandas.DataFrame: A dataframe with corrected data types
    """
    # Create a copy to avoid modifying the original dataframe
    df_converted = df.copy()
    
    # 1. Convert Age from object to int64
    df_converted['Age'] = pd.to_numeric(df_converted['Age'], errors='coerce').astype('Int64')
    
    # 2. Convert Annual_Income from object to float64
    # Remove non-numeric characters (like $, commas, etc.)
    df_converted['Annual_Income'] = df_converted['Annual_Income'].replace(r'[$,]', '', regex=True)
    df_converted['Annual_Income'] = pd.to_numeric(df_converted['Annual_Income'], errors='coerce')
    
    # 3. Convert Num_of_Loan from object to int64
    df_converted['Num_of_Loan'] = pd.to_numeric(df_converted['Num_of_Loan'], errors='coerce').astype('Int64')
    
    # 4. Convert Num_of_Delayed_Payment from object to int64
    df_converted['Num_of_Delayed_Payment'] = pd.to_numeric(df_converted['Num_of_Delayed_Payment'], errors='coerce').astype('Int64')
    
    # 5. Convert Changed_Credit_Limit from object to float64
    df_converted['Changed_Credit_Limit'] = pd.to_numeric(df_converted['Changed_Credit_Limit'], errors='coerce')
    
    # 6. Convert Outstanding_Debt from object to float64
    df_converted['Outstanding_Debt'] = df_converted['Outstanding_Debt'].replace(r'[$,]', '', regex=True)
    df_converted['Outstanding_Debt'] = pd.to_numeric(df_converted['Outstanding_Debt'], errors='coerce')
    
    # 7. Convert Credit_History_Age from object to float64 (in years)
    def extract_credit_history_age(age_str):
        if pd.isna(age_str) or not isinstance(age_str, str):
            return np.nan
        
        years = re.search(r'(\d+)\s*Years?', age_str)
        months = re.search(r'(\d+)\s*Months?', age_str)
        
        total_years = 0
        if years:
            total_years += int(years.group(1))
        if months:
            total_years += int(months.group(1)) / 12
            
        return total_years if total_years > 0 else np.nan
    
    df_converted['Credit_History_Age'] = df_converted['Credit_History_Age'].apply(extract_credit_history_age)
    
    # 8. Convert Amount_invested_monthly from object to float64
    df_converted['Amount_invested_monthly'] = df_converted['Amount_invested_monthly'].replace(r'[$,]', '', regex=True)
    df_converted['Amount_invested_monthly'] = pd.to_numeric(df_converted['Amount_invested_monthly'], errors='coerce')
    
    # 9. Convert Monthly_Balance from object to float64
    df_converted['Monthly_Balance'] = df_converted['Monthly_Balance'].replace(r'[$,]', '', regex=True)
    df_converted['Monthly_Balance'] = pd.to_numeric(df_converted['Monthly_Balance'], errors='coerce')
    
    # 10. Convert Credit_Score if needed (keeping as object if it's categorical)
    # Check if Credit_Score contains only numeric values
    try:
        if df_converted['Credit_Score'].str.isnumeric().all():
            df_converted['Credit_Score'] = pd.to_numeric(df_converted['Credit_Score'], errors='coerce')
    except:
        # Keep as is if we can't determine if it's numeric
        pass
    
    return df_converted

In [29]:
df_converted = convert_data_types(df)
print(df_converted.dtypes)  # Verify the data types

Age                           Int64
Occupation                   object
Annual_Income               float64
Monthly_Inhand_Salary       float64
Num_Bank_Accounts             int64
Num_Credit_Card               int64
Interest_Rate                 int64
Num_of_Loan                   Int64
Type_of_Loan                 object
Delay_from_due_date           int64
Num_of_Delayed_Payment        Int64
Changed_Credit_Limit        float64
Num_Credit_Inquiries        float64
Credit_Mix                   object
Outstanding_Debt            float64
Credit_Utilization_Ratio    float64
Credit_History_Age          float64
Payment_of_Min_Amount        object
Total_EMI_per_month         float64
Amount_invested_monthly     float64
Payment_Behaviour            object
Monthly_Balance             float64
Credit_Score                 object
dtype: object
