# Notebook cell In [3]:
# Importing necessary libraries
import os
import pandas as pd
import numpy as np
import re
import math
import seaborn as sns
import matplotlib.pyplot as plt

# Function to clean and floor numerical values
def clean_and_floor(value):
    """Strip non-numeric characters from *value* and floor what remains.

    Everything except digits and ``.`` is removed from ``str(value)``, so
    currency symbols, thousands separators, stray letters and also sign
    characters are discarded before parsing.

    Args:
        value: A scalar cell value (number, string, or missing marker).

    Returns:
        * *value* unchanged when ``pd.isna(value)`` is true;
        * the floored integer when the cleaned text parses as a number;
        * ``None`` when no digits remain or the remaining text is not a
          valid number (e.g. ``"."`` or ``"1.2.3"``), so the caller can
          impute it like any other missing value.
    """
    if pd.isna(value):  # Leave missing markers for the caller's fillna
        return value

    cleaned_value = re.sub(r'[^\d.]', '', str(value))  # Keep digits and dots only
    if not cleaned_value:
        return None

    try:
        return math.floor(float(cleaned_value))
    except (ValueError, OverflowError):
        # ValueError: malformed leftovers such as "." or "1.2.3".
        # OverflowError: digit runs so long that float() yields infinity.
        # One bad cell should not abort the whole column, so treat it as missing.
        return None

# Bucket a family-member count into a size label
def family_categorize(x):
    """Return the family-size label for the member count *x*.

    Counts of one or fewer share a label, exactly 2 and exactly 3 get
    their own labels, and every other value (including non-integer
    values and NaN, which match no test) gets the final label.
    """
    if x <= 1:
        return 'upto_1_member'

    exact_sizes = ((2, '2_members'), (3, '3_members'))
    for member_count, label in exact_sizes:
        if x == member_count:
            return label

    return 'more_than_3_members'

# Bucket an annuity percentage into a band label
def annuity_categorize(x):
    """Return the band label for the annuity percentage *x*.

    Bands are checked in ascending order with inclusive upper bounds
    (2, 4, 6, 8). A value matching no band, including NaN, receives
    the last label.
    """
    upper_bounds = (
        (2, 'upto_2_percent'),
        (4, 'upto_4_percent'),
        (6, 'upto_6_percent'),
        (8, 'upto_8_percent'),
    )
    for ceiling, label in upper_bounds:
        if x <= ceiling:
            return label
    return 'more_than_8_percent'

# Bucket an income amount into one of six income bands
def income_category(x):
    """Return the income-band label for the amount *x*.

    Upper bounds (inclusive) are 6000, 10000, 15000, 20000 and 50000.
    Anything that satisfies none of them, NaN included, is
    'income_band6'.
    """
    income_bands = (
        (6000, 'income_band1'),
        (10000, 'income_band2'),
        (15000, 'income_band3'),
        (20000, 'income_band4'),
        (50000, 'income_band5'),
    )
    matching = (label for limit, label in income_bands if x <= limit)
    return next(matching, 'income_band6')

# Bucket a credit amount into one of six credit bands
def credit_category(x):
    """Return the credit-band label for the amount *x*.

    Bands have inclusive upper bounds of 20000, 30000, 40000, 50000 and
    60000. Values above the last bound, or values for which no
    comparison holds (such as NaN), map to 'credit_band6'.
    """
    thresholds = (20000, 30000, 40000, 50000, 60000)
    labels = (
        'credit_band1',
        'credit_band2',
        'credit_band3',
        'credit_band4',
        'credit_band5',
    )
    for limit, label in zip(thresholds, labels):
        if x <= limit:
            return label
    return 'credit_band6'

# Bucket a credit-to-income ratio into a multiple-of-income label
def credit_income_category(x):
    """Return the label for the credit-to-income ratio *x*.

    Inclusive upper bounds are 2, 3, 4, 5 and 10. Ratios beyond 10, as
    well as values that satisfy no comparison (infinity, NaN), map to
    'more_than_10_times'.
    """
    ratio_bands = {
        2: 'Upto_2_times',
        3: 'Upto_3_times',
        4: 'Upto_4_times',
        5: 'Upto_5_times',
        10: 'Upto_10_times',
    }
    # Dicts preserve insertion order, so bounds are tested ascending.
    for multiple, label in ratio_bands.items():
        if x <= multiple:
            return label
    return 'more_than_10_times'

# Bucket a registration age (in years) into a duration label
def registration_category(x):
    """Return the duration label for the registration age *x*.

    Inclusive upper bounds are 5, 10, 15, 20 and 30. Larger values, and
    values for which no comparison is true (NaN), map to
    'more_than_30_years'.
    """
    year_bands = [
        (5, 'upto_5_years'),
        (10, 'upto_10_years'),
        (15, 'upto_15_years'),
        (20, 'upto_20_years'),
        (30, 'upto_30_years'),
    ]
    return next(
        (label for max_years, label in year_bands if x <= max_years),
        'more_than_30_years',
    )

# Bucket an ID age (in years) into a duration label
def id_years_category(x):
    """Return the duration label for the ID age *x*.

    Up to 5 and up to 10 (both inclusive) have their own labels; any
    other value, NaN included, is 'more_than_10_years'.
    """
    if x <= 5:
        return 'upto_5_years'
    return 'upto_10_years' if x <= 10 else 'more_than_10_years'

# Bucket an employment duration into a label
def convert_employment_days(x):
    """Return the employment-duration label for *x*.

    Despite the function name, the thresholds and labels are expressed
    in years. Exactly 0 means 'no_employment'. Otherwise the inclusive
    upper bounds 2, 5 and 10 are tried in order, and anything left over
    (NaN included) is 'more_than_10_years'.
    """
    if x == 0:
        return 'no_employment'

    tenure_bands = (
        (2, 'upto_2_years'),
        (5, 'upto_5_years'),
        (10, 'upto_10_years'),
    )
    for max_years, label in tenure_bands:
        if x <= max_years:
            return label
    return 'more_than_10_years'

# Bucket an age into an age-range label
def convert_age_days(x):
    """Return the age-range label for *x*.

    Despite the function name, the thresholds (30, 40, 50, inclusive)
    and the labels are expressed in years. Values above 50, or values
    matching no comparison such as NaN, are 'more_than_50_years'.
    """
    if x <= 30:
        return 'upto_30_years'
    if x <= 40:
        return 'upto_40_years'
    if x <= 50:
        return 'upto_50_years'
    return 'more_than_50_years'

# Bucket a whole-year count (used for phone change) into a label
def year_categorize(x):
    """Return the label for the year count *x*.

    Exact matches for 0 through 4 have dedicated labels. Every other
    value (negative, fractional, larger than 4, or NaN) is
    'more_than_four'.
    """
    labels_by_year = (
        'same_year',
        'one_year',
        'two_year',
        'three_year',
        'four_year',
    )
    for years, label in enumerate(labels_by_year):
        if x == years:
            return label
    return 'more_than_four'

# Bucket a credit-bureau search count into a label
def bureau_categorize(x):
    """Return the label for the credit-bureau search count *x*.

    Exactly 0, 1 and 2 have dedicated labels; any other value,
    including negatives, fractions and NaN, is 'more_than_two_search'.
    """
    exact_counts = (
        (0, 'no_search'),
        (1, 'one_search'),
        (2, 'two_search'),
    )
    for count, label in exact_counts:
        if x == count:
            return label
    return 'more_than_two_search'

# Load the raw dataset; the path is relative to the current working directory.
df = pd.read_csv('Dataset.csv')

# Data cleaning and preprocessing.
# NOTE: statement order matters in this section. Derived columns are computed
# from columns that were cleaned and imputed by earlier statements, and some
# columns are overwritten in place.

# Income: strip stray characters and floor, fill missing values with the
# first reported mode of the column, then assign an income band.
df['Client_Income'] = df['Client_Income'].apply(clean_and_floor)
df['Client_Income'] = df['Client_Income'].fillna(df['Client_Income'].mode().iloc[0])
df['Client_Income_category'] = df['Client_Income'].apply(income_category)

# Credit amount: same clean / mode-impute / band sequence as income.
df['Credit_Amount'] = df['Credit_Amount'].apply(clean_and_floor)
df['Credit_Amount'] = df['Credit_Amount'].fillna(df['Credit_Amount'].mode().iloc[0])
df['Credit_Amount_category'] = df['Credit_Amount'].apply(credit_category)

# Credit-to-income ratio, rounded to 2 decimals, then banded.
# NOTE(review): a Client_Income of 0 is not guarded against here -- confirm
# the data cannot contain it or decide how such rows should be labelled.
df['Credit_to_Income_Ratio'] = (df['Credit_Amount'] / df['Client_Income']).round(2)
df['Credit_to_Income_Category'] = df['Credit_to_Income_Ratio'].apply(credit_income_category)

# Annuity: clean and mode-impute, then express it as a percentage of the
# credit amount (2 decimals) and band that percentage.
df['Loan_Annuity'] = df['Loan_Annuity'].apply(clean_and_floor)
df['Loan_Annuity'] = df['Loan_Annuity'].fillna(df['Loan_Annuity'].mode().iloc[0])
df['Loan_Annuity_percent'] = ((df['Loan_Annuity'] / df['Credit_Amount']) * 100).round(2)
df['Loan_Annuity_category'] = df['Loan_Annuity_percent'].apply(annuity_categorize)

# Missing family size is filled with Child_Count + 1
# (presumably the children plus the client -- TODO confirm).
df['Client_Family_Members'] = df['Client_Family_Members'].fillna(df['Child_Count'] + 1)
df['Client_Family_Members_Category'] = df['Client_Family_Members'].apply(family_categorize)

# Flag columns: a missing value is treated as 0, then the column is cast to int.
df['Car_Owned'] = df['Car_Owned'].fillna(0).astype(int)
df['Bike_Owned'] = df['Bike_Owned'].fillna(0).astype(int)
df['Active_Loan'] = df['Active_Loan'].fillna(0).astype(int)

# Missing House_Own becomes 1 when Own_House_Age is present and 0 otherwise;
# Own_House_Age is dropped afterwards.
df['House_Own'] = df['House_Own'].fillna(df['Own_House_Age'].notnull().astype(int))
df = df.drop(columns='Own_House_Age')

# Registration / ID durations: unparseable entries become NaN via
# errors='coerce', NaN is filled with the column's first reported mode, and
# the day counts are converted to years (divide by 365, round to 0 decimals)
# before banding.
df['Registration_Days'] = pd.to_numeric(df['Registration_Days'], errors='coerce')
df['ID_Days'] = pd.to_numeric(df['ID_Days'], errors='coerce')
df['Registration_Days'] = df['Registration_Days'].fillna(df['Registration_Days'].mode().iloc[0])
df['ID_Days'] = df['ID_Days'].fillna(df['ID_Days'].mode().iloc[0])
df['Registration_Years'] = (df['Registration_Days'] / 365).round(0)
df['Registration_Years_Category'] = df['Registration_Years'].apply(registration_category)
df['ID_Years'] = (df['ID_Days'] / 365).round(0)
df['ID_Years_Category'] = df['ID_Years'].apply(id_years_category)

# Employment: the value 365243 and missing values are both mapped to 0
# (365243 is presumably a "not employed" sentinel -- TODO confirm).
# The Employed_Days column is then overwritten with YEARS while keeping its
# original name, so the saved column no longer holds days.
df['Employed_Days'] = df['Employed_Days'].apply(clean_and_floor)
df['Employed_Days'] = df['Employed_Days'].replace(365243, 0).fillna(0)
df['Employed_Days'] = (df['Employed_Days'] / 365).round(0)
df['Employed_Days_Category'] = df['Employed_Days'].apply(convert_employment_days)

# Age: missing values default to 6570 days (18 * 365). As with employment,
# the Age_Days column is overwritten with YEARS under its original name.
df['Age_Days'] = df['Age_Days'].apply(clean_and_floor)
df['Age_Days'] = df['Age_Days'].fillna(6570)
df['Age_Days'] = (df['Age_Days'] / 365).round(0)
df['Age_Days_Category'] = df['Age_Days'].apply(convert_age_days)

# Phone change: missing -> 0; the category uses days / 365 truncated by
# astype(int), unlike the rounding used for the other durations above.
df['Phone_Change'] = df['Phone_Change'].fillna(0)
df['Phone_Change_category'] = (df['Phone_Change'] / 365).astype(int).apply(year_categorize)

# Credit bureau searches: missing -> 0, cast to int for exact-match banding.
df['Credit_Bureau'] = df['Credit_Bureau'].fillna(0)
df['Credit_Bureau_Category'] = df['Credit_Bureau'].astype(int).apply(bureau_categorize)

# Save the processed data (row index is not written to the file).
df.to_csv('processed_data_new.csv', index=False)

  # NOTE(review): this rebinds `df` to a fresh read of the raw CSV. The leading
  # indentation looks like notebook-export residue (possibly the start of a
  # later cell) and would raise IndentationError in a plain script -- confirm.
  df = pd.read_csv('Dataset.csv')
