# Data Ingestion and Preprocessing
This notebook walks through ingesting, cleaning, and preprocessing a loan default dataset. The steps are:
- Importing necessary libraries
- Loading the dataset
- Initial data exploration
- Handling missing values
- Data cleaning and transformation
- Feature engineering
- Encoding categorical variables
- Saving the processed data

In [None]:
# Step 1: Import Libraries
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import math
from sklearn.preprocessing import LabelEncoder

## Step 2: Load Dataset

In [None]:
# Load the dataset from CSV into `df`, the DataFrame that the following
# cells read and modify.
file_path = 'Dataset.csv'  # Replace with the actual file path
df = pd.read_csv(file_path)

# Display basic information about the dataset.
# df.info() writes its own report, so it is not wrapped in print().
print("Dataset Info:")
df.info()
print("\nFirst 5 Rows:")
# NOTE(review): a bare expression is only rendered when it is the last
# statement of a notebook cell; run as a plain script it shows nothing.
df.head()

## Step 3: Initial Data Exploration

In [None]:
# Per-column count of null entries, printed under its heading
print("Missing Values:", df.isnull().sum(), sep="\n")

# Descriptive statistics as reported by DataFrame.describe()
print("\nSummary Statistics:", df.describe(), sep="\n")

## Step 4: Data Cleaning

In [None]:
# Function to clean and floor numerical values
def clean_and_floor(value):
    """Strip non-numeric characters from *value* and floor the result.

    Every character other than a digit or ``.`` is removed (this drops
    currency symbols, thousands separators, whitespace and also any minus
    sign) before the remainder is parsed as a float and floored.

    Args:
        value: A raw cell value; it is converted with ``str()`` before
            cleaning.

    Returns:
        ``value`` unchanged when ``pd.isna(value)`` is true; the floored
        integer when the cleaned text parses as a finite number; ``None``
        when nothing parseable is left.
    """
    if pd.isna(value):
        return value
    cleaned_value = re.sub(r'[^\d.]', '', str(value))
    if not cleaned_value:
        return None
    try:
        return math.floor(float(cleaned_value))
    except (ValueError, OverflowError):
        # Leftovers such as "." or "1.2.3" are not valid numbers, and an
        # absurdly long digit run parses to infinity, which cannot be
        # floored. Treat both as missing instead of aborting the whole
        # column transformation.
        return None

# Apply cleaning to numerical columns
numerical_columns = ['Client_Income', 'Credit_Amount', 'Loan_Annuity']
for col in numerical_columns:
    # clean_and_floor can return None for unparseable cells, which may leave
    # the column with object dtype; coerce to a numeric dtype so the
    # arithmetic performed in later steps behaves predictably.
    df[col] = pd.to_numeric(df[col].apply(clean_and_floor), errors='coerce')

# Replace missing values with the column mode (its most frequent value)
for col in numerical_columns:
    modes = df[col].mode()
    # mode() is empty when the column holds no valid values at all; there is
    # nothing to impute with in that case, so the column is left untouched
    # rather than failing on .iloc[0].
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

## Step 5: Feature Engineering

In [None]:
# Map a numeric income onto one of six ordered band labels
def income_category(x):
    """Return the income band label for *x*.

    The first five bands are closed on the right (<= 6000, <= 10000,
    <= 15000, <= 20000, <= 50000). A value that satisfies none of these
    comparisons is labelled 'income_band6'.
    """
    upper_bounds = (
        (6000, 'income_band1'),
        (10000, 'income_band2'),
        (15000, 'income_band3'),
        (20000, 'income_band4'),
        (50000, 'income_band5'),
    )
    # Bounds are listed in ascending order, so the first match is the band.
    for limit, label in upper_bounds:
        if x <= limit:
            return label
    return 'income_band6'

# Label every row with the band returned by income_category for its income
df['Client_Income_category'] = df['Client_Income'].apply(income_category)

# Derived feature: credit amount divided by income, rounded to two decimals
df['Credit_to_Income_Ratio'] = df['Credit_Amount'].div(df['Client_Income']).round(2)

# Map a credit-to-income ratio onto a descriptive bucket label
def credit_income_category(x):
    """Return the bucket label for the ratio *x*.

    The first five buckets are closed on the right (<= 2, <= 3, <= 4, <= 5,
    <= 10). A value that satisfies none of these comparisons is labelled
    'more_than_10_times'.
    """
    ceilings = (
        (2, 'Upto_2_times'),
        (3, 'Upto_3_times'),
        (4, 'Upto_4_times'),
        (5, 'Upto_5_times'),
        (10, 'Upto_10_times'),
    )
    # Ceilings are listed in ascending order, so the first match is the bucket.
    for ceiling, label in ceilings:
        if x <= ceiling:
            return label
    return 'more_than_10_times'

# Label every row with the bucket that credit_income_category returns for
# its Credit_to_Income_Ratio value.
df['Credit_to_Income_Category'] = df['Credit_to_Income_Ratio'].apply(credit_income_category)

## Step 6: Handle Categorical Variables

In [None]:
# Columns handled as categorical features in this step
categorical_columns = ['Client_Occupation', 'Type_Organization', 'Client_Education']

# Missing categories become the explicit placeholder 'Unknown'
for col in categorical_columns:
    df[col] = df[col].fillna('Unknown')

# One dedicated LabelEncoder per column, stored in label_encoders under the
# column name; each column is then replaced by its fitted integer codes
label_encoders = {col: LabelEncoder() for col in categorical_columns}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col])

## Step 7: Save Processed Data

In [None]:
# Save the processed dataset to CSV and report where it was written.
output_file = 'processed_data.csv'
# index=False keeps the DataFrame's row index out of the written file.
df.to_csv(output_file, index=False)
print(f"Processed data saved to {output_file}")