# Data Setup

## Imports

In [1]:
import kagglehub
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from src.__00__paths import raw_data_dir, processed_data_dir, curated_data_dir
from pathlib import Path
import shutil

## Dataset Download

In [2]:
# List of files to check
raw_dataset = raw_data_dir / "loan_approval_dataset.csv"

# Check and download
if raw_dataset.exists():
    print("✔️ Dataset is already downloaded.")
else:
    # Download dataset
    dataset_path = Path(kagglehub.dataset_download("architsharma01/loan-approval-prediction-dataset"))

    if not dataset_path.exists():
        raise FileNotFoundError("⚠ Dataset not found.")

    # Check for an extra "Data" folder
    data_root = dataset_path / "Data" if (dataset_path / "Data").exists() else dataset_path

    # Copy files/folders to raw_data_dir
    for item in data_root.iterdir():
        target = raw_data_dir / item.name
        if item.is_file():
            shutil.copy2(item, target)

    print("✔️ Dataset successfully downloaded.")

✔️ Dataset is already downloaded.


## Load Datasets

In [6]:
raw_df = pd.read_csv(raw_dataset)
raw_df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


## Data Preprocessing

In [18]:
# Strip Spaces in Feature title
raw_df.columns = raw_df.columns.str.strip()

# Drop ID
processed_df = raw_df.drop(columns=['loan_id'])

# Label Encoding
processed_df['education'] = processed_df['education'].map({" Graduate": 1, " Not Graduate": 0})
processed_df['self_employed'] = processed_df['self_employed'].map({" Yes": 1, " No": 0})
processed_df['loan_status'] = processed_df['loan_status'].map({" Approved": 1, " Rejected": 0})

# Normalizing Numerical Features
numerical_features = ['income_annum', 'loan_amount', 'loan_term', 'cibil_score', 'residential_assets_value',
                      'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value']
scaler = StandardScaler()
processed_df[numerical_features] = scaler.fit_transform(processed_df[numerical_features])

processed_df.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,1,0,1.617979,1.633052,0.192617,1.032792,-0.780058,2.877289,0.832028,0.930304,1
1,0,0,1,-0.34175,-0.324414,-0.508091,-1.061051,-0.733924,-0.631921,-0.694993,-0.515936,0
2,3,1,0,1.439822,1.610933,1.594031,-0.54484,-0.0573,-0.107818,1.99652,2.407316,0
3,3,1,0,1.119139,1.721525,-0.508091,-0.771045,1.649637,-0.381263,0.897943,0.899533,0
4,5,0,1,1.689242,1.002681,1.594031,-1.264055,0.757724,0.735304,1.568075,0.007172,0


## Save Processed Data

In [19]:
processed_df.to_csv(processed_data_dir / 'processed_data.csv', index=False)
print(f"✔️ Dataset successfully saved to {'/'.join((processed_data_dir / 'processed_data.csv').parts[-3:])}")

✔️ Dataset successfully saved to data/processed/processed_data.csv


## Split & Save Curated Datasets

In [23]:
train_df, test_df = train_test_split(processed_df, test_size=0.2, random_state=42)

train_df.to_csv(curated_data_dir / 'train.csv', index=False)
test_df.to_csv(curated_data_dir / 'test.csv', index=False)

print(f"✔️ Test-Train Datasets saved to {'/'.join(curated_data_dir.parts[-2:])}")

✔️ Test-Train Datasets saved to data/curated
