# Setup

In [None]:
from pathlib import Path
import pandas as pd
import sys
import yaml
import joblib

# Make the repository root importable so the `src.*` project imports resolve.
# NOTE: the root is taken as the parent of the *current working directory*
# (Path() is cwd-relative), so this relies on where the notebook is launched.
ROOT_PATH = Path().resolve().parent
# Guard the append so re-running this cell does not pile up duplicate
# entries in sys.path.
if str(ROOT_PATH) not in sys.path:
    sys.path.append(str(ROOT_PATH))

from src.utils.features import add_features
from src.utils.preprocessing import prepare_train_val_split, dedup
from src.utils.config import (
    SAVE_PATH,
    CONFIG_PATH
)


# Parse the project's YAML configuration into `config`.
# The encoding is given explicitly so the result does not depend on the
# platform's locale default (e.g. cp1252 on Windows); safe_load is used so
# the file cannot instantiate arbitrary Python objects.
with open(CONFIG_PATH, encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Load and Clean Data

In [2]:
# All input locations are stored under the "general" section of the config.
general_cfg = config["general"]
main_file_path = general_cfg["main_file_path"]
bureau_file_path = general_cfg["bureau_file_path"]
bureaubal_file_path = general_cfg["bureaubal_file_path"]
prevapp_file_path = general_cfg["prevapp_file_path"]
credit_card_path = general_cfg["credit_card_path"]
installments_path = general_cfg["installments_path"]
pos_cash_path = general_cfg["pos_cash_path"]

# Read every CSV into a DataFrame; the order of the names on the left
# matches the order of the paths on the right.
(
    main_df,
    bureau_df,
    bureau_balance_df,
    previous_application_df,
    credit_card_df,
    installments_df,
    pos_cash_df,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        main_file_path,
        bureau_file_path,
        bureaubal_file_path,
        prevapp_file_path,
        credit_card_path,
        installments_path,
        pos_cash_path,
    )
)

# Extend the main table with features built from the auxiliary tables.
main_df = add_features(
    main_df,
    bureau_df,
    bureau_balance_df,
    previous_application_df,
    credit_card_df,
    installments_df,
    pos_cash_df,
)

# Run the project's deduplication helper on the enriched table.
main_df = dedup(main_df)

No duplicated examples found!


In [3]:
# Unpack the six objects returned by the project's split helper.
X, y, X_train, X_val, y_train, y_val = prepare_train_val_split(main_df)

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first (a no-op when it is already there).
Path(SAVE_PATH).mkdir(parents=True, exist_ok=True)

# Save the data for further use. Kept as the final expression so the
# notebook still displays the list of written files.
joblib.dump(
    {
        "X": X,
        "y": y,
        "X_train": X_train,
        "X_val": X_val,
        "y_train": y_train,
        "y_val": y_val,
    },
    SAVE_PATH / "data.pkl",
)

['C:\\Users\\rober\\Custom testing\\Home-Credit-Default-Risk\\saved\\data.pkl']