# Project Overview

#### Problem Statement
Briefly describe the prediction task and why it matters for the business or users.

#### Dataset Description
Describe the dataset size, source, and feature types (numerical, categorical, text, etc.).

#### Target Variable and Evaluation Metrics
Define the target variable clearly. Specify the primary evaluation metric(s) and why they are appropriate for the problem.

# Imports and Setup

In [None]:
from pathlib import Path
import pandas as pd
import sys
import yaml
import joblib

# Make the parent of the current working directory importable so the
# project-local imports in this cell resolve.
# NOTE(review): assumes the notebook is run from a direct subdirectory of the
# repository root — confirm.
ROOT_PATH = Path().resolve().parent
# Guard the append so re-running this cell does not stack duplicate entries
# onto sys.path.
if str(ROOT_PATH) not in sys.path:
    sys.path.append(str(ROOT_PATH))

from src.utils.config import (
    SAVE_PATH,
    CONFIG_PATH
)

# Load the project configuration. The encoding is pinned so the YAML file is
# decoded the same way on every platform instead of falling back to the
# locale default (e.g. cp1252 on Windows).
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# file_path = config["general"]["file_path"]

from src.utils.preprocessing import prepare_train_test_split, dedup

from features import add_custom_features

C:\Users\rober\Documents\GitHub\ML-Templates


# Load and Clean Data

In [2]:
# Resolve the location of each input table from the "general" config section.
_general_cfg = config["general"]
main_file_path = _general_cfg["main_file_path"]
bureau_file_path = _general_cfg["bureau_file_path"]
bureaubal_file_path = _general_cfg["bureaubal_file_path"]
prevapp_file_path = _general_cfg["prevapp_file_path"]

# Read the four CSV files, in the same order as the paths above.
main_df, bureau_df, bureau_balance_df, previous_application_df = map(
    pd.read_csv,
    (main_file_path, bureau_file_path, bureaubal_file_path, prevapp_file_path),
)

# Combine the auxiliary tables into the main frame via the project helper,
# rebinding main_df so the raw frame is not kept alive longer than needed.
main_df = add_custom_features(
    main_df,
    bureau_df,
    bureau_balance_df,
    previous_application_df,
)

# Run the project's deduplication step on the result.
main_df = dedup(main_df)

In [3]:
# Unpack the six objects returned by the project's split helper.
# NOTE(review): presumably the full X/y plus their train/test partitions —
# confirm against src.utils.preprocessing.
X, y, X_train, X_test, y_train, y_test = prepare_train_test_split(main_df)

# Create the output directory first: a fresh checkout may not have it yet,
# and joblib.dump does not create missing parent directories.
Path(SAVE_PATH).mkdir(parents=True, exist_ok=True)

# Save the datasets for further use. The dump stays the last expression so
# the cell still echoes the list of written file paths.
joblib.dump(
    {
        "X": X,
        "y": y,
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test,
    },
    SAVE_PATH / "data.pkl",
)

['C:\\Users\\rober\\Documents\\GitHub\\ML-Templates\\saved\\data.pkl']