# Project Overview

#### Problem Statement
Briefly describe the prediction task and why it matters for the business or users.

#### Dataset Description
Describe the dataset size, source, and feature types (numerical, categorical, text, etc.).

#### Target Variable and Evaluation Metrics
Define the target variable clearly. Specify the primary evaluation metric(s) and why they are appropriate for the problem.

# Imports and Setup

In [None]:
from pathlib import Path
import sys
import yaml
import joblib

# Project root: the parent of the (resolved) current working directory.
ROOT = Path().resolve().parent
SAVE_DIR = ROOT / "saved"

# Make project-local packages (e.g. ``utils``) importable. Guard against
# duplicates so re-running this cell does not keep growing sys.path.
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

# Decode the config explicitly as UTF-8 instead of relying on the
# platform/locale default encoding, which differs between machines.
with open(ROOT / "config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Pull the two input paths and the label from the "general" config section.
file_one_path = config["general"]["file_one_path"]
file_two_path = config["general"]["file_two_path"]
label = config["general"]["label"]

from utils.utils import (
    load_data,
    dedup,
    prepare_train_test_split
)

# Load and Clean Data

In [None]:
# Read the two configured input files; the second frame may come back as None.
train_df, test_df = load_data(file_one_path, file_two_path)

# Deduplicate every frame that was actually loaded.
train_df = dedup(train_df)
test_df = dedup(test_df) if test_df is not None else None

In [None]:
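# Feature engineering templates: uncomment and adapt as needed.
# `df` is a placeholder for the DataFrame being transformed (e.g. train_df),
# and the binning example also requires `import pandas as pd`.
#
# Interaction example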
# df['Feature1_x_Feature2'] = df['Feature1'] * df['Feature2']

# Binning example
# df['AgeBin'] = pd.qcut(df['Age'], q=4, labels=False, duplicates='drop')

# Flag example
# df['IsHighValue'] = (df['PurchaseAmount'] > 1000).astype(int)

In [None]:
# Unpack the six outputs produced by prepare_train_test_split.
X, y, X_train, X_test, y_train, y_test = prepare_train_test_split(train_df, test_df)

# Save the datasets for further use.
# Create the output directory first: joblib.dump does not create missing
# parent directories and would fail on a fresh checkout.
SAVE_DIR.mkdir(parents=True, exist_ok=True)

# File stem -> object, in the order the files are written.
datasets_to_save = {
    "X": X,
    "y": y,
    "X_train": X_train,
    "y_train": y_train,
    "X_test": X_test,
    "y_test": y_test,
}
for stem, obj in datasets_to_save.items():
    joblib.dump(obj, SAVE_DIR / f"{stem}.pkl")

# Derive the count from the mapping so the message always matches what was
# actually written.
print(f"Successfully saved {len(datasets_to_save)} files.")