# Project Overview

#### Problem Statement
Briefly describe the prediction task and why it matters for the business or users.

#### Dataset Description
Describe the dataset size, source, and feature types (numerical, categorical, text, etc.).

#### Target Variable and Evaluation Metrics
Define the target variable clearly. Specify the primary evaluation metric(s) and why they are appropriate for the problem.

# Imports and Setup

In [None]:
from pathlib import Path
import sys
import os
import yaml
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder

# Project root is the parent of the notebook's working directory; it is put on
# sys.path so that project-local packages can be imported further down.
ROOT = Path().resolve().parent
if str(ROOT) not in sys.path:
    # Guard keeps sys.path free of duplicates when this cell is re-run.
    sys.path.append(str(ROOT))

# Load config from YAML. The path is anchored on ROOT (same location as the
# former "../config.yaml") and decoded explicitly as UTF-8 so that the result
# does not depend on the platform's default locale encoding.
with open(ROOT / "config.yaml", "r", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

# Extract values from the "general" section of the config.
# A missing key raises KeyError here, before any data is loaded.
general_cfg = config["general"]
file_one_path = general_cfg["file_one_path"]
file_two_path = general_cfg["file_two_path"]
label = general_cfg["label"]
mode = general_cfg["mode"]

# Ensure the directory for saved artifacts exists (no error if already there).
SAVE_DIR = ROOT / "saved"
SAVE_DIR.mkdir(exist_ok=True)

# Import data utility functions
from utils.utils import (
    load_data,
    dedup,
    prepare_train_test_split
)

# Load and Clean Data

In [None]:
# Read both configured inputs; load_data hands back a (train, test) pair,
# where the test frame may be None.
train_df, test_df = load_data(file_one_path, file_two_path)

# Run the project's dedup helper on train, and on test only when it exists.
train_df = dedup(train_df)
test_df = dedup(test_df) if test_df is not None else None

In [None]:
# Feature-engineering hook: the same steps run on train and, when one was
# loaded, on test. Uncomment / adapt the templates below as needed.
for df in (train_df, test_df):
    if df is None:
        # No test frame available; nothing to transform.
        continue

    # Interaction example
    # df['Feature1_x_Feature2'] = df['Feature1'] * df['Feature2']

    # Binning example
    # df['AgeBin'] = pd.qcut(df['Age'], q=4, labels=False, duplicates='drop')

    # Flag example
    # df['IsHighValue'] = (df['PurchaseAmount'] > 1000).astype(int)

In [None]:
# Keep only the training rows whose class occurs at least twice in train.
counts = train_df[label].value_counts()
valid_classes = counts.loc[counts >= 2].index

keep_mask = train_df[label].isin(valid_classes)
train_df = train_df.loc[keep_mask]

# Fit the encoder on every label value it may later be asked to transform:
# train labels plus test labels whenever the test frame carries the label column.
test_has_labels = test_df is not None and label in test_df.columns
fit_labels = (
    pd.concat([train_df[label], test_df[label]])
    if test_has_labels
    else train_df[label]
)

le = LabelEncoder()
le.fit(fit_labels)

# Persist the fitted encoder alongside the other saved artifacts.
joblib.dump(le, SAVE_DIR / "label_encoder.pkl")
    

# Build the full / train / test / validation splits via the project helper.
X, y, X_train, X_test, y_train, y_test, X_val, y_val = prepare_train_test_split(
    train_df, test_df, label, le
)

# Persist every split for the next notebook; dict order is the write order.
artifacts = {
    "X.pkl": X,
    "y.pkl": y,
    "X_train.pkl": X_train,
    "y_train.pkl": y_train,
    "X_test.pkl": X_test,
    "y_test.pkl": y_test,
    "X_val.pkl": X_val,
    "y_val.pkl": y_val,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, SAVE_DIR / filename)
print("Successfully saved 8 files.")