# Project Overview

#### Problem Statement
Briefly describe the prediction task and why it matters for the business or users.

#### Dataset Description
Describe the dataset size, source, and feature types (numerical, categorical, text, etc.).

#### Target Variable and Evaluation Metrics
Define the target variable clearly. Specify the primary evaluation metric(s) and why they are appropriate for the problem.

# Imports and Setup

In [32]:
# --- Standard library ---
import importlib
import os
import sys

# --- Third-party ---
import joblib
import pandas as pd

# The project's `utils` package is imported from the parent directory, so that
# directory has to be on the module search path before the imports below run.
sys.path.append('../')

# --- Project modules ---
# Reload the helper modules so that their module-level code is re-executed and
# any edits made to them since the kernel started are picked up. The reloads
# must happen BEFORE the `from ... import ...` lines further down; otherwise
# those names would stay bound to the pre-reload objects.
import utils.utils as utils
importlib.reload(utils)
import utils.config as config
importlib.reload(config)

# Output directory for the pickled artifacts written at the end of this notebook.
os.makedirs('saved', exist_ok=True)

# Project-specific settings and helpers used by the cells below.
from utils.config import (
    file_one_path,
    file_two_path,
    label,
)
from utils.utils import (
    load_data,
    dedup,
    prepare_train_test_split,
)

# Load and Clean Data

In [None]:
# Read the two configured input files. The helper returns a (train, test) pair;
# the test frame may be None, which is why it is guarded below.
train_df, test_df = load_data(file_one_path, file_two_path)

# Run the project's `dedup` helper on each frame that exists
# (presumably removes duplicated rows -- see utils.utils.dedup to confirm).
train_df = dedup(train_df)
test_df = dedup(test_df) if test_df is not None else None

Successfully deleted 877 duplicated examples.
Successfully saved 6 files.


In [None]:
# Feature engineering (template).
# The same transformations are applied to the train frame and, when one exists,
# to the test frame. Uncomment and adapt the examples below.
#
# NOTE: add features by assigning into `df[...]`, which modifies the frame in
# place. Rebinding the loop variable (e.g. `df = df.assign(...)`) would NOT
# update `train_df` / `test_df`.
for df in [train_df, test_df]:
    # Skip a missing test set. This guard also keeps the loop body valid
    # Python while every example below is still commented out.
    if df is None:
        continue

    # Interaction example
    # df['Feature1_x_Feature2'] = df['Feature1'] * df['Feature2']

    # Binning example
    # df['AgeBin'] = pd.qcut(df['Age'], q=4, labels=False, duplicates='drop')

    # Date/time example
    # df['SignupMonth'] = pd.to_datetime(df['SignupDate'], errors='coerce').dt.month

    # Flag example
    # df['IsHighValue'] = (df['PurchaseAmount'] > 1000).astype(int)

In [None]:
# Derive the full and train/test feature and target sets from the cleaned frames.
# The helper returns a seventh value that this notebook does not use.
X, y, X_train, X_test, y_train, y_test, _ = prepare_train_test_split(train_df, test_df, label)

# Persist every split so the next notebook can load it from the `saved/` directory.
# Each entry pairs the destination file with the object written to it.
pickle_targets = [
    ('saved/X.pkl', X),
    ('saved/y.pkl', y),
    ('saved/X_train.pkl', X_train),
    ('saved/y_train.pkl', y_train),
    ('saved/X_test.pkl', X_test),
    ('saved/y_test.pkl', y_test),
]
for pkl_path, obj in pickle_targets:
    joblib.dump(obj, pkl_path)
print("Successfully saved 6 files.")