# 🧹 Data Preprocessing & Feature Engineering

> Competition: {{ COMPETITION_NAME }}

---

## 📦 Setup

In [None]:
import sys
sys.path.append('../../..')

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

from shared.utils import set_seed, reduce_memory_usage
from shared.data import (
    load_competition_data,
    create_folds,
    basic_preprocessing,
    save_processed_data,
    get_feature_types
)

# Use one fixed seed for the whole notebook.
# NOTE(review): presumably this seeds the RNGs used downstream so reruns are reproducible — confirm in shared.utils.set_seed
set_seed(42)

# IPython magics (not plain Python): with mode 2, imported modules are
# reloaded automatically before each cell executes, so edits to the shared/
# helpers are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## 📂 Load Data

In [None]:
# Notebook-wide configuration; later cells read these names directly.
COMPETITION_PATH = '.'  # '.' is the current working directory
TARGET_COL = 'target'  # Update this: name of the label column
ID_COL = 'id'  # Update this: name of the row-identifier column

# Load both splits from COMPETITION_PATH.
# NOTE(review): later cells call DataFrame methods (isnull, groupby, select_dtypes)
# on the results, so both are expected to be pandas DataFrames — confirm in shared.data
train, test = load_competition_data(COMPETITION_PATH)

## 🧹 Data Cleaning

In [None]:
# Basic preprocessing
# Both splits are passed in together and both are replaced by the returned frames.
# NOTE(review): the actual cleaning steps are defined in shared.data.basic_preprocessing — check there for what is changed
train, test = basic_preprocessing(train, test, target_col=TARGET_COL)

### Handle Missing Values

In [None]:
# Check missing values
# Count NaNs per column, then show only the columns that have at least one.
train_na_counts = train.isnull().sum()
print("Train missing values:")
print(train_na_counts[train_na_counts > 0])

test_na_counts = test.isnull().sum()
print("\nTest missing values:")
print(test_na_counts[test_na_counts > 0])

In [None]:
# Fill missing values strategy
# Numerical: median of the TRAIN column, reused for test so that no test
#            statistics leak into preprocessing
# Categorical: the constant placeholder 'Unknown'
#
# The column lists come from train only, so they can contain columns that do
# not exist in test (typically the target). Those are filled in train and
# skipped for test instead of raising KeyError.
# NOTE(review): assumes get_feature_types returns a dict with 'numerical' and
# 'categorical' lists of column names — confirm in shared.data

feature_types = get_feature_types(train)

for col in feature_types['numerical']:
    median_val = train[col].median()
    train[col] = train[col].fillna(median_val)
    if col in test.columns:
        test[col] = test[col].fillna(median_val)

for col in feature_types['categorical']:
    train[col] = train[col].fillna('Unknown')
    if col in test.columns:
        test[col] = test[col].fillna('Unknown')

## 🔧 Feature Engineering

In [None]:
def create_features(df):
    """Return a copy of ``df`` with engineered features added.

    The input frame is left untouched; new columns belong on the copy.
    This is a template: no features are added until the section below
    is filled in.
    """
    engineered = df.copy()

    # Add your feature engineering here
    # Example:
    # engineered['feature_sum'] = engineered['feat1'] + engineered['feat2']
    # engineered['feature_ratio'] = engineered['feat1'] / (engineered['feat2'] + 1e-8)

    return engineered

# Run the same feature function on both splits so they end up with the same
# engineered columns.
train = create_features(train)
test = create_features(test)

### Encode Categorical Variables

In [None]:
# Label encoding for categorical features
# The column list is taken from train, so it can include columns that test
# does not have (e.g. a string-valued target). Those are fitted and encoded on
# train alone instead of raising KeyError on test[col].
cat_cols = train.select_dtypes(include=['object', 'category']).columns.tolist()

for col in cat_cols:
    # Cast once and reuse the result for both fit and transform
    train_str = train[col].astype(str)
    in_test = col in test.columns

    le = LabelEncoder()
    if in_test:
        test_str = test[col].astype(str)
        # Fit on combined data so values seen only in test still get a code
        le.fit(list(train_str) + list(test_str))
        test[col] = le.transform(test_str)
    else:
        le.fit(list(train_str))
    train[col] = le.transform(train_str)

print(f"Encoded {len(cat_cols)} categorical columns")

## 📊 Create CV Folds

In [None]:
# Create stratified folds
# Requests 5 folds stratified on TARGET_COL; only the train split is folded.
# NOTE(review): the next cell groups by a 'fold' column, so create_folds is
# expected to add one to the returned frame — confirm in shared.data
train = create_folds(train, target_col=TARGET_COL, n_folds=5, stratified=True)

In [None]:
# Verify fold distribution
# Mean of the target within each fold; as the last expression in the cell the
# result is displayed by the notebook. Similar values across folds indicate
# the target is balanced between them.
train.groupby('fold')[TARGET_COL].mean()

## 💾 Save Processed Data

In [None]:
# Reduce memory
# Both splits are replaced by whatever the helper returns.
# NOTE(review): presumably this downcasts column dtypes — confirm in
# shared.utils.reduce_memory_usage, since the reduced frames are what gets saved
train = reduce_memory_usage(train)
test = reduce_memory_usage(test)

In [None]:
# Write both processed splits as CSV files named below.
# NOTE(review): the output directory is derived from COMPETITION_PATH inside
# save_processed_data — confirm the exact location in shared.data
save_processed_data(train, COMPETITION_PATH, 'train_processed.csv')
save_processed_data(test, COMPETITION_PATH, 'test_processed.csv')

In [None]:
# Summarise the final shapes and the columns that remain as model inputs.
print(f"\nFinal train shape: {train.shape}")
print(f"Final test shape: {test.shape}")

# Target, fold assignment and row id are bookkeeping columns, not features
non_feature_cols = [TARGET_COL, 'fold', ID_COL]
feature_cols = [name for name in train.columns if name not in non_feature_cols]
print(f"\nFeatures: {feature_cols}")

---
**Next Steps**: Proceed to `03_modeling.ipynb`