# Preprocessing

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

## 1. Load Data

In [None]:
import kagglehub

# Download the dataset through kagglehub; the value it returns is used below
# as the directory that contains the CSV file.
path = kagglehub.dataset_download("harlfoxem/housesalesprediction")

# Read the house sales table from inside that directory.
csv_path = Path(path).joinpath("kc_house_data.csv")
df = pd.read_csv(csv_path)
print(f"Dataset shape: {df.shape}")

Dataset shape: (21613, 21)


## 2. Parse Date and Sort Temporally

Temporal ordering is **critical** for proper train/val/test splitting.

In [None]:
# Only the first eight characters of the raw "date" string are parsed, as a
# YYYYMMDD day; anything after them is ignored.
date_digits = df["date"].str.slice(stop=8)
df["date_parsed"] = pd.to_datetime(date_digits, format="%Y%m%d")

# temporal_train_val_test_split() sorts by date on its own. Ordering the frame
# here is only for readability and so the covered date range can be displayed.
df = df.sort_values("date_parsed", ignore_index=True)

print(f"Date range: {df['date_parsed'].min().date()} to {df['date_parsed'].max().date()}")


Date range: 2014-05-02 to 2015-05-27


## 3. Temporal Train/Validation/Test Split

We use a **three-way temporal split** to avoid both:
1. **Temporal leakage** (training on future data)
2. **Model selection leakage** (using test set for hyperparameter tuning)

For full rationale, see `p3-03-temporal_leakage.ipynb`.

In [None]:
def temporal_train_val_test_split(df: pd.DataFrame, 
                                   date_column: str,
                                   val_size: float = 0.15, 
                                   test_size: float = 0.15) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Split data chronologically into train, validation, and test sets.

    The rows are sorted by ``date_column`` and then cut by position: the
    earliest rows form the training set, the next ``val_size`` share the
    validation set, and the latest ``test_size`` share the test set.
    Because the cut is positional, rows that share the same date can end up
    on both sides of a boundary.

    Parameters
    ----------
    df : DataFrame
        Data to split. It is not modified; a sorted copy is made internally.
    date_column : str
        Name of the column containing datetime values for temporal ordering.
    val_size : float
        Proportion for validation set (default 0.15).
    test_size : float
        Proportion for test set (default 0.15).

    Returns
    -------
    tuple of (train_df, val_df, test_df)
        Independent copies, each sorted chronologically. The index is the row
        position in the sorted data, so it continues across the three frames.

    Raises
    ------
    ValueError
        If ``val_size`` or ``test_size`` is negative, or if together they
        leave no room for a training set (``val_size + test_size >= 1``).
    """
    # Without this check a sum above 1 produces a negative ``train_end`` and
    # ``iloc[:negative]`` silently returns almost all rows as "train".
    # Written as a single positive condition so NaN is rejected as well.
    if not (0 <= val_size < 1 and 0 <= test_size < 1 and val_size + test_size < 1):
        raise ValueError(
            "val_size and test_size must be non-negative and sum to less than 1; "
            f"got val_size={val_size!r}, test_size={test_size!r}"
        )

    # A stable sort keeps rows with equal dates in their original relative
    # order, so the rows sitting on a split boundary are placed predictably.
    df_sorted = df.sort_values(date_column, kind="mergesort").reset_index(drop=True)

    n = len(df_sorted)
    train_end = int(n * (1 - val_size - test_size))
    val_end = int(n * (1 - test_size))

    train_df = df_sorted.iloc[:train_end].copy()
    val_df = df_sorted.iloc[train_end:val_end].copy()
    test_df = df_sorted.iloc[val_end:].copy()

    return train_df, val_df, test_df


In [None]:
# Apply the split: 15% validation and 15% test, which leaves the earliest
# ~70% of the rows (ordered by `date_parsed`) for training.
train_df, val_df, test_df = temporal_train_val_test_split(df, date_column="date_parsed", val_size=0.15, test_size=0.15)

# Report each split's size as a record count and as a percentage of the full dataset.
print("Split sizes:")
print(f"  Train: {len(train_df):,} records ({len(train_df)/len(df)*100:.1f}%)")
print(f"  Val:   {len(val_df):,} records ({len(val_df)/len(df)*100:.1f}%)")
print(f"  Test:  {len(test_df):,} records ({len(test_df)/len(df)*100:.1f}%)")
print()
# Report the earliest and latest sale date in each split so the chronological
# ordering train -> val -> test can be checked by eye.
print("Date ranges:")
print(f"  Train: {train_df['date_parsed'].min().date()} to {train_df['date_parsed'].max().date()}")
print(f"  Val:   {val_df['date_parsed'].min().date()} to {val_df['date_parsed'].max().date()}")
print(f"  Test:  {test_df['date_parsed'].min().date()} to {test_df['date_parsed'].max().date()}")


Split sizes:
  Train: 15,129 records (70.0%)
  Val:   3,242 records (15.0%)
  Test:  3,242 records (15.0%)

Date ranges:
  Train: 2014-05-02 to 2015-01-16
  Val:   2015-01-16 to 2015-03-26
  Test:  2015-03-26 to 2015-05-27


## 4. Feature Engineering

Based on EDA findings, we'll:
1. Drop non-predictive columns
2. Extract temporal features
3. Create derived features

### 4.1 Drop Non-Predictive Columns

- **`id`**: Property identifier with 21,000+ unique values. Cannot be one-hot encoded (dimensionality explosion). Not predictive. **Must be dropped.** See `p3-02-repeated_ids.ipynb`.
- **`date`**: Original string format. Dropped only after the temporal features have been derived from its parsed form (`date_parsed`).
- **`zipcode`**: High cardinality categorical. We'll use lat/long instead.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class FeatureEngineer(BaseEstimator, TransformerMixin):
    """
    Feature engineering transformer for house price prediction.

    ``fit`` learns a single reference value, the earliest ``date_parsed`` in
    the data it is given. ``transform`` uses that stored value for every
    dataset, so validation and test rows are measured against the training
    reference instead of their own dates (avoids data leakage).

    The transformer takes no constructor parameters.

    Attributes
    ----------
    min_date_ : Timestamp
        Minimum date from training data, used to compute days_since_start.
    """

    def __init__(self):
        pass

    def fit(self, X: pd.DataFrame, y=None):
        """
        Fit the transformer by learning reference values from training data.

        Parameters
        ----------
        X : DataFrame
            Training data with 'date_parsed' column.
        y : array-like, optional
            Target variable (not used).

        Returns
        -------
        self
        """
        # Store the minimum date from training data
        # This prevents data leakage when transforming val/test sets
        self.min_date_ = X["date_parsed"].min()
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Apply feature engineering transformations.

        Adds ``days_since_start``, ``house_age``, ``was_renovated``,
        ``years_since_renovation``, ``basement_ratio``,
        ``living_vs_neighbors`` and ``lot_vs_neighbors``, then drops ``id``,
        ``date``, ``date_parsed``, ``zipcode``, ``yr_built`` and
        ``yr_renovated``. The input frame is not modified.

        Parameters
        ----------
        X : DataFrame
            Data with parsed date and raw features.

        Returns
        -------
        DataFrame
            Transformed data with engineered features.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``. This exception subclasses both
            ``ValueError`` and ``AttributeError``.
        KeyError
            If a column that is read or dropped is missing from ``X``.
        """
        # Imported locally so the class does not depend on an import that
        # lives in a different notebook cell.
        from sklearn.utils.validation import check_is_fitted

        # Fail with sklearn's standard "not fitted" error rather than an
        # AttributeError on min_date_ further down.
        check_is_fitted(self, "min_date_")

        X = X.copy()

        # Temporal feature: days since first sale in TRAINING dataset
        # Uses fitted min_date_ to ensure consistency across splits
        X["days_since_start"] = (X["date_parsed"] - self.min_date_).dt.days

        # Age at sale
        # NOTE(review): negative whenever yr_built is later than the sale
        # year; confirm whether such rows exist and how they should be treated.
        sale_year = X["date_parsed"].dt.year
        X["house_age"] = sale_year - X["yr_built"]

        # Was renovated (binary); yr_renovated == 0 is treated as "never renovated"
        X["was_renovated"] = (X["yr_renovated"] > 0).astype(int)

        # Years since renovation (0 if never renovated)
        X["years_since_renovation"] = np.where(
            X["yr_renovated"] > 0,
            sale_year - X["yr_renovated"],
            0
        )

        # In the three ratios below a zero denominator is replaced by 1 so the
        # division never produces inf/NaN.

        # Basement ratio
        X["basement_ratio"] = X["sqft_basement"] / X["sqft_living"].replace(0, 1)

        # Living area vs neighbors
        X["living_vs_neighbors"] = X["sqft_living"] / X["sqft_living15"].replace(0, 1)

        # Lot area vs neighbors
        X["lot_vs_neighbors"] = X["sqft_lot"] / X["sqft_lot15"].replace(0, 1)

        # Drop columns
        columns_to_drop = [
            "id",           # Not predictive (property identifier)
            "date",         # Original string (extracted features)
            "date_parsed",  # Used for splitting only
            "zipcode",      # High cardinality (using lat/long)
            "yr_built",     # Replaced by house_age
            "yr_renovated", # Replaced by was_renovated, years_since_renovation
        ]

        X = X.drop(columns=columns_to_drop)

        return X

### 4.2 Separate Features and Target

In [None]:
# Column the models will learn to predict.
target = "price"

# For each split, the features are the frame without the target column and
# the labels are that column on its own.
X_train, y_train = train_df.drop(columns=[target]), train_df[target]
X_val, y_val = val_df.drop(columns=[target]), val_df[target]
X_test, y_test = test_df.drop(columns=[target]), test_df[target]

print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_val:   {X_val.shape}, y_val:   {y_val.shape}")
print(f"X_test:  {X_test.shape}, y_test:  {y_test.shape}")

X_train: (15129, 22), y_train: (15129,)
X_val:   (3242, 22), y_val:   (3242,)
X_test:  (3242, 22), y_test:  (3242,)


## 5. Build Complete Preprocessing Pipeline

We'll create a complete scikit-learn pipeline that includes:
1. Feature engineering (custom transformer)
2. Log transformation for skewed features
3. Standard scaling

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer

In [None]:
# Run a throwaway FeatureEngineer over the training features, only to find out
# which columns exist once feature engineering has been applied.
temp_engineer = FeatureEngineer().fit(X_train)
temp_X = temp_engineer.transform(X_train)

# Square-footage columns: these get a log transform before scaling.
log_features = [
    "sqft_living",
    "sqft_lot",
    "sqft_above",
    "sqft_basement",
    "sqft_living15",
    "sqft_lot15",
]

# Columns left untouched (already on a reasonable scale or binary).
passthrough_features = ["waterfront", "was_renovated"]

# Whatever remains after the two groups above is standardized only.
already_assigned = set(log_features) | set(passthrough_features)
scale_features = [name for name in temp_X.columns if name not in already_assigned]

print(f"Log features ({len(log_features)}): {log_features}")
print(f"Passthrough ({len(passthrough_features)}): {passthrough_features}")
print(f"Scale only ({len(scale_features)}): {scale_features}")

Log features (6): ['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'sqft_living15', 'sqft_lot15']
Passthrough (2): ['waterfront', 'was_renovated']
Scale only (14): ['bedrooms', 'bathrooms', 'floors', 'view', 'condition', 'grade', 'lat', 'long', 'days_since_start', 'house_age', 'years_since_renovation', 'basement_ratio', 'living_vs_neighbors', 'lot_vs_neighbors']


In [None]:
# Branch for the log features: log1p first, then standardize the result.
log_pipeline = Pipeline(steps=[
    ("log", FunctionTransformer(func=np.log1p, validate=True, feature_names_out="one-to-one")),
    ("scale", StandardScaler()),
])

# One branch per feature group: log + scale, scale only, or leave as-is.
numeric_preprocessor = ColumnTransformer(transformers=[
    ("log", log_pipeline, log_features),
    ("scale", StandardScaler(), scale_features),
    ("passthrough", "passthrough", passthrough_features),
])

# End-to-end pipeline: engineered features are produced first and then fed
# into the per-group numeric preprocessing.
full_pipeline = Pipeline(steps=[
    ("feature_engineering", FeatureEngineer()),
    ("preprocessing", numeric_preprocessor),
])

print("Complete pipeline structure:")
print(full_pipeline)

The `feature_names_out="one-to-one"` argument tells sklearn that the `FunctionTransformer` produces exactly one output feature per input feature and keeps the input feature names. This is what allows `get_feature_names_out()` to work on the preprocessing step.

In [None]:
# Every fitted value (reference min_date, scaling parameters, ...) is learned
# from the training split and from nothing else.
full_pipeline.fit(X_train, y_train)

# Run each split through the already-fitted pipeline, in train/val/test order.
X_train_processed, X_val_processed, X_test_processed = (
    full_pipeline.transform(split) for split in (X_train, X_val, X_test)
)

print(f"Processed shapes: Train={X_train_processed.shape}, Val={X_val_processed.shape}, Test={X_test_processed.shape}")
print(f"\nFitted min_date from training: {full_pipeline.named_steps['feature_engineering'].min_date_.date()}")

Processed shapes: Train=(15129, 22), Val=(3242, 22), Test=(3242, 22)


### 5.1 Verify Feature Names After Transformation

In [None]:
# The names are read from the numeric preprocessing step, so they describe the
# columns as they exist after feature engineering.
preprocessing_step = full_pipeline.named_steps["preprocessing"]
feature_names = preprocessing_step.get_feature_names_out()
print(f"Total features: {len(feature_names)}")
print(f"Feature names: {list(feature_names)}")

Total features: 22
Feature names: ['log__sqft_living', 'log__sqft_lot', 'log__sqft_above', 'log__sqft_basement', 'log__sqft_living15', 'log__sqft_lot15', 'scale__bedrooms', 'scale__bathrooms', 'scale__floors', 'scale__view', 'scale__condition', 'scale__grade', 'scale__lat', 'scale__long', 'scale__days_since_start', 'scale__house_age', 'scale__years_since_renovation', 'scale__basement_ratio', 'scale__living_vs_neighbors', 'scale__lot_vs_neighbors', 'passthrough__waterfront', 'passthrough__was_renovated']


## 6. Save Processed Data

Save for use in the modeling notebook.

In [None]:
import joblib

# All artifacts go into one folder, created relative to the current working directory.
output_dir = Path("processed_data")
output_dir.mkdir(exist_ok=True)

# Save processed arrays
np.save(output_dir / "X_train.npy", X_train_processed)
np.save(output_dir / "X_val.npy", X_val_processed)
np.save(output_dir / "X_test.npy", X_test_processed)
np.save(output_dir / "y_train.npy", y_train.values)
np.save(output_dir / "y_val.npy", y_val.values)
np.save(output_dir / "y_test.npy", y_test.values)

# Save COMPLETE pipeline for inference (includes feature engineering + preprocessing)
# NOTE: joblib files are pickle-based; only load this file from a trusted location.
joblib.dump(full_pipeline, output_dir / "preprocessor.joblib")

# Save feature names as a plain unicode string array. get_feature_names_out()
# returns an object-dtype array, which np.save would pickle and which a
# default np.load (allow_pickle=False) then refuses to read.
np.save(output_dir / "feature_names.npy", np.asarray(feature_names, dtype=str))

print(f"Saved processed data to {output_dir.absolute()}")
print("\nSaved pipeline can now transform raw data (with date_parsed) end-to-end.")

Saved processed data to /media/NOCTURNOEXTRA/Alejandro/wip-clase/PIA-SAA/example_repos/king-county/processed_data


## 7. Summary

### What was done

1. **Temporal split** (70% train / 15% val / 15% test)
   - Train: oldest data
   - Validation: middle period (for model selection)
   - Test: most recent (for final evaluation only)

2. **Feature engineering** (as sklearn transformer)
   - **Custom `FeatureEngineer` transformer** ensures consistency across splits
   - Fits `min_date_` on training data (prevents data leakage)
   - Creates `days_since_start` using training reference
   - Creates derived features (house_age, was_renovated, ratios, etc.)
   - Drops non-predictive columns (id, zipcode, etc.)

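3. **Complete sklearn pipeline**
   - **Stage 1**: Feature engineering (custom transformer)
   - **Stage 2**: Log + scale for square footage features
   - **Stage 3**: Standard scaling for other numeric features
   - **Stage 4**: Passthrough for binary features
   - Note: Stages 2–4 are not sequential steps. They are the three parallel branches of a single `ColumnTransformer`, each applied to its own group of columns after Stage 1.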

### Files saved

```
processed_data/
├── X_train.npy
├── X_val.npy
├── X_test.npy
├── y_train.npy
├── y_val.npy
├── y_test.npy
├── preprocessor.joblib  # COMPLETE pipeline (feature eng + preprocessing)
└── feature_names.npy
```

### Key improvements

- ✅ **No data leakage**: `min_date_` fitted on training data only
- ✅ **Reproducible inference**: Complete pipeline saved, can transform raw data (the input must already contain the `date_parsed` column)
- ✅ **sklearn best practices**: All transformations encapsulated in pipeline
- ✅ **Coherent design**: fit() on train, transform() on all splits

### Next steps

Continue to modeling notebook for model training and evaluation.