# End-to-End Machine Learning Project
## Lesson 2.5 — Data Preparation & Feature Engineering

This notebook focuses on transforming raw training data into a form suitable
for machine learning models.

We handle:
- missing values
- feature scaling
- categorical encoding
- reproducible preprocessing pipelines

All transformations are learned **only from the training data**.


In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_california_housing
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


In [2]:
# Load the California housing data as a DataFrame and attach the regression target.
housing = fetch_california_housing(as_frame=True)

housing_df = housing.data.copy()
housing_df['target'] = housing.target

# Bucket median income into five ordered categories. In this section the
# column is used only as the stratification key and is dropped from train_set.
housing_df['income_cat'] = pd.cut(
    housing_df['MedInc'],
    bins=[0., 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=[1, 2, 3, 4, 5]
)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so take the single (train, test) index pair directly instead of
# looping. test_idx stays bound, exactly as the loop form left it.
train_idx, test_idx = next(split.split(housing_df, housing_df['income_cat']))

# split() yields positional row indices, so select with .iloc; label-based
# .loc only gives the same rows while the frame keeps a default RangeIndex.
train_set = housing_df.iloc[train_idx].drop(columns='income_cat')

In [3]:
# Separate the label column from the predictor columns of the training set.
y_train = train_set['target'].copy()
X_train = train_set.drop(columns='target')

In [5]:
# Learn per-column medians from the training features, then use them to
# fill any missing values in those same features.
imputer = SimpleImputer(strategy='median').fit(X_train)

X_train_imputed = imputer.transform(X_train)

In [6]:
# Re-wrap the imputed values as a DataFrame carrying X_train's row index
# and column names.
X_train_imputed = pd.DataFrame(
    data=X_train_imputed, index=X_train.index, columns=X_train.columns
)

In [7]:
# Learn the scaling statistics from the imputed training features, then
# standardize those same features.
scaler = StandardScaler().fit(X_train_imputed)

X_train_scaled = scaler.transform(X_train_imputed)

In [8]:
# Chain median imputation and standard scaling into a single estimator, so
# both steps are fitted and applied together in a fixed order.
# Pipeline is imported with the other dependencies in the import cell.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [9]:
# Fit the pipeline's imputer and scaler on X_train and transform it in one call.
X_train_prepared = num_pipeline.fit_transform(X_train)