# End-to-End Machine Learning Project
## Lesson 2.6 — Model Training & Baseline Models

This notebook trains baseline and simple machine learning models to establish
reference performance levels.

The goals are to:
- avoid misleading results
- compare models fairly
- understand bias–variance behavior in practice


In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
from sklearn.datasets import fetch_california_housing
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression

In [None]:
# Load the California housing data as a DataFrame and append the target column.
housing = fetch_california_housing(as_frame=True)

housing_df = housing.data.copy()
housing_df["target"] = housing.target

# Bucket median income into five categories; the column is used only to
# stratify the split and is dropped from the training frame afterwards.
housing_df["IncomeCat"] = pd.cut(
    housing_df["MedInc"],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=[1, 2, 3, 4, 5]
)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so take the single (train, test) index pair directly instead of
# looping over a one-element generator.
train_idx, test_idx = next(split.split(housing_df, housing_df["IncomeCat"]))

# split() yields positional row indices, so select with .iloc; .loc would only
# be correct while the frame keeps its default 0..n-1 index.
train_set = housing_df.iloc[train_idx].drop("IncomeCat", axis=1)

In [4]:
# Separate the label from the predictors; the label is copied so it is
# independent of train_set.
y_train = train_set["target"].copy()
X_train = train_set.drop(columns=["target"])

In [5]:
# Numeric preprocessing reused by the models below: median imputation
# followed by standard scaling.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

In [6]:
# Reference model: always predicts the mean of the training targets. It is
# wrapped in the same preprocessing pipeline as the linear regression model.
baseline_model = Pipeline(
    steps=[
        ("preprocessing", num_pipeline),
        ("model", DummyRegressor(strategy="mean")),
    ]
)


In [7]:
# 5-fold cross-validation of the baseline, scored with negated RMSE.
baseline_scores = cross_val_score(
    estimator=baseline_model,
    X=X_train,
    y=y_train,
    cv=5,
    scoring="neg_root_mean_squared_error",
)


In [8]:
# The scorer reports negated RMSE, so flip the sign of the fold average.
baseline_rmse = -np.mean(baseline_scores)
baseline_rmse


np.float64(1.1570227733753982)

In [9]:
# First real model: ordinary linear regression behind the same preprocessing
# pipeline as the baseline.
lin_reg_model = Pipeline(
    steps=[
        ("preprocessing", num_pipeline),
        ("model", LinearRegression()),
    ]
)


In [10]:
# Same 5-fold protocol and metric as the baseline so the two RMSE values are
# directly comparable.
lin_reg_scores = cross_val_score(
    estimator=lin_reg_model,
    X=X_train,
    y=y_train,
    cv=5,
    scoring="neg_root_mean_squared_error",
)

# The scorer reports negated RMSE, so flip the sign of the fold average.
lin_reg_rmse = -np.mean(lin_reg_scores)
lin_reg_rmse


np.float64(0.7311763452286281)