# California Housing Price Prediction
Complete Machine Learning Pipeline with Random Forest

In [None]:
%%capture --no-display
# Dependency installation (if needed)
# !pip install shap seaborn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.base import BaseEstimator, TransformerMixin
import joblib
import shap
import logging
import warnings

# Silence every warning category for the rest of the session.
warnings.filterwarnings(action="ignore")

# Route INFO-and-above log records to housing.log using a
# "timestamp - level - message" line layout.
_LOG_OPTIONS = {
    'format': '%(asctime)s - %(levelname)s - %(message)s',
    'level': logging.INFO,
    'filename': 'housing.log',
}
logging.basicConfig(**_LOG_OPTIONS)

## 1. Data Loading & Cleaning

In [None]:
class DataLoader:
    """Read the housing CSV and apply the notebook's basic cleaning rules.

    Attributes:
        url: Location (path, URL or file-like object) given to ``pd.read_csv``.
        df: The cleaned DataFrame after ``load_data`` has run, else ``None``.
    """

    def __init__(self, url):
        self.df = None
        self.url = url

    def load_data(self):
        """Read the CSV, clean it in place, and return the resulting frame.

        Any exception raised while reading or cleaning is written to the log
        and then re-raised unchanged.
        """
        try:
            self.df = pd.read_csv(self.url)
            self._clean_data()
            frame = self.df
            logging.info(f"Data loaded successfully. Shape: {frame.shape}")
            return frame
        except Exception as exc:
            logging.error(f"Data loading failed: {str(exc)}")
            raise

    def _clean_data(self):
        """Drop incomplete rows, cap the target, and replace zero counts."""
        # Rows without an income or age value are discarded outright.
        frame = self.df.dropna(subset=['median_income', 'housing_median_age'])
        self.df = frame

        # Cap house values at 500000.
        frame['median_house_value'] = frame['median_house_value'].clip(upper=500000)

        # Zero counts become 1 (presumably so later ratios stay finite).
        for column in ('total_rooms', 'households'):
            frame[column] = frame[column].replace(0, 1)

In [None]:
# Download the housing CSV from the handson-ml2 repository and clean it
HOUSING_CSV_URL = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv"
loader = DataLoader(url=HOUSING_CSV_URL)
df = loader.load_data()

# Quick sanity check: dimensions plus the first few rows
print("Data shape:", df.shape)
df.head()

## 2. Feature Engineering

In [None]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Stateless transformer that derives the model's input features.

    ``transform`` expects a DataFrame containing the columns ``total_rooms``,
    ``households``, ``total_bedrooms``, ``housing_median_age``, ``latitude``,
    ``longitude``, ``median_income`` and ``ocean_proximity``. It returns a new
    DataFrame holding only the columns named in ``feature_names_``; the input
    frame is not modified.
    """

    # Output columns, in the order transform() returns them.
    _OUTPUT_COLUMNS = (
        'median_income', 'house_age', 'rooms_per_household',
        'bedrooms_ratio', 'distance_to_coast', 'ocean_proximity',
    )

    def __init__(self):
        self.feature_names_ = []

    def fit(self, X, y=None):
        """Record the output column names; nothing is learned from ``X``.

        Returns:
            The transformer itself, so calls can be chained.
        """
        # Populate the names here so get_feature_names_out() is already
        # correct on a fitted transformer that has not transformed yet.
        self.feature_names_ = list(self._OUTPUT_COLUMNS)
        return self

    def transform(self, X):
        """Generate the engineered features.

        Args:
            X: DataFrame with the raw housing columns listed on the class.

        Returns:
            DataFrame with the columns in ``feature_names_``.

        Raises:
            KeyError: If a required input column is missing.
        """
        df = X.copy()

        # A zero denominator would yield inf, which most estimators reject;
        # mapping it to NaN lets a downstream imputer deal with it instead.
        households = df['households'].replace(0, np.nan)
        total_rooms = df['total_rooms'].replace(0, np.nan)

        # Ratio features, log1p-compressed to tame their long right tails.
        df['rooms_per_household'] = np.log1p(df['total_rooms'] / households)
        df['bedrooms_ratio'] = np.log1p(df['total_bedrooms'] / total_rooms)

        # Linear re-expression of the median age relative to the year 2023.
        df['house_age'] = 2023 - df['housing_median_age']

        # Euclidean distance (in degrees) from the fixed reference point
        # (lat 34.42, lon -118.49).
        # NOTE(review): this is a single point, not a coastline — confirm
        # the feature name matches the intent.
        df['distance_to_coast'] = np.sqrt(
            (df['latitude'] - 34.42) ** 2 +
            (df['longitude'] + 118.49) ** 2
        )

        # Kept for callers that use transform() without calling fit() first.
        self.feature_names_ = list(self._OUTPUT_COLUMNS)
        return df[self.feature_names_]

    def get_feature_names_out(self, input_features=None):
        """Return the list of output column names (``input_features`` is ignored)."""
        return self.feature_names_

In [None]:
# Build the engineered feature frame and pull out the regression target
engineer = FeatureEngineer()
X = engineer.fit_transform(df)
y = df['median_house_value']

# Histogram with a KDE overlay for the rooms-per-household feature
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(X['rooms_per_household'], kde=True, ax=ax)
ax.set_title('Rooms per Household Distribution')
plt.show()

## 3. Preprocessing Pipeline

In [None]:
class SafeOneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encoder with a fixed output layout learned during ``fit``.

    Every input column is treated as categorical. ``transform`` always returns
    exactly the columns named in ``feature_names_out_``: categories unseen
    during ``fit`` are dropped, and categories absent from the data being
    transformed are emitted as all-zero columns. Missing values are not a
    category of their own; such rows get zeros in every indicator column.
    """

    def __init__(self):
        self.categories_ = {}
        self.feature_names_out_ = []

    def fit(self, X, y=None):
        """Learn the categories of each column and fix the output column order.

        Args:
            X: DataFrame or 2-D array-like of categorical values.
            y: Ignored; present for scikit-learn API compatibility.

        Returns:
            The encoder itself.
        """
        X_df = pd.DataFrame(X)
        # Drop NaN before collecting categories: get_dummies never emits a
        # NaN indicator by default, and sorting NaN together with strings
        # raises TypeError.
        self.categories_ = {
            col: X_df[col].dropna().unique().tolist()
            for col in X_df.columns
        }
        self.feature_names_out_ = [
            f"{col}_{cat}"
            for col in X_df.columns
            for cat in sorted(self.categories_[col])
        ]
        return self

    def transform(self, X):
        """Encode ``X`` into the indicator columns learned by ``fit``.

        Returns:
            Float DataFrame whose columns are exactly ``feature_names_out_``.
        """
        X_df = pd.DataFrame(X)
        # columns=... forces encoding of every column (numeric-typed ones are
        # skipped by default); dtype=float keeps the output type the same
        # across pandas versions.
        dummies = pd.get_dummies(
            X_df,
            columns=list(X_df.columns),
            prefix_sep='_',
            dtype=float,
        )
        # reindex drops unseen categories and zero-fills the missing ones.
        return dummies.reindex(columns=self.feature_names_out_, fill_value=0.0)

    def get_feature_names_out(self, input_features=None):
        """Return the list of output column names (``input_features`` is ignored)."""
        return self.feature_names_out_

In [None]:
def build_preprocessor():
    """Assemble the ColumnTransformer used ahead of the model.

    Numeric columns are median-imputed and then standardized; the
    ``ocean_proximity`` column is one-hot encoded with ``SafeOneHotEncoder``.

    Returns:
        An unfitted ``ColumnTransformer`` with a ``'num'`` and a ``'cat'`` branch.
    """
    numeric_columns = [
        'median_income',
        'house_age',
        'rooms_per_household',
        'bedrooms_ratio',
        'distance_to_coast',
    ]
    categorical_columns = ['ocean_proximity']

    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    categorical_steps = [('onehot', SafeOneHotEncoder())]

    branches = [
        ('num', Pipeline(numeric_steps), numeric_columns),
        ('cat', Pipeline(categorical_steps), categorical_columns),
    ]
    return ColumnTransformer(branches)


# Create the (still unfitted) preprocessor used by the training cells
preprocessor = build_preprocessor()

## 4. Model Training & Evaluation

In [None]:
# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the preprocessor on the training split only, then reuse it on the test split
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Rebuild the processed column names: numeric columns first, then one-hot columns
num_features = [
    'median_income',
    'house_age',
    'rooms_per_household',
    'bedrooms_ratio',
    'distance_to_coast',
]
fitted_onehot = preprocessor.named_transformers_['cat'].named_steps['onehot']
cat_features = fitted_onehot.get_feature_names_out()
all_features = num_features + cat_features

print("Final feature count:", len(all_features))

In [None]:
def train_model(X_train, y_train):
    """Grid-search a random forest regressor and return the best estimator.

    The search uses 5-fold cross-validation scored by negative mean absolute
    error, runs on all available cores, and logs the winning parameters.

    Args:
        X_train: Preprocessed training features.
        y_train: Training targets.

    Returns:
        The ``best_estimator_`` found by the grid search.
    """
    search_space = dict(
        n_estimators=[100, 200],
        max_depth=[5, 10],
        min_samples_split=[2, 5],
    )
    forest = RandomForestRegressor(random_state=42)

    searcher = GridSearchCV(
        forest,
        search_space,
        scoring='neg_mean_absolute_error',
        cv=5,
        n_jobs=-1,
    )
    searcher.fit(X_train, y_train)

    logging.info(f"Best parameters: {searcher.best_params_}")
    return searcher.best_estimator_


# Fit the tuned forest on the processed training split
model = train_model(X_train_processed, y_train)
print("Best model parameters:", model.get_params())

In [None]:
# Score the tuned model on the held-out split
y_pred = model.predict(X_test_processed)
test_mae = mean_absolute_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

print("\n=== Model Performance ===")
print(f"MAE: ${test_mae:,.0f}")
print(f"R²: {test_r2:.2f}")

## 5. Results Visualization

In [None]:
def visualize_results(model, X_test, feature_names):
    """Draw a SHAP summary plot, save it to disk, and display it.

    Args:
        model: Fitted tree-based model accepted by ``shap.TreeExplainer``.
        X_test: Preprocessed feature matrix to explain.
        feature_names: Column labels for ``X_test``, in column order.

    Side effects:
        Writes ``feature_importance.png`` (300 dpi) to the working directory.
    """
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)

    plt.figure(figsize=(12, 6))
    # show=False: summary_plot otherwise shows the figure itself, so the
    # savefig call below would write an empty image.
    # plot_size=None: keep the 12x6 figure created above rather than letting
    # SHAP auto-resize it.
    shap.summary_plot(
        shap_values,
        X_test,
        feature_names=feature_names,
        plot_size=None,
        show=False,
    )
    plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
    plt.show()

# Generate visualizations
visualize_results(model, X_test_processed, all_features)

## 6. Model Deployment

In [None]:
# Persist the tuned model and the fitted preprocessor to disk with joblib.
# NOTE(review): the FeatureEngineer step is not saved here, so inference code
# must re-create it before calling the preprocessor — confirm this is intended.
joblib.dump(model, 'final_model.pkl')
joblib.dump(preprocessor, 'preprocessor.pkl')

print("Saved model files:")
# IPython shell escape: list the .pkl files with human-readable sizes.
!ls -lh *.pkl

## 7. Log Inspection

In [None]:
# IPython shell escape: show the last 20 lines of housing.log, the file the
# logging setup at the top of the notebook writes to.
!tail -n 20 housing.log