In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [2]:
# Source of the raw housing CSV; it is fetched over HTTP each time this cell runs.
URL = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv"

# Raw, unmodified frame; the later cells derive from it instead of mutating it.
housing = pd.read_csv(filepath_or_buffer=URL)

In [3]:
def inspect_data(df, categorical_column="ocean_proximity"):
    """Print a quick textual overview of a DataFrame.

    Three sections are written to stdout, in order: the per-column count of
    missing values, the output of ``df.describe()``, and the value counts of
    one categorical column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is only read, never modified.
    categorical_column : str or None, default "ocean_proximity"
        Column whose value counts are printed. Pass ``None`` to skip that
        section, e.g. for a frame that has no categorical column.

    Raises
    ------
    KeyError
        If ``categorical_column`` is not ``None`` and is not a column of ``df``.
    """
    print("\nMissing values:")
    print(df.isnull().sum())
    print("\nDescriptive stats:")
    print(df.describe())

    # The column used to be hard-coded; keeping it as the default leaves
    # existing single-argument calls printing exactly what they did before.
    if categorical_column is not None:
        print(f"\nValue counts for '{categorical_column}':")
        print(df[categorical_column].value_counts())

# Summarise the raw frame before any feature engineering is applied.
inspect_data(df=housing)


Missing values:
longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

Descriptive stats:
          longitude      latitude  housing_median_age   total_rooms  \
count  20640.000000  20640.000000        20640.000000  20640.000000   
mean    -119.569704     35.631861           28.639486   2635.763081   
std        2.003532      2.135952           12.585558   2181.615252   
min     -124.350000     32.540000            1.000000      2.000000   
25%     -121.800000     33.930000           18.000000   1447.750000   
50%     -118.490000     34.260000           29.000000   2127.000000   
75%     -118.010000     37.710000           37.000000   3148.000000   
max     -114.310000     41.950000           52.000000  39320.000000   

       total_bedrooms    population    households 

In [4]:
def add_features(df):
    """Return a copy of ``df`` with three ratio columns appended.

    Added columns:

    * ``rooms_per_household``      = ``total_rooms`` / ``households``
    * ``bedrooms_per_room``        = ``total_bedrooms`` / ``total_rooms``
    * ``population_per_household`` = ``population`` / ``households``

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``total_rooms``, ``total_bedrooms``, ``population`` and
        ``households``. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns plus the three ratios.
        A ratio is NaN wherever an operand is missing or the denominator
        is zero.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    ratio_definitions = {
        "rooms_per_household": ("total_rooms", "households"),
        "bedrooms_per_room": ("total_bedrooms", "total_rooms"),
        "population_per_household": ("population", "households"),
    }

    df = df.copy()
    for feature_name, (numerator, denominator) in ratio_definitions.items():
        ratio = df[numerator] / df[denominator]
        # A zero denominator yields +/-inf, which scikit-learn's imputer and
        # scaler reject. Mark those rows as missing so the downstream median
        # imputation treats them like any other gap.
        df[feature_name] = ratio.replace([np.inf, -np.inf], np.nan)
    return df

In [5]:
def create_preprocessing_pipeline():
    """Build the unfitted ColumnTransformer used to preprocess the housing frame.

    Two branches are combined:

    * ``"num"``: median imputation followed by standard scaling, applied to
      the eight raw numeric columns plus the three engineered ratio columns
      (``rooms_per_household``, ``bedrooms_per_room``,
      ``population_per_household``).
    * ``"cat"``: one-hot encoding of ``ocean_proximity``, configured with
      ``handle_unknown="ignore"``.

    ``remainder`` is left at the ColumnTransformer default, so columns not
    listed here are not routed to either branch.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        A new, unfitted transformer on every call.
    """
    categorical_columns = ["ocean_proximity"]
    numeric_columns = [
        "longitude",
        "latitude",
        "housing_median_age",
        "total_rooms",
        "total_bedrooms",
        "population",
        "households",
        "median_income",
        "rooms_per_household",
        "bedrooms_per_room",
        "population_per_household",
    ]

    # Numeric branch: fill gaps with the column median, then standardise.
    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]

    # Categorical branch: a single one-hot encoding step.
    categorical_steps = [
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]

    # Route each column group to its own branch.
    return ColumnTransformer(
        transformers=[
            ("num", Pipeline(steps=numeric_steps), numeric_columns),
            ("cat", Pipeline(steps=categorical_steps), categorical_columns),
        ]
    )

In [6]:
def prepare_data(df, return_pipeline=False):
    """Engineer features and run the preprocessing pipeline on ``df``.

    The ratio features are added with ``add_features``, then a fresh
    transformer from ``create_preprocessing_pipeline`` is fitted on the
    result and used to transform it.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw housing frame. It is not modified, because ``add_features``
        works on a copy.
    return_pipeline : bool, default False
        When True, also return the fitted transformer so the same learned
        imputation, scaling and encoding can be applied to other data with
        ``pipeline.transform(add_features(other_df))``. The default keeps
        the original single-value return.

    Returns
    -------
    processed_data
        Output of ``fit_transform`` on the feature-augmented frame.
    (processed_data, pipeline)
        Returned instead when ``return_pipeline`` is True.
    """
    features = add_features(df)
    pipeline = create_preprocessing_pipeline()
    processed_data = pipeline.fit_transform(features)

    # Without this opt-in the fitted transformer is thrown away, so held-out
    # or new data could not be transformed consistently with this fit.
    if return_pipeline:
        return processed_data, pipeline
    return processed_data

# Fit the preprocessing on the full raw frame and keep the transformed matrix.
processed_housing_data = prepare_data(df=housing)

# Report (n_samples, n_output_features) of the transformed data.
print("\nProcessed data shape:", processed_housing_data.shape)


Processed data shape: (20640, 16)
