<a href="https://colab.research.google.com/github/appliedcode/mthree-c422/blob/mthree-422-srilatha/DFpractice_HousingPipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [41]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [42]:
# Step 2: Load the raw housing CSV directly from GitHub (needs network access)
url = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv"
df_raw = pd.read_csv(url)


In [43]:
# Count missing values per column in the raw data, before any cleaning
print("Missing values:\n", df_raw.isnull().sum())


Missing values:
 longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64


In [44]:
# Step 3: Cleaning Function
def clean_data(df):
    """Return a copy of ``df`` with gaps in ``total_bedrooms`` filled by its median.

    The input frame is never modified. If the frame has no ``total_bedrooms``
    column, the copy is returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw housing data.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``total_bedrooms`` (when present) has its missing values
        replaced by the median of that same column.
    """
    cleaned = df.copy()
    target = "total_bedrooms"

    # Nothing to impute when the column is absent
    if target not in cleaned.columns:
        return cleaned

    bedrooms = cleaned[target]
    # Median is computed from the non-missing values of the frame passed in
    cleaned[target] = bedrooms.fillna(bedrooms.median())
    return cleaned

In [45]:
# NOTE(review): this repeats the earlier missing-value check on df_raw.
# clean_data works on a copy and has not been called at this point, so the
# counts are unchanged (total_bedrooms still reports its missing values).
# To verify the imputation, inspect the frame returned by clean_data instead.
print("Missing values:\n", df_raw.isnull().sum())


Missing values:
 longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64


In [46]:

# Step 4: Feature Engineering Function
def engineer_features(df):
    """Add ratio features, then scale numeric columns and one-hot encode the category.

    Steps, all performed on a copy of ``df``:

    1. Derive ``rooms_per_household``, ``bedrooms_per_room`` and
       ``population_per_household``.
    2. Standardize every numeric column (original and derived) with
       ``StandardScaler``.
    3. One-hot encode ``ocean_proximity`` with ``OneHotEncoder``.

    The scaler and encoder are fitted on the frame passed in. Columns that are
    neither numeric nor ``ocean_proximity`` are not part of the result
    (``ColumnTransformer`` drops unlisted columns by default).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``total_rooms``, ``total_bedrooms``, ``population``,
        ``households`` and ``ocean_proximity``.

    Returns
    -------
    pandas.DataFrame
        Scaled numeric columns followed by the one-hot columns, carrying the
        same index as ``df`` so rows stay aligned with the input.
    """
    df = df.copy()

    # New derived features
    df["rooms_per_household"] = df["total_rooms"] / df["households"]
    df["bedrooms_per_room"] = df["total_bedrooms"] / df["total_rooms"]
    df["population_per_household"] = df["population"] / df["households"]

    # Split categorical and numerical (numeric list is taken after the derived
    # features are added, so they are scaled too)
    cat_col = ["ocean_proximity"]
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    # One-hot encode + scale
    preprocessor = ColumnTransformer([
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(sparse_output=False), cat_col)
    ])

    df_processed = preprocessor.fit_transform(df)

    # Get feature names; the order matches the transformer order above
    ohe_features = preprocessor.named_transformers_["cat"].get_feature_names_out(cat_col)
    final_features = num_cols + list(ohe_features)

    # Keep the caller's index: without it a filtered or split input would come
    # back with a fresh RangeIndex and no longer line up with its source rows.
    df_final = pd.DataFrame(df_processed, columns=final_features, index=df.index)

    return df_final

In [47]:
# Step 5: Apply functions
# Chain the stages with DataFrame.pipe: clean the raw data, then engineer features.
df_cleaned = df_raw.pipe(clean_data)
df_final = df_cleaned.pipe(engineer_features)

In [49]:
# Step 6: View results
# Print the (rows, columns) shape, then leave head() as the last expression so
# the notebook renders the first five rows as the cell output.
print("Final Processed DataFrame Shape:", df_final.shape)
df_final.head()

Final Processed DataFrame Shape: (20640, 17)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_household,bedrooms_per_room,population_per_household,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-1.327835,1.052548,0.982143,-0.804819,-0.972476,-0.974429,-0.977033,2.344766,2.129631,0.628559,-1.029988,-0.049597,0.0,0.0,0.0,1.0,0.0
1,-1.322844,1.043185,-0.607019,2.04589,1.357143,0.861439,1.669961,2.332238,1.314156,0.327041,-0.888897,-0.092512,0.0,0.0,0.0,1.0,0.0
2,-1.332827,1.038503,1.856182,-0.535746,-0.827024,-0.820777,-0.843637,1.782699,1.258693,1.15562,-1.291686,-0.025843,0.0,0.0,0.0,1.0,0.0
3,-1.337818,1.038503,1.856182,-0.624215,-0.719723,-0.766028,-0.733781,0.932968,1.1651,0.156966,-0.449613,-0.050329,0.0,0.0,0.0,1.0,0.0
4,-1.337818,1.038503,1.856182,-0.462404,-0.612423,-0.759847,-0.629157,-0.012881,1.1729,0.344711,-0.639087,-0.085616,0.0,0.0,0.0,1.0,0.0
