In [1]:
# Step 1: Install and Import Libraries
!pip install pandas numpy -q

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")


In [2]:
# Step 2: Load Dataset
# The housing CSV is read straight from the handson-ml GitHub repository,
# so this cell needs network access.
url = (
    "https://raw.githubusercontent.com/ageron/handson-ml/"
    "master/datasets/housing/housing.csv"
)
df = pd.read_csv(filepath_or_buffer=url)
df.head()


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
def clean_data(df):
    """Return a cleaned copy of the housing frame.

    Two steps are applied, each only when the relevant column is present:

    * missing ``total_bedrooms`` values are replaced with the column median;
    * rows whose ``ocean_proximity`` is not one of the known categories
      (missing values included) are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is copied, never modified in place.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. Surviving rows keep their original index labels.
    """
    df = df.copy()

    # Impute missing values in total_bedrooms
    if 'total_bedrooms' in df.columns:
        df['total_bedrooms'] = df['total_bedrooms'].fillna(df['total_bedrooms'].median())

    # Keep only rows with a recognised ocean_proximity category.
    # Guarded like total_bedrooms above so that a frame without the column
    # passes through instead of raising KeyError.
    if 'ocean_proximity' in df.columns:
        valid_categories = ['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN']
        df = df[df['ocean_proximity'].isin(valid_categories)]

    return df


In [4]:
def engineer_features(df):
    """Return a copy of ``df`` with derived feature columns added.

    Added columns:

    * ``rooms_per_household``      = total_rooms / households
    * ``bedrooms_per_room``        = total_bedrooms / total_rooms
    * ``population_per_household`` = population / households
    * ``income_quartile``          = bin index (0 = lowest) of ``median_income``
      from ``pd.qcut`` with four quantile bins
    * ``log_median_house_value``   = natural log of ``median_house_value``

    Undefined results are left as NaN rather than replaced with a made-up
    number: a ratio whose denominator is zero, and the log of a zero or
    negative house value. This keeps bad rows visible to later validation
    instead of hiding them behind ``inf`` or ``0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``total_rooms``, ``total_bedrooms``, ``population``,
        ``households``, ``median_income`` and ``median_house_value``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.

    Raises
    ------
    KeyError
        If a required column is missing.
    ValueError
        Propagated from ``pd.qcut`` when the quantile bin edges of
        ``median_income`` are not unique.
    """
    df = df.copy()

    # Mask zero denominators so division yields NaN instead of +/-inf.
    households = df['households'].where(df['households'] != 0)
    total_rooms = df['total_rooms'].where(df['total_rooms'] != 0)

    # Create ratio-based features
    df['rooms_per_household'] = df['total_rooms'] / households
    df['bedrooms_per_room'] = df['total_bedrooms'] / total_rooms
    df['population_per_household'] = df['population'] / households

    # Bin median_income into quartiles
    df['income_quartile'] = pd.qcut(df['median_income'], 4, labels=False)

    # Log-transform target variable. Only strictly positive values have a
    # real logarithm; everything else stays NaN (previously it was forced
    # to 0, i.e. a fabricated value that also passed the finiteness check).
    house_value = df['median_house_value']
    df['log_median_house_value'] = np.log(house_value.where(house_value > 0))

    return df


In [5]:
def validate_data(df):
    """Check the engineered columns and return a list of problem descriptions.

    Checks performed:

    * each ratio column (``rooms_per_household``, ``bedrooms_per_room``,
      ``population_per_household``) has no value ``<= 0``;
    * each ratio column has no NaN or infinite value;
    * ``log_median_house_value`` has no NaN or infinite value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four columns listed above.

    Returns
    -------
    list of str
        One message per failed check, in check order; empty when all pass.

    Raises
    ------
    KeyError
        If one of the checked columns is missing.
    """
    errors = []

    ratio_cols = ['rooms_per_household', 'bedrooms_per_room', 'population_per_household']
    for col in ratio_cols:
        values = df[col]

        # Check for non-positive values in ratio features
        if (values <= 0).any():
            errors.append(f"Non-positive values found in: {col}")

        # NaN and +inf both compare False against `<= 0`, so an undefined
        # ratio (e.g. from a zero denominator) needs its own check.
        if not np.isfinite(values).all():
            errors.append(f"Non-finite values found in: {col}")

    # Check log-transformed values are finite
    if not np.isfinite(df['log_median_house_value']).all():
        errors.append("Non-finite values in log_median_house_value")

    return errors


In [6]:
# Run the stages in order: clean -> engineer features -> validate.
df_clean = df.pipe(clean_data)
df_feat = df_clean.pipe(engineer_features)
validation_errors = validate_data(df_feat)

# Show the collected messages, or the literal "None" when the list is empty.
print("Validation Errors:", validation_errors if validation_errors else "None")
df_feat.head()


Validation Errors: None


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household,income_quartile,log_median_house_value
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556,3,13.022764
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842,3,12.789684
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.80226,3,12.771671
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945,3,12.740517
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467,2,12.743151


In [7]:
# Write the engineered frame to the working directory, without the index column.
df_feat.to_csv(path_or_buf="house_price_prepared.csv", index=False)
