In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb

Data Preprocessing

In [None]:
# Load the raw housing table; the path is relative to the notebook's
# working directory.
df = pd.read_csv('housing.csv')

# Fill missing values with median.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the df['total_bedrooms'] selection: that chained
# in-place form raises a FutureWarning on pandas 2.x and leaves `df`
# unchanged when Copy-on-Write is enabled.
df['total_bedrooms'] = df['total_bedrooms'].fillna(df['total_bedrooms'].median())

# Encode categorical column 'ocean_proximity' as indicator columns.
# drop_first=True omits the first category, so a row with every
# ocean_proximity_* indicator equal to 0 represents that omitted category.
df = pd.get_dummies(df, columns=['ocean_proximity'], drop_first=True)

In [None]:
# Feature & target selection
X_data = df.drop('median_house_value', axis=1)
y_data = df['median_house_value']

# Train-test split on the *unscaled* features, so that no statistic of the
# held-out rows can influence preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.2, random_state=42
)

# Standardize every feature column (the ocean_proximity indicator columns
# included). The scaler is fitted on the training rows only; fitting it on
# the full dataset would leak the test set's mean/variance into training and
# make the reported test metrics optimistic.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Whole dataset transformed with the train-fitted scaler. The split above no
# longer needs it; it is retained so any cell that reads X_scaled still runs.
X_scaled = scaler.transform(X_data)


Model Implementation

In [None]:
# Build and train the three regressors on the training split. Each
# estimator's fit() returns the estimator itself, so construction and
# training are written as a single expression per model.
model_linear = LinearRegression().fit(X_train, y_train)
model_forest = RandomForestRegressor(random_state=42).fit(X_train, y_train)
model_boost = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
).fit(X_train, y_train)

Performance Comparison

In [None]:
# Predict on the held-out test split with each fitted model.
pred_linear, pred_forest, pred_boost = (
    fitted.predict(X_test)
    for fitted in (model_linear, model_forest, model_boost)
)

# RMSE and R²
def evaluate_model(name, y_true, y_pred):
    """Print one summary line with RMSE and R² for a set of predictions.

    Args:
        name: Label shown at the start of the printed line.
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        None. The metrics are only printed, each rounded to two decimals.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    print(f"{name} -> RMSE: {np.sqrt(squared_error):.2f}, R²: {r2_score(y_true, y_pred):.2f}")

# Print test-set metrics for each model: linear, then forest, then boosting.
for label, predictions in (
    ("Linear Regression", pred_linear),
    ("Random Forest", pred_forest),
    ("XGBoost", pred_boost),
):
    evaluate_model(label, y_test, predictions)


Feature Importance

In [None]:
features = X_data.columns

# One figure with two side-by-side panels: the tree ensemble's importances on
# the left, the boosted model's on the right, both keyed by feature name.
fig, (ax_forest, ax_boost) = plt.subplots(1, 2, figsize=(12, 5))

ax_forest.barh(features, model_forest.feature_importances_)
ax_forest.set_title("Random Forest Feature Importance")

ax_boost.barh(features, model_boost.feature_importances_)
ax_boost.set_title("XGBoost Feature Importance")

fig.tight_layout()
plt.show()


Model Testing

In [None]:
print("\n Predict Median House Value")
print("Enter values for the following features:\n")

# Columns the fitted scaler and models expect, in training order.
model_features = X_data.columns.tolist()

input_fields = [
    'longitude', 'latitude', 'housing_median_age', 'total_rooms',
    'total_bedrooms', 'population', 'households', 'median_income'
]

# Fail before prompting if a requested field is not a training column;
# otherwise the mismatch would only surface after the user typed every value.
unknown_fields = [field for field in input_fields if field not in model_features]
if unknown_fields:
    raise KeyError(f"Input fields missing from the training columns: {unknown_fields}")

user_inputs = {}
for field in input_fields:
    while True:
        try:
            value = float(input(f"{field}: "))
        except ValueError:
            print(" Please enter a numeric value.")
            continue
        # float() also parses 'nan' and 'inf'; neither is a usable feature value.
        if not np.isfinite(value):
            print(" Please enter a numeric value.")
            continue
        user_inputs[field] = value
        break

print("\nSpecify proximity to ocean (True/False):")
# Take the indicator columns from the training columns instead of
# hard-coding their names, so the prompts always match what the model saw.
ocean_features = [col for col in model_features if col.startswith('ocean_proximity_')]

# The indicators encode one categorical value, so at most one may be True.
# All False is valid: it denotes the category omitted from the indicators.
while True:
    ocean_flags = {}
    for feature in ocean_features:
        while True:
            val = input(f"{feature}: ").strip().lower()
            if val in ('true', 'false'):
                ocean_flags[feature] = val == 'true'
                break
            print(" Please enter either 'True' or 'False'.")
    if sum(ocean_flags.values()) <= 1:
        break
    print(" At most one proximity category can be True. Please enter them again.")

user_inputs.update(ocean_flags)

# Construct full input vector with default 0s
test_input = {col: 0 for col in model_features}
test_input.update(user_inputs)

# Prepare input DataFrame, pinning the column order to the training order
input_df = pd.DataFrame([test_input], columns=model_features)

# Scale with the scaler fitted during preprocessing
input_scaled = scaler.transform(input_df)
input_scaled_df = pd.DataFrame(input_scaled, columns=model_features)

# Predict with the boosted model
predicted_price = model_boost.predict(input_scaled_df)[0]

print("\n Predicted Median House Value:")
print(f"${predicted_price:,.2f}")


 Predict Median House Value
Enter values for the following features:

