In [1]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-3.1.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.2-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.5/72.0 MB 1.4 MB/s eta 0:00:53
   ---------------------------------------- 0.8/72.0 MB 1.5 MB/s eta 0:00:47
    --------------------------------------- 1.3/72.0 MB 1.7 MB/s eta 0:00:43
   - -------------------------------------- 1.8/72.0 MB 1.9 MB/s eta 0:00:38
   - -------------------------------------- 2.1/72.0 MB 1.9 MB/s eta 0:00:36
   - -------------------------------------- 2.9/72.0 MB 2.1 MB/s eta 0:00:34
   - -------------------------------------- 3.4/72.0 MB 2.1 MB/s eta 0:00:33
   -- ------------------------------------- 3.9/72.0 MB 2.2 MB/s eta 0:00:32
   -- ------------------------------------- 4.5/72.0 MB 2.2 MB/s eta 0:00:31
   -- --------------

In [2]:
# ----------------------------------------------------
# HOUSE PRICE PREDICTION — XGBOOST (BEST ACCURACY)
# ----------------------------------------------------

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from xgboost import XGBRegressor

# 1. Load dataset
df = pd.read_csv("housing.csv")

# 2. Handle missing values
df.fillna(df.mean(numeric_only=True), inplace=True)

# 3. Feature engineering
df["rooms_per_household"] = df["total_rooms"] / df["households"]
df["bedrooms_per_room"] = df["total_bedrooms"] / df["total_rooms"]
df["population_per_household"] = df["population"] / df["households"]

# 4. One-hot encode ocean proximity
df = pd.get_dummies(df, columns=["ocean_proximity"], drop_first=True)

# 5. Split data
X = df.drop("median_house_value", axis=1)
y = df["median_house_value"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 6. XGBoost Model — Tuned parameters for best R²
xgb = XGBRegressor(
    n_estimators=800,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1,
    random_state=42,
    n_jobs=-1
)

xgb.fit(X_train, y_train)

# 7. Evaluate model accuracy
y_pred = xgb.predict(X_test)
print("\n----------------------------------------")
print("XGBoost Model Accuracy (R² Score):", r2_score(y_test, y_pred))
print("----------------------------------------")

# 8. USER INPUT for prediction
fixed_longitude = -122.23
fixed_latitude = 37.88

print("\nEnter House Details for Prediction:\n")

housing_median_age = float(input("Housing Median Age: "))
rooms_per_house = float(input("Rooms per House: "))
bedrooms_per_house = float(input("Bedrooms per House: "))
population = float(input("Population in the Area: "))
households = float(input("Number of Households: "))
median_income = float(input("Median Income (in 10,000 dollars): "))

print("\nOcean Proximity Options:")
print("0 = <1H OCEAN (baseline)")
print("1 = INLAND")
print("2 = ISLAND")
print("3 = NEAR OCEAN")
print("4 = NEAR BAY")

op = int(input("Enter Ocean Proximity (0–4): "))

# Convert back to totals
total_rooms = rooms_per_house * households
total_bedrooms = bedrooms_per_house * households

# Engineered features
rooms_per_household = rooms_per_house
bedrooms_per_room = bedrooms_per_house / rooms_per_house if rooms_per_house else 0
population_per_household = population / households if households else 0

new_data = {
    "longitude": fixed_longitude,
    "latitude": fixed_latitude,
    "housing_median_age": housing_median_age,
    "total_rooms": total_rooms,
    "total_bedrooms": total_bedrooms,
    "population": population,
    "households": households,
    "median_income": median_income,
    "rooms_per_household": rooms_per_household,
    "bedrooms_per_room": bedrooms_per_room,
    "population_per_household": population_per_household,
}

input_df = pd.DataFrame([new_data])

# Add dummy columns
for col in X.columns:
    if col.startswith("ocean_proximity_") and col not in input_df.columns:
        input_df[col] = 0

op_map = {
    1: "ocean_proximity_INLAND",
    2: "ocean_proximity_ISLAND",
    3: "ocean_proximity_NEAR OCEAN",
    4: "ocean_proximity_NEAR BAY",
}
if op in op_map and op_map[op] in input_df.columns:
    input_df[op_map[op]] = 1

# Match training columns exactly
input_df = input_df.reindex(columns=X.columns, fill_value=0)

# Predict price
price = xgb.predict(input_df)[0]
print("\n----------------------------------------")
print("Predicted House Price:", price)
print("----------------------------------------")



----------------------------------------
XGBoost Model Accuracy (R² Score): 0.8504972643111829
----------------------------------------

Enter House Details for Prediction:



Housing Median Age:  10
Rooms per House:  3
Bedrooms per House:  2
Population in the Area:  5000
Number of Households:  865
Median Income (in 10,000 dollars):  2



Ocean Proximity Options:
0 = <1H OCEAN (baseline)
1 = INLAND
2 = ISLAND
3 = NEAR OCEAN
4 = NEAR BAY


Enter Ocean Proximity (0–4):  1



----------------------------------------
Predicted House Price: 197110.12
----------------------------------------
