# Week 4 Day 1 Assignment

Dataset: `hou_all.csv`

Tasks:
- Extract house age (if year built exists)
- Create price per square foot (if size exists)
- Encode categorical features (if any)
- Analyze feature importance using regression or a tree model

In [None]:
import os
import pandas as pd
import numpy as np
from IPython.display import display

In [None]:
# Locate and read the dataset. If it is not in the working directory but a
# copy exists under the week-3/day-2 assignment folder, copy that one here first.
local_path = "hou_all.csv"
alt_path = os.path.join("..", "..", "..", "week-3", "day-2-assignment", "hou_all.csv")

if not os.path.exists(local_path):
    if os.path.exists(alt_path):
        import shutil
        shutil.copy(alt_path, local_path)

# header=None: every row is read as data, so columns start out with
# pandas' default integer labels.
housing_df = pd.read_csv(local_path, header=None)

# Boston Housing column names. A 15-column file gets the same 14 names plus
# "MEDV01" for the extra trailing column; any other width keeps the integer labels.
boston_names = [
    "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS",
    "RAD", "TAX", "PTRATIO", "B", "LSTAT", "MEDV",
]
n_cols = housing_df.shape[1]
if n_cols == len(boston_names):
    housing_df.columns = boston_names
elif n_cols == len(boston_names) + 1:
    housing_df.columns = boston_names + ["MEDV01"]

# Quick sanity check of what was loaded
print("Loaded:", local_path)
print(housing_df.shape)
print(list(housing_df.columns))
display(housing_df.head())

In [None]:
# Feature engineering (each step runs only if its source column exists)

# Reference year that house age is measured against.
REFERENCE_YEAR = 2026

# House age from YearBuilt
if "YearBuilt" in housing_df.columns:
    housing_df["house_age"] = REFERENCE_YEAR - housing_df["YearBuilt"]

# Price per square foot: the price column divided by the first size column found.
# The price column is MEDV when present, otherwise the last column of the frame.
price_col = "MEDV" if "MEDV" in housing_df.columns else housing_df.columns[-1]
sqft_candidates = ["GrLivArea", "sqft", "LotArea", "TotalBsmtSF"]
size_col = next((c for c in sqft_candidates if c in housing_df.columns), None)
if size_col is not None:
    # A size of 0 would turn the ratio into +/-inf; treat zero sizes as
    # missing so the ratio is NaN instead.
    safe_size = housing_df[size_col].replace(0, np.nan)
    housing_df["price_per_sqft"] = housing_df[price_col] / safe_size

print("Added columns:", [c for c in ["house_age", "price_per_sqft"] if c in housing_df.columns])

In [None]:
# Simple models + feature importance
# Fits a linear regression and a decision tree on the numeric columns,
# reports test-set R2 for both, and prints the tree's top feature importances.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

# Target
target_col = price_col

# Columns computed from the target must not be used as predictors: they leak
# the answer into the features and distort both R2 and the importances.
# "price_per_sqft" is the price column divided by a size column.
# NOTE(review): "MEDV01" looks target-derived by its name — confirm what it
# holds and add it to this list if so.
leaky_cols = ["price_per_sqft"]

# Features: drop target and target-derived columns, keep numeric
X = housing_df.drop(columns=[target_col])
X = X.drop(columns=leaky_cols, errors="ignore")  # absent columns are skipped
X = X.select_dtypes(include="number")
y = housing_df[target_col]

# Fixed random_state keeps the 80/20 split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Linear Regression
lin = LinearRegression()
lin.fit(X_train, y_train)
lin_preds = lin.predict(X_test)
print("Linear R2:", r2_score(y_test, lin_preds))

# Decision Tree
tree = DecisionTreeRegressor(random_state=42)
tree.fit(X_train, y_train)
tree_preds = tree.predict(X_test)
print("Tree R2:", r2_score(y_test, tree_preds))

# Feature importance (tree), labelled with the feature column names
importances = pd.Series(tree.feature_importances_, index=X.columns)
print("Top features (tree):")
print(importances.sort_values(ascending=False).head(10))