# Assignment: House Price Feature Engineering

## Task 1 — Extract House Age Feature
## Task 2 — Create Price per Square Foot Feature
## Task 3 — Encode Categorical Features
## Task 4 — Analyze Feature Importance Using Regression or Tree Model

In [23]:
import pandas as pd
# Load your dataset
df = pd.read_csv("Housing.csv")
# Show first 5 rows
df.head()
df.info
df.describe()
df.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

In [19]:
#Extract features like “house age” from year built
df["House_Age_Proxy"] = (
    (df["stories"] * 0.3) +
    (df["basement"].map({"yes": 0, "no": 1}) * 0.3) +
    (df["furnishingstatus"].map({"furnished": 0, "semi-furnished": 1, "unfurnished": 2}) * 0.4)
)
df[["stories", "basement", "furnishingstatus", "House_Age_Proxy"]].head()

Unnamed: 0,stories,basement,furnishingstatus,House_Age_Proxy
0,3,no,furnished,1.2
1,4,no,furnished,1.5
2,2,yes,semi-furnished,1.0
3,2,yes,furnished,0.6
4,2,yes,furnished,0.6


In [16]:
#Task 2 — Create “price per square foot” (you can do this)
df["Price_per_sqft"] = df["price"] / df["area"]
df[["price", "area", "Price_per_sqft"]].head()

Unnamed: 0,price,area,Price_per_sqft
0,13300000,7420,1792.45283
1,12250000,8960,1367.1875
2,12250000,9960,1229.919679
3,12215000,7500,1628.666667
4,11410000,7420,1537.735849


In [17]:
#Task 3 — Encode categorical features
df_encoded = pd.get_dummies(
    df,
    columns=["mainroad", "guestroom", "basement",
             "hotwaterheating", "airconditioning",
             "prefarea", "furnishingstatus"],
    drop_first=True
)
df_encoded.head()


Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,House_Age_Proxy,Price_per_sqft,mainroad_yes,guestroom_yes,basement_yes,hotwaterheating_yes,airconditioning_yes,prefarea_yes,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,13300000,7420,4,2,3,2,1.2,1792.45283,True,False,False,False,True,True,False,False
1,12250000,8960,4,4,4,3,1.5,1367.1875,True,False,False,False,True,False,False,False
2,12250000,9960,3,2,2,2,1.0,1229.919679,True,False,True,False,False,True,True,False
3,12215000,7500,4,2,2,3,0.6,1628.666667,True,False,True,False,True,True,False,False
4,11410000,7420,4,1,2,2,0.6,1537.735849,True,True,True,False,True,False,False,False


In [20]:
#ask 4 — Analyze Feature Importance
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

X = df_encoded.drop(columns=["price"])
y = df_encoded["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
importance = pd.DataFrame({
    "Feature": X.columns,
    "Importance": rf.feature_importances_
}).sort_values(by="Importance", ascending=False)
importance.head(10)

Unnamed: 0,Feature,Importance
0,area,0.516696
6,Price_per_sqft,0.423144
2,bathrooms,0.016755
11,airconditioning_yes,0.011108
4,parking,0.010189
1,bedrooms,0.006213
5,House_Age_Proxy,0.005268
12,prefarea_yes,0.002627
14,furnishingstatus_unfurnished,0.001898
3,stories,0.001805
