In [1]:
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

df = pd.read_csv('house_prices.csv')
df.head()

Unnamed: 0,Id,SalePrice,GrLivArea,YearBuilt,Neighborhood,OverallCond
0,1,200000,1400,2000,NAmes,Good
1,2,185000,1300,1995,CollgCr,Fair
2,3,250000,1800,2010,NAmes,Good
3,4,275000,2000,2015,StoneBr,Excellent
4,5,225000,1600,2005,Edwards,Fair


In [2]:
# Feature Engineering
current_year = datetime.now().year
df['HouseAge'] = current_year - df['YearBuilt']
df['PricePerSqFt'] = df['SalePrice'] / df['GrLivArea']
df.head()

Unnamed: 0,Id,SalePrice,GrLivArea,YearBuilt,Neighborhood,OverallCond,HouseAge,PricePerSqFt
0,1,200000,1400,2000,NAmes,Good,26,142.857143
1,2,185000,1300,1995,CollgCr,Fair,31,142.307692
2,3,250000,1800,2010,NAmes,Good,16,138.888889
3,4,275000,2000,2015,StoneBr,Excellent,11,137.5
4,5,225000,1600,2005,Edwards,Fair,21,140.625


In [3]:
# Prepare data
X = df.drop(['SalePrice'], axis=1)
y = df['SalePrice']

categorical = ['Neighborhood', 'OverallCond']
numeric = X.drop(columns=categorical).columns

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
    ('num', 'passthrough', numeric)
])

In [4]:
# Random Forest Model for Feature Importance
model = Pipeline([
    ('prep', preprocessor),
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42))
])

model.fit(X, y)

feature_names = model.named_steps['prep'].get_feature_names_out()
importances = model.named_steps['rf'].feature_importances_

feat_imp = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feat_imp.sort_values(by='Importance', ascending=False).head(10)

Unnamed: 0,Feature,Importance
10,num__YearBuilt,0.252907
9,num__GrLivArea,0.23697
12,num__PricePerSqFt,0.219063
11,num__HouseAge,0.16992
3,cat__Neighborhood_StoneBr,0.039771
8,num__Id,0.038789
4,cat__OverallCond_Excellent,0.018487
5,cat__OverallCond_Fair,0.010568
0,cat__Neighborhood_CollgCr,0.004894
6,cat__OverallCond_Good,0.004564
