In [63]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [64]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="houseprice.csv")

In [65]:
# Derive each home's age from its build year ("yr"), measured against a fixed
# reference year so the result does not drift between runs.
current_year = 2026
df["House_age"] = df["yr"].rsub(current_year)  # rsub: current_year - yr
df["House_age"]

0       35
1       35
2       35
3       35
4       35
        ..
3265     8
3266     8
3267     8
3268     8
3269     7
Name: House_age, Length: 3270, dtype: int64

In [66]:
# Price per unit of area, stored as "price_per_sqft".
# NOTE(review): this column is computed from "price"; if "price" is the modelling
# target, feeding this column to a model would leak the target — confirm it stays
# out of the feature lists.
df["price_per_sqft"] = df["price"].div(df["area"])
df["price_per_sqft"]

0       1792.452830
1       1367.187500
2       1229.919679
3       1628.666667
4       1537.735849
           ...     
3265     606.666667
3266     736.312500
3267     483.425414
3268     601.374570
3269     454.545455
Name: price_per_sqft, Length: 3270, dtype: float64

In [67]:
# Predictor columns, grouped by how the preprocessing step should treat them.
# NOTE(review): "House_age" is computed as current_year - yr, so it is an exact
# linear function of "yr"; keeping both is redundant for a linear model — confirm
# this is intended.
# NOTE(review): "area" feeds price_per_sqft but is not listed as a predictor —
# confirm the omission is deliberate.
numerical_val = [
    "yr",
    "House_age",
    "period",
    "index_nsa",
    "index_sa",
    "bedrooms",
    "bathrooms",
    "stories",
    "parking",
]
categorical_val = [
    "hpi_type",
    "hpi_flavor",
    "frequency",
    "level",
    "place_name",
    "place_id",
    "mainroad",
    "guestroom",
    "basement",
    "hotwaterheating",
    "airconditioning",
    "prefarea",
    "furnishingstatus",
]

# Feature matrix (numeric columns first, then categorical) and the regression target.
x = df[[*numerical_val, *categorical_val]]
y = df["price"]

In [68]:
# Numeric columns are forwarded to the model unchanged.
numeric_transformer = "passthrough"

# Categorical columns are one-hot encoded; with handle_unknown="ignore" a category
# not seen during fit is encoded as all zeros instead of raising at predict time.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Output column order follows the declaration order: numeric block, then one-hot block.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_val),
        ("cat", categorical_transformer, categorical_val),
    ]
)


In [69]:
# Hold out 35% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.35, random_state=42
)

# Baseline model: preprocessing followed by ordinary least squares.
linreg_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", LinearRegression()),
    ]
)
linreg_pipeline.fit(x_train, y_train)

# Score the baseline on the held-out rows.
y_pred = linreg_pipeline.predict(x_test)
print("Linear Regression R2 : ", r2_score(y_test, y_pred))

Linear Regression R2 :  0.601502903035545


In [70]:
# Rebuild the model's input column names: the passthrough numeric columns come first,
# followed by the one-hot columns produced by the fitted encoder (same order as the
# transformers were declared in the ColumnTransformer).
fitted_encoder = linreg_pipeline.named_steps["preprocessor"].named_transformers_["cat"]
encoded_names = fitted_encoder.get_feature_names_out(categorical_val)
features_names = numerical_val + list(encoded_names)

coefficients = linreg_pipeline.named_steps["model"].coef_

# Rank features by coefficient magnitude, largest first, keeping the sign in the table.
coef_df = pd.DataFrame({"Feature": features_names, "Coefficient": coefficients})
coef_df = coef_df.sort_values(by="Coefficient", key=abs, ascending=False)

print(coef_df.head(10))

                Feature   Coefficient
6             bathrooms  1.058050e+06
37   airconditioning_no -5.023281e+05
38  airconditioning_yes  5.023281e+05
36  hotwaterheating_yes  4.659322e+05
35   hotwaterheating_no -4.659322e+05
8               parking  4.084339e+05
7               stories  4.059968e+05
40         prefarea_yes  3.973662e+05
39          prefarea_no -3.973662e+05
29          mainroad_no -3.506664e+05


In [71]:
# Random-forest counterpart of the linear pipeline, trained on the same split.
# The preprocessor is cloned so this pipeline owns its own transformer: Pipeline.fit
# fits its steps in place (no cloning unless `memory` is set), so passing the shared
# `preprocessor` object would refit the very instance that linreg_pipeline holds.
rf_pipeline = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("model", RandomForestRegressor(n_estimators=200, random_state=42)),
    ]
)

rf_pipeline.fit(x_train, y_train)

# Score the forest on the same held-out rows used for the linear baseline.
y_pred_rf = rf_pipeline.predict(x_test)
print("Random Forest R²:", r2_score(y_test, y_pred_rf))


Random Forest R²: 0.8918829851714887


In [72]:
# Label the forest's importances with column names taken from the forest pipeline's
# own fitted preprocessor, rather than reusing names computed from the linear
# pipeline in another cell. Numeric passthrough columns come first, then the one-hot
# columns, matching the transformer order in the ColumnTransformer.
rf_encoder = rf_pipeline.named_steps["preprocessor"].named_transformers_["cat"]
rf_feature_names = numerical_val + list(rf_encoder.get_feature_names_out(categorical_val))

importances = rf_pipeline.named_steps["model"].feature_importances_

# Most important features first.
importance_df = pd.DataFrame({
    "Feature": rf_feature_names,
    "Importance": importances,
}).sort_values(by="Importance", ascending=False)

print(importance_df.head(10))


                         Feature  Importance
6                      bathrooms    0.263532
8                        parking    0.079629
38           airconditioning_yes    0.077088
37            airconditioning_no    0.077062
7                        stories    0.057161
5                       bedrooms    0.047675
43  furnishingstatus_unfurnished    0.042793
39                   prefarea_no    0.031462
3                      index_nsa    0.028920
40                  prefarea_yes    0.027699
