In [1]:
# basic imports
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Toy housing data, written one home per row so each record reads left to right.
_columns = ("bedrooms", "bathrooms", "year_built", "lot_size", "price")
_rows = [
    (2, 1, 1985, 4000, 200000),
    (3, 2, 1990, 5000, 250000),
    (3, 2, 2005, 6000, 300000),
    (4, 3, 2010, 7000, 420000),
    (3, 2, 1995, 5500, 310000),
    (2, 1, 1980, 4500, 190000),
    (4, 3, 2000, 6800, 380000),
    (3, 2, 2008, 6200, 330000),
    (2, 1, 1982, 4800, 205000),
    (4, 3, 2012, 7100, 450000),
]

# Transpose the records into a column-name -> list-of-values mapping.
data = {name: list(values) for name, values in zip(_columns, zip(*_rows))}

df = pd.DataFrame(data)

# Preview the first rows (this output is the source of input_example.png).
df.head()


Unnamed: 0,bedrooms,bathrooms,year_built,lot_size,price
0,2,1,1985,4000,200000
1,3,2,1990,5000,250000
2,3,2,2005,6000,300000
3,4,3,2010,7000,420000
4,3,2,1995,5500,310000


In [2]:
# Feature matrix (the four predictor columns) and the target column.
feature_cols = ["bedrooms", "bathrooms", "year_built", "lot_size"]
X = df[feature_cols]
y = df["price"]

# Baseline: ordinary least-squares regression; fit() returns the estimator itself.
lin_reg = LinearRegression().fit(X, y)

# Predictions for the same rows the model was fitted on
# (this output is the source of output_example.png).
lin_preds = lin_reg.predict(X)
lin_preds


array([183324.09097089, 271626.81217005, 327025.59968443, 415328.32088359,
       294975.5377389 , 189270.14378639, 392066.69245337, 338105.35718731,
       201539.11185237, 421738.3332727 ])

In [3]:
# Main model: a 100-tree random forest with a fixed seed so reruns give the same fit.
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X, y)
rf_preds = rf.predict(X)

# MAE and RMSE for the baseline and the forest.
# NOTE: rf_preds are predictions on the same X the forest was fitted on, so its
# scores are in-sample (training) errors, not estimates of performance on unseen homes.
lin_mae, rf_mae = (
    mean_absolute_error(y, preds) for preds in (lin_preds, rf_preds)
)
lin_rmse, rf_rmse = (
    np.sqrt(mean_squared_error(y, preds)) for preds in (lin_preds, rf_preds)
)

lin_mae, lin_rmse, rf_mae, rf_rmse


(13764.892299031839,
 np.float64(16571.90743957033),
 7780.0,
 np.float64(9649.81865114573))