In [1]:
import warnings

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler

In [2]:
# Shared seed: it is passed as random_state to the split and to every forest
# in this notebook so that reruns reproduce the same numbers.
RS: int = 42

In [3]:
# Download the OpenML "house_prices" dataset; as_frame/return_X_y give the
# features as a DataFrame and the target as a Series (see the type check below).
X, y = fetch_openml(
    name="house_prices",
    return_X_y=True,
    as_frame=True,
)


In [4]:
# Show which container types the loader returned for features and target.
"type(X)={!r}, type(y)={!r}".format(type(X), type(y))

"type(X)=<class 'pandas.core.frame.DataFrame'>, type(y)=<class 'pandas.core.series.Series'>"

In [5]:
# Turn the target into a plain float64 ndarray, then display both shapes.
y = np.asarray(y, dtype=np.float64)
(X.shape, y.shape)

((1460, 80), (1460,))

In [6]:
# Quick sanity summary of the target: size, number of distinct values, dtype.
for label, value in (
    ("Samples:", len(y)),
    ("Unique y values:", len(np.unique(y))),
    ("y dtype:", y.dtype),
):
    print(label, value)

Samples: 1460
Unique y values: 663
y dtype: float64


In [7]:
# Preview the first feature rows and the first five target values.
print(f"{X.head()=}")
print(f"{y[:5]=}")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in an f-string only appended a stray "X.info()=None".
X.info()

X.head()=   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... ScreenPorch PoolArea PoolQC Fence MiscFeature  \
0         Lvl    AllPub  ...           0        0    NaN   NaN         NaN   
1         Lvl    AllPub  ...           0        0    NaN   NaN         NaN   
2         Lvl    AllPub  ...           0        0    NaN   NaN         NaN   
3         Lvl    AllPub  ...           0        0    NaN   NaN         NaN   
4         Lvl    AllPub  ...           0        0    NaN   NaN         NaN   

  MiscVal MoSold  YrSold  SaleType  SaleCondition  
0       0      2    2008 

In [8]:
# Count how many columns there are of each dtype (most common dtype first).
X.dtypes.value_counts(sort=True, ascending=False)

object     43
int64      34
float64     3
Name: count, dtype: int64

In [9]:
# Partition the feature names by dtype so each group can get its own
# preprocessing pipeline.
# NOTE(review): the numeric group contains the "Id" row identifier (see the
# printed index below) - confirm it is really meant to be a model feature.
numeric_cols = X.select_dtypes(include="number").columns
object_cols = X.select_dtypes(include="object").columns

print(f"{numeric_cols=}, {len(numeric_cols)=}")
print(f"{object_cols=}, {len(object_cols)=}")

numeric_cols=Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold'],
      dtype='object'), len(numeric_cols)=37
object_cols=Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', '

In [10]:
# supress regression warning
import warnings

warnings.filterwarnings(
    "ignore",
    message="The number of unique classes is greater than 50%"
)


In [11]:
# Build one preprocessing pipeline per column group.

# Numeric columns: replace missing values with the column median.
numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
])

# Object columns: replace missing values with the most frequent value, then
# one-hot encode; handle_unknown="ignore" keeps transform from failing on
# categories that were not present when the encoder was fitted.
object_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

In [12]:
# Combine the pipelines: each one is applied only to its own column group.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_pipeline, numeric_cols),
        ("obj", object_pipeline, object_cols),
    ],
    remainder="drop",  # discard any column that is in neither group
)

In [None]:
# Baseline model: the shared preprocessor followed by a 300-tree random forest.
# oob_score=True makes the forest compute an out-of-bag score during fit; it
# is read back later through the fitted estimator's oob_score_ attribute.
model = Pipeline([
    ("preprocessor", preprocessor),
    (
        "model",
        RandomForestRegressor(
            n_estimators=300,
            oob_score=True,
            random_state=RS,
            n_jobs=-1,
        ),
    ),
])

In [14]:
# Keep 75% of the rows for training and hold the rest out for the final test;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=RS,
)

In [15]:
# Fit the baseline pipeline (preprocessing + forest) on the training split.
model.fit(X=X_train, y=y_train)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('obj', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,300
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [16]:
# Compare the score on the data the model was trained on with the forest's
# out-of-bag score, which only uses samples a tree did not see during fit.
train_score = model.score(X_train, y_train)
oob_score = model["model"].oob_score_  # look the forest step up by its name
print(f"{train_score=}, \n {oob_score=}")

train_score=0.9782236323461558, 
 oob_score=0.8398996569350135


# Evaluation
The model has high variance: the train score (0.978) is significantly higher than the out-of-bag (OOB) score (0.840), which stands in for the test score here.
I will address the high variance by first finding the optimal n_estimators (number of trees), increasing it if that helps.

In [None]:
# find optimal n_estimators (number of trees), judged by the out-of-bag score
best_n_estimators, best_score = 250, -np.inf
for n_trees in range(250, 600, 25):
    temp_model = Pipeline([
        ("preprocessor", preprocessor),
        (
            "model",
            RandomForestRegressor(
                n_estimators=n_trees,
                oob_score=True,
                random_state=RS,
                n_jobs=-1,
            ),
        ),
    ])
    temp_model.fit(X_train, y_train)

    train_score = temp_model.score(X_train, y_train)
    oob_score = temp_model["model"].oob_score_
    print(f"n_estimatos: {n_trees}, trainscore: {train_score}, oob_score: {oob_score}")

    # strict ">" keeps the first candidate when several share the best score
    if oob_score > best_score:
        best_score, best_n_estimators = oob_score, n_trees
print(f"{best_score=}, {best_n_estimators=}")

n_estimatos: 250, trainscore: 0.9783282528966899, oob_score: 0.8401897955721923
n_estimatos: 275, trainscore: 0.9781239650926294, oob_score: 0.8397111739670096
n_estimatos: 300, trainscore: 0.9782236323461558, oob_score: 0.8398996569350135
n_estimatos: 325, trainscore: 0.9781372923676436, oob_score: 0.8386457192396948
n_estimatos: 350, trainscore: 0.9785340366170434, oob_score: 0.8391440411508733
n_estimatos: 375, trainscore: 0.9785362431045754, oob_score: 0.8394540994535707
n_estimatos: 400, trainscore: 0.9783392185933305, oob_score: 0.8397029987602772
n_estimatos: 425, trainscore: 0.9783528326610249, oob_score: 0.8403593869632424
n_estimatos: 450, trainscore: 0.9782609615089222, oob_score: 0.8405051397756301
n_estimatos: 475, trainscore: 0.9783030233435358, oob_score: 0.8406167609177138
n_estimatos: 500, trainscore: 0.978275389968979, oob_score: 0.8401091829643579
n_estimatos: 525, trainscore: 0.9783405437982904, oob_score: 0.8403628931928132
n_estimatos: 550, trainscore: 0.978583562

Next, I'll search for the best max_depth (limiting tree depth to reduce variance).

In [None]:
# find optimal max_depth (tree depth limit), judged by the out-of-bag score;
# n_estimators is fixed to the value chosen in the previous search
best_max_depth, best_score = None, -np.inf
for depth in range(5, 30):
    temp_model = Pipeline([
        ("preprocessor", preprocessor),
        (
            "model",
            RandomForestRegressor(
                max_depth=depth,
                n_estimators=best_n_estimators,
                oob_score=True,
                random_state=RS,
                n_jobs=-1,
            ),
        ),
    ])
    temp_model.fit(X_train, y_train)

    train_score = temp_model.score(X_train, y_train)
    oob_score = temp_model["model"].oob_score_
    print(f"max depth: {depth}, trainscore: {train_score}, oob_score: {oob_score}")

    # strict ">" keeps the first candidate when several share the best score
    if oob_score > best_score:
        best_score, best_max_depth = oob_score, depth
print(f"{best_score=}, {best_max_depth=}")
    

max depth: 5, trainscore: 0.9102467037010223, oob_score: 0.8099059384492217
max depth: 6, trainscore: 0.9378477989586836, oob_score: 0.8242806322577692
max depth: 7, trainscore: 0.9549383005701646, oob_score: 0.830980117830614
max depth: 8, trainscore: 0.9654109651642333, oob_score: 0.8359727258341941
max depth: 9, trainscore: 0.9711470403977043, oob_score: 0.8369930861521652
max depth: 10, trainscore: 0.9745161904606948, oob_score: 0.8386628244985025
max depth: 11, trainscore: 0.9767331726791619, oob_score: 0.842354838436406
max depth: 12, trainscore: 0.977783322736982, oob_score: 0.8431513968860557
max depth: 13, trainscore: 0.9780144775431543, oob_score: 0.8415279693611296
max depth: 14, trainscore: 0.9782076758104041, oob_score: 0.8415045703450591
max depth: 15, trainscore: 0.9786891927856034, oob_score: 0.8438333994822782
max depth: 16, trainscore: 0.9780557132608503, oob_score: 0.8389612403813617
max depth: 17, trainscore: 0.9782295823960796, oob_score: 0.8402661717323097
max dep

Next, I'll search for the best min_samples_leaf (raising it to reduce variance).

In [None]:
# find optimal min_samples_leaf, judged by the out-of-bag score;
# n_estimators and max_depth are fixed to the values chosen above
best_min_leafs, best_score = 1, -np.inf
for leaf_size in range(1, 20):
    temp_model = Pipeline([
        ("preprocessor", preprocessor),
        (
            "model",
            RandomForestRegressor(
                min_samples_leaf=leaf_size,
                max_depth=best_max_depth,
                n_estimators=best_n_estimators,
                oob_score=True,
                random_state=RS,
                n_jobs=-1,
            ),
        ),
    ])
    temp_model.fit(X_train, y_train)

    train_score = temp_model.score(X_train, y_train)
    oob_score = temp_model["model"].oob_score_
    print(f"min samples leaf: {leaf_size}, trainscore: {train_score}, oob_score: {oob_score}")

    # strict ">" keeps the first candidate when several share the best score
    if oob_score > best_score:
        best_score, best_min_leafs = oob_score, leaf_size
print(f"{best_score=}, {best_min_leafs=}")
    

min samples leaf: 1, trainscore: 0.9786891927856034, oob_score: 0.8438333994822782
min samples leaf: 2, trainscore: 0.968232260823916, oob_score: 0.8385664660826185
min samples leaf: 3, trainscore: 0.9552491191453847, oob_score: 0.8337646730999405
min samples leaf: 4, trainscore: 0.9412811513126347, oob_score: 0.8316846693043423
min samples leaf: 5, trainscore: 0.9296014101377568, oob_score: 0.8334326317097341
min samples leaf: 6, trainscore: 0.9194315639115727, oob_score: 0.8313977328771315
min samples leaf: 7, trainscore: 0.9101415743972346, oob_score: 0.828986828451356
min samples leaf: 8, trainscore: 0.9026577911310425, oob_score: 0.8280667238090028
min samples leaf: 9, trainscore: 0.8951552174442751, oob_score: 0.8263444672425846
min samples leaf: 10, trainscore: 0.8885846560271887, oob_score: 0.8245275764690909
min samples leaf: 11, trainscore: 0.8836584969519601, oob_score: 0.8234211045164552
min samples leaf: 12, trainscore: 0.8787017967893606, oob_score: 0.8219518465655448
min

Next, I'll decrease max_features.

In [39]:
# find optimal max_features, judged by the out-of-bag score; the other
# hyperparameters are fixed to the values chosen in the earlier searches.
# The initial 20 is only a placeholder outside the searched range: best_score
# starts at -inf, so the first candidate tried always replaces it.
best_max_features, best_score = 20, -np.inf
for n_features in range(30, 100):
    temp_model = Pipeline([
        ("preprocessor", preprocessor),
        (
            "model",
            RandomForestRegressor(
                max_features=n_features,
                min_samples_leaf=best_min_leafs,
                max_depth=best_max_depth,
                n_estimators=best_n_estimators,
                oob_score=True,
                random_state=RS,
                n_jobs=-1,
            ),
        ),
    ])
    temp_model.fit(X_train, y_train)

    train_score = temp_model.score(X_train, y_train)
    oob_score = temp_model["model"].oob_score_
    print(f"max_features: {n_features}, trainscore: {train_score}, oob_score: {oob_score}")

    # strict ">" keeps the first candidate when several share the best score
    if oob_score > best_score:
        best_score, best_max_features = oob_score, n_features
print(f"{best_score=}, {best_max_features=}")
    

max_features: 30, trainscore: 0.97818361208778, oob_score: 0.8443910339218257
max_features: 31, trainscore: 0.9781305294805769, oob_score: 0.843905638609927
max_features: 32, trainscore: 0.9779633131810203, oob_score: 0.8421295702009476
max_features: 33, trainscore: 0.9786179790388656, oob_score: 0.8468400482211241
max_features: 34, trainscore: 0.9785433457785458, oob_score: 0.8463623817962708
max_features: 35, trainscore: 0.9780238403851582, oob_score: 0.8418821816439768
max_features: 36, trainscore: 0.97871495283265, oob_score: 0.8471153085911216
max_features: 37, trainscore: 0.9789273873865958, oob_score: 0.8481180521423887
max_features: 38, trainscore: 0.9786132309664339, oob_score: 0.8460267563469575
max_features: 39, trainscore: 0.9787533186691758, oob_score: 0.8466436226975464
max_features: 40, trainscore: 0.978755536385934, oob_score: 0.8466002804841197
max_features: 41, trainscore: 0.9790574398065711, oob_score: 0.8483718573295655
max_features: 42, trainscore: 0.97898499418427

In [42]:
# Refit the pipeline with every hyperparameter chosen in the searches above and
# report the train, out-of-bag and held-out test scores side by side.
rf = RandomForestRegressor(
    max_features=best_max_features,
    min_samples_leaf=best_min_leafs,
    max_depth=best_max_depth,
    n_estimators=best_n_estimators,
    oob_score=True,
    random_state=RS,
    n_jobs=-1,
)
best_model = Pipeline([("preprocessor", preprocessor), ("model", rf)])
best_model.fit(X_train, y_train)

train_score = best_model.score(X_train, y_train)
oob_score = best_model["model"].oob_score_
# This is the only place in the notebook where the test split is scored.
test_score = best_model.score(X_test, y_test)

print(f"{train_score=}, {oob_score=}, {test_score=}")

train_score=0.9797435543475406, oob_score=0.8523186507892776, test_score=0.8959272583685252
