# The Ames (Iowa) housing dataset

Description of the data:
- 1460 samples, 79 features
- contains both numerical and categorical data


In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from pandas.api.types import CategoricalDtype

from category_encoders import MEstimateEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBRegressor

# Preprocessing the data

In [2]:
def load_data():
    
    # Read data
    df_train = pd.read_csv("./data/train.csv", index_col="Id")
    df_test = pd.read_csv("./data/test.csv", index_col="Id")
    
    # Merge the splits so we can process them together
    df = pd.concat([df_train, df_test])
    
    # Preprocessing
    df = clean(df)
    df = encode(df)
    df = impute(df)
    
    # Reform splits
    df_train = df.loc[df_train.index, :]
    df_test = df.loc[df_test.index, :]
    return df_train, df_test

## Data cleaning

In [3]:
# Defining the cleaning function

def clean(df):
    df["Exterior2nd"] = df["Exterior2nd"].replace({"Brk Cmn": "BrkComm"})
    
    # Replacing corrupt values in GarageYrBlt with the year the house was built
    df["GarageYrBlt"] = df["GarageYrBlt"].where(df.GarageYrBlt <= 2010, df.YearBuilt)
    
    # Renaming column names that begin with a number just in case they cause problems
    df.rename(columns={
        "1stFlrSF": "FirstFlrSF",
        "2ndFlrSF": "SecondFlrSF",
        "3SsnPorch": "Threeseasonporch",
    }, inplace=True,
    )
    return df

## Feature encoding
Encoding each feature with its correct type helps ensure each feature is treated appropriately by whatever functions we use, and makes it easier for us to apply transformations consistently.

The numeric features are already encoded correctly ('float' for continuous, 'int' for discrete), but the categoricals we'll need to do ourselves. Note in particular, that the 'MSSubClass' feature is read as an 'int' type, but is actually a (nominative) categorical.

In [4]:
# Defining the encoding function

# The nominative (unordered) categorical features
features_nom = ["MSSubClass", "MSZoning", "Street", "Alley", "LandContour", "LotConfig", "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "Foundation", "Heating", "CentralAir", "GarageType", "MiscFeature", "SaleType", "SaleCondition"]

# The ordinal (ordered) categorical features 
# Pandas calls the categories "levels"
five_levels = ["Po", "Fa", "TA", "Gd", "Ex"]
ten_levels = list(range(10))

ordered_levels = {
    "OverallQual": ten_levels,
    "OverallCond": ten_levels,
    "ExterQual": five_levels,
    "ExterCond": five_levels,
    "BsmtQual": five_levels,
    "BsmtCond": five_levels,
    "HeatingQC": five_levels,
    "KitchenQual": five_levels,
    "FireplaceQu": five_levels,
    "GarageQual": five_levels,
    "GarageCond": five_levels,
    "PoolQC": five_levels,
    "LotShape": ["Reg", "IR1", "IR2", "IR3"],
    "LandSlope": ["Sev", "Mod", "Gtl"],
    "BsmtExposure": ["No", "Mn", "Av", "Gd"],
    "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "Functional": ["Sal", "Sev", "Maj1", "Maj2", "Mod", "Min2", "Min1", "Typ"],
    "GarageFinish": ["Unf", "RFn", "Fin"],
    "PavedDrive": ["N", "P", "Y"],
    "Utilities": ["NoSeWa", "NoSewr", "AllPub"],
    "CentralAir": ["N", "Y"],
    "Electrical": ["Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],
    "Fence": ["MnWw", "GdWo", "MnPrv", "GdPrv"],
}

# Add a None level for missing values
ordered_levels = {key: ["None"] + value for key, value in
                  ordered_levels.items()}

def encode(df):
    # Nominal categories
    for name in features_nom:
        df[name] = df[name].astype("category")
        
        # Add a None category for missing values
        if "None" not in df[name].cat.categories:
            df[name] = df[name].cat.add_categories("None")
    
    # Ordinal categories
    for name, levels in ordered_levels.items():
        df[name] = df[name].astype(CategoricalDtype(levels,
                                                    ordered=True))
    return df

## Dealing with missing values

We'll impute 0 for missing numeric values and "None" for missing categorical values.

In [6]:
def impute(df):
    for name in df.select_dtypes("number"):
        df[name] = df[name].fillna(0)
    for name in df.select_dtypes("category"):
        df[name] = df[name].fillna("None")
    return df

## Establishing a baseline score

In [7]:
# Loading the data
df_train, df_test = load_data()

In [8]:
# Looking at the values
display(df_train)
display(df_test)

# Displaying information about dtypes and missing values
display(df_train.info())
display(df_test.info())

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500.0
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500.0
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500.0
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000.0
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,8,2007,WD,Normal,175000.0
1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,,0,2,2010,WD,Normal,210000.0
1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500.0
1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2010,WD,Normal,142125.0


Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,,0,6,2010,WD,Normal,0.0
1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,Gar2,12500,6,2010,WD,Normal,0.0
1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,,MnPrv,,0,3,2010,WD,Normal,0.0
1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,6,2010,WD,Normal,0.0
1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,0,,,,0,1,2010,WD,Normal,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,6,2006,WD,Normal,0.0
2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2006,WD,Abnorml,0.0
2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,9,2006,WD,Abnorml,0.0
2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,Shed,700,7,2006,WD,Normal,0.0


<class 'pandas.core.frame.DataFrame'>
Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   MSSubClass        1460 non-null   category
 1   MSZoning          1460 non-null   category
 2   LotFrontage       1460 non-null   float64 
 3   LotArea           1460 non-null   int64   
 4   Street            1460 non-null   category
 5   Alley             1460 non-null   category
 6   LotShape          1460 non-null   category
 7   LandContour       1460 non-null   category
 8   Utilities         1460 non-null   category
 9   LotConfig         1460 non-null   category
 10  LandSlope         1460 non-null   category
 11  Neighborhood      1460 non-null   category
 12  Condition1        1460 non-null   category
 13  Condition2        1460 non-null   category
 14  BldgType          1460 non-null   category
 15  HouseStyle        1460 non-null   category
 16  OverallQual       1460 non-nu

None

<class 'pandas.core.frame.DataFrame'>
Index: 1459 entries, 1461 to 2919
Data columns (total 80 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   MSSubClass        1459 non-null   category
 1   MSZoning          1459 non-null   category
 2   LotFrontage       1459 non-null   float64 
 3   LotArea           1459 non-null   int64   
 4   Street            1459 non-null   category
 5   Alley             1459 non-null   category
 6   LotShape          1459 non-null   category
 7   LandContour       1459 non-null   category
 8   Utilities         1459 non-null   category
 9   LotConfig         1459 non-null   category
 10  LandSlope         1459 non-null   category
 11  Neighborhood      1459 non-null   category
 12  Condition1        1459 non-null   category
 13  Condition2        1459 non-null   category
 14  BldgType          1459 non-null   category
 15  HouseStyle        1459 non-null   category
 16  OverallQual       1459 non

None

## Notes on using MAE

Mean Absolute Error (MAE) measures the average size of the mistakes in a collection of predictions, without taking their direction into account. It takes the average absolute difference between the predicted values and the actual values.

In [9]:
# Using XGB to compute the cross-validated MAE score for our feature set

def score_dataset_XGB(X, y, model=XGBRegressor()):
    # Label encoding for categoricals
    for colname in X.select_dtypes(["category"]):
        X[colname] = X[colname].cat.codes
    
    score = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_absolute_error",
    )
    score = -1 * score.mean()
    return score

In [10]:
# using Random Forest for scoring

from sklearn.ensemble import RandomForestRegressor

def score_dataset_RF(X, y, model=RandomForestRegressor()):
    # Label encoding for categoricals
    for colname in X.select_dtypes(["category"]):
        X[colname] = X[colname].cat.codes
    
    score = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_absolute_error",
    )
    score = -1 * score.mean()
    return score

In [11]:
X = df_train.copy()
y = X.pop("SalePrice")

baseline_score_xgb = score_dataset_XGB(X, y)
print(f"Baseline score: {baseline_score_xgb:.5f} MAE")

Baseline score: 17803.72111 MAE


In [12]:
baseline_score_rf = score_dataset_RF(X, y)
print(f"Baseline score: {baseline_score_rf:.5f} MAE")

Baseline score: 16976.19388 MAE


## Determining feature utility

Usually, we want to focus on the top scoring features the most during feature development. Training on uninformative features can lead to overfitting.

In [13]:
# using permutation importance

from sklearn.inspection import permutation_importance

X = df_train.copy()
y = X.pop("SalePrice")

for colname in X.select_dtypes(["category"]):
        X[colname] = X[colname].cat.codes

model = RandomForestRegressor()
model.fit(X, y)

# performing permutation importance
importance = permutation_importance(model, X, y, n_repeats=10, n_jobs=2, random_state=123)

# sorting features by average decrease in accuracy
sorted_idx = importance.importances_mean.argsort()

# organizing data
importance_dict = {
    'Feature': X.columns.values[sorted_idx],
    'Importance': importance.importances_mean[sorted_idx]
}

# creating and sorting a dataframe
importance_df = pd.DataFrame(importance_dict).sort_values('Importance', ascending=False)

In [14]:
importance_df.head(20)

Unnamed: 0,Feature,Importance
78,GrLivArea,0.187358
77,OverallQual,0.092713
76,GarageCars,0.086778
75,ExterQual,0.074984
74,TotalBsmtSF,0.035343
73,BsmtFinSF1,0.022903
72,FirstFlrSF,0.020295
71,SecondFlrSF,0.01237
70,LotArea,0.011733
69,GarageArea,0.010484


In [15]:
importance_df.tail(n=20)

Unnamed: 0,Feature,Importance
19,BsmtFinType2,0.000170117
18,ExterCond,0.0001538589
17,Fence,0.0001536972
16,PavedDrive,0.0001517837
15,BldgType,0.0001496029
14,KitchenAbvGr,0.0001432765
13,MasVnrType,0.0001083184
12,Electrical,9.512147e-05
11,PoolQC,8.685471e-05
10,Threeseasonporch,7.513387e-05


In [16]:
# using mutual information

def make_mi_scores(X, y):
    X = X.copy()
    for colname in X.select_dtypes(["object", "category"]):
        X[colname], _ = X[colname].factorize()
    # All discrete features should now have integer dtypes
    
    discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

In [17]:
X = df_train.copy()
y = X.pop("SalePrice")

mi_scores = make_mi_scores(X, y)
mi_scores.head(n=20)

OverallQual     0.571457
Neighborhood    0.526220
GrLivArea       0.430395
YearBuilt       0.407974
LotArea         0.394468
TotalBsmtSF     0.368536
GarageArea      0.361542
GarageCars      0.352312
BsmtQual        0.330803
KitchenQual     0.324679
ExterQual       0.324003
FirstFlrSF      0.282503
MSSubClass      0.281349
YearRemodAdd    0.274032
GarageYrBlt     0.272213
FullBath        0.263217
GarageFinish    0.262469
LotFrontage     0.218588
FireplaceQu     0.218161
GarageType      0.211553
Name: MI Scores, dtype: float64

In [18]:
importance = importance_df[importance_df['Importance'] > 0.001]
importance.head(20)

Unnamed: 0,Feature,Importance
78,GrLivArea,0.187358
77,OverallQual,0.092713
76,GarageCars,0.086778
75,ExterQual,0.074984
74,TotalBsmtSF,0.035343
73,BsmtFinSF1,0.022903
72,FirstFlrSF,0.020295
71,SecondFlrSF,0.01237
70,LotArea,0.011733
69,GarageArea,0.010484


In [19]:
X = df_train.copy()
y = X.pop("SalePrice")

X.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [20]:
def drop_uninformative(df, mi_scores):
    return df.loc[:, mi_scores > 0.0]

In [21]:
mae_score_xgb = score_dataset_XGB(X, y)
print(f"MAE score: {mae_score_xgb:.5f} MAE")

mae_score_rf = score_dataset_RF(X, y)
print(f"MAE score: {mae_score_rf:.5f} MAE")

MAE score: 17803.72111 MAE
MAE score: 17085.79041 MAE


## Creating the features

In [22]:
#  using label encoding for the categorical features

def label_encode(df):
    X = df.copy()
    for colname in X.select_dtypes(["category"]):
        X[colname] = X[colname].cat.codes
    return X

In [23]:
def create_features(df, df_test=None):
    X = df.copy()
    y = X.pop("SalePrice")
    mi_scores = make_mi_scores(X, y)

    # Combine splits if test data is given
    # If we're creating features for test set predictions, we should use all the data we have available. After creating our features,
    # we'll recreate the splits.
    if df_test is not None:
        X_test = df_test.copy()
        X_test.pop("SalePrice")
        X = pd.concat([X, X_test])

    # Mutual Information
    X = drop_uninformative(X, mi_scores)

    X = label_encode(X)

    # Reform splits
    if df_test is not None:
        X_test = X.loc[df_test.index, :]
        X.drop(df_test.index, inplace=True)
    
    if df_test is not None:
        return X, X_test
    else:
        return X

In [24]:
df_train, df_test = load_data()
X_train = create_features(df_train)
y_train = df_train.loc[:, "SalePrice"]

score_dataset_XGB(X_train, y_train)

17824.39293931935

In [25]:
score_dataset_RF(X_train, y_train)

16884.542308219177

## Tuning the model's hyperparameters

In [26]:
X_train = create_features(df_train)
y_train = df_train.loc[:, "SalePrice"]

xgb_params = dict(
    max_depth=6,           # maximum depth of each tree - try 2 to 10
    learning_rate=0.01,    # effect of each tree - try 0.0001 to 0.1
    n_estimators=1000,     # number of trees (that is, boosting rounds) - try 1000 to 8000
    min_child_weight=1,    # minimum number of houses in a leaf - try 1 to 10
    colsample_bytree=0.7,  # fraction of features (columns) per tree - try 0.2 to 1.0
    subsample=0.7,         # fraction of instances (rows) per tree - try 0.2 to 1.0
    reg_alpha=0.5,         # L1 regularization (like LASSO) - try 0.0 to 10.0
    reg_lambda=1.0,        # L2 regularization (like Ridge) - try 0.0 to 10.0
    num_parallel_tree=1,   # set > 1 for boosted random forests
)

In [27]:
xgb = XGBRegressor(**xgb_params)
score_dataset_XGB(X_train, y_train, xgb)

15346.29552921661

### Trying out an automatic hyperparameter tuner

In [28]:
pip install optuna

Note: you may need to restart the kernel to use updated packages.


In [29]:
import optuna

def objective_xgb(trial):
    xgb_params_opt = dict(
        max_depth=trial.suggest_int("max_depth", 2, 10),
        learning_rate=trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
        n_estimators=trial.suggest_int("n_estimators", 1000, 8000),
        min_child_weight=trial.suggest_int("min_child_weight", 1, 10),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.2, 1.0),
        subsample=trial.suggest_float("subsample", 0.2, 1.0),
        reg_alpha=trial.suggest_float("reg_alpha", 1e-4, 1e2, log=True),
        reg_lambda=trial.suggest_float("reg_lambda", 1e-4, 1e2, log=True),
    )
    xgb_opt = XGBRegressor(**xgb_params_opt)
    return score_dataset_XGB(X_train, y_train, xgb_opt)

study_xgb = optuna.create_study(direction="minimize")
study_xgb.optimize(objective_xgb, n_trials=10)
xgb_params_opt = study_xgb.best_params

[I 2024-08-05 14:21:24,339] A new study created in memory with name: no-name-a5fe03fa-6e7a-4b58-9f18-1b6145767b0b
[I 2024-08-05 14:21:38,186] Trial 0 finished with value: 23388.173779965753 and parameters: {'max_depth': 8, 'learning_rate': 0.0005985740349441338, 'n_estimators': 2297, 'min_child_weight': 4, 'colsample_bytree': 0.3655579227154684, 'subsample': 0.28832252698273425, 'reg_alpha': 0.00018990563192324106, 'reg_lambda': 0.12345204184757759}. Best is trial 0 with value: 23388.173779965753.
[I 2024-08-05 14:21:48,414] Trial 1 finished with value: 16831.259385702055 and parameters: {'max_depth': 2, 'learning_rate': 0.09767642358858082, 'n_estimators': 5488, 'min_child_weight': 10, 'colsample_bytree': 0.7901476277625616, 'subsample': 0.646777088026704, 'reg_alpha': 0.0034995340278440885, 'reg_lambda': 0.00014470867565690134}. Best is trial 1 with value: 16831.259385702055.
[I 2024-08-05 14:22:09,175] Trial 2 finished with value: 39339.83991866438 and parameters: {'max_depth': 5, '

In [32]:
xgb_params_opt

{'max_depth': 10,
 'learning_rate': 0.0011884101273002391,
 'n_estimators': 7275,
 'min_child_weight': 3,
 'colsample_bytree': 0.5530548263384314,
 'subsample': 0.2869240003933981,
 'reg_alpha': 0.004623437295347169,
 'reg_lambda': 0.039885808456289}

In [30]:
def objective_rf(trial):
    rf_params_opt = dict(
        max_depth=trial.suggest_int("max_depth", 2, 10),
        n_estimators=trial.suggest_int("n_estimators", 1000, 8000),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10),
        min_impurity_decrease=trial.suggest_float("min_impurity_decrease", 0.2, 1.0),
        max_features=trial.suggest_float("max_features", 0.2, 1.0),
    )
    rf_opt = RandomForestRegressor(**rf_params_opt)
    return score_dataset_RF(X_train, y_train, rf_opt)

study_rf = optuna.create_study(direction="minimize")
study_rf.optimize(objective_rf, n_trials=10)
rf_params_opt = study_rf.best_params

[I 2024-08-05 14:26:50,934] A new study created in memory with name: no-name-fc44dc9e-ce14-48f5-abbc-a63a1a33909d
[I 2024-08-05 14:29:01,471] Trial 0 finished with value: 17294.683946734855 and parameters: {'max_depth': 8, 'n_estimators': 3724, 'min_samples_split': 3, 'min_impurity_decrease': 0.2830470284651484, 'max_features': 0.7897264435427184}. Best is trial 0 with value: 17294.683946734855.
[I 2024-08-05 14:29:49,518] Trial 1 finished with value: 17335.887496147916 and parameters: {'max_depth': 8, 'n_estimators': 3911, 'min_samples_split': 2, 'min_impurity_decrease': 0.6099031756341897, 'max_features': 0.2218439614055682}. Best is trial 0 with value: 17294.683946734855.
[I 2024-08-05 14:30:54,726] Trial 2 finished with value: 17107.99845126048 and parameters: {'max_depth': 9, 'n_estimators': 2324, 'min_samples_split': 9, 'min_impurity_decrease': 0.2989065493015848, 'max_features': 0.60417405946029}. Best is trial 2 with value: 17107.99845126048.
[I 2024-08-05 14:33:41,229] Trial 3

In [31]:
rf_params_opt

{'max_depth': 10,
 'n_estimators': 6033,
 'min_samples_split': 3,
 'min_impurity_decrease': 0.7133120041528948,
 'max_features': 0.7269856920298418}

# Training the final model

In [33]:
xgb_final = XGBRegressor(**xgb_params_opt)
score_dataset_XGB(X_train, y_train, xgb_final)

15067.102421339896

In [34]:
rf_final = RandomForestRegressor(
    max_depth=10,
    n_estimators=6033,
    min_samples_split=3,
    min_impurity_decrease=0.7133120041528948,
    max_features=0.7269856920298418)
score_dataset_RF(X_train, y_train, rf_final)

16803.876588535142

In [35]:
rf_final2 = RandomForestRegressor()
score_dataset_RF(X_train, y_train, rf_final2)

16863.41959589041

In [27]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1460 entries, 1 to 1460
Data columns (total 74 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   MSSubClass        1460 non-null   int8   
 1   MSZoning          1460 non-null   int8   
 2   LotFrontage       1460 non-null   float64
 3   LotArea           1460 non-null   int64  
 4   Street            1460 non-null   int8   
 5   Alley             1460 non-null   int8   
 6   LotShape          1460 non-null   int8   
 7   LandContour       1460 non-null   int8   
 8   Utilities         1460 non-null   int8   
 9   LotConfig         1460 non-null   int8   
 10  LandSlope         1460 non-null   int8   
 11  Neighborhood      1460 non-null   int8   
 12  Condition1        1460 non-null   int8   
 13  Condition2        1460 non-null   int8   
 14  BldgType          1460 non-null   int8   
 15  HouseStyle        1460 non-null   int8   
 16  OverallQual       1460 non-null   int8   
 17  