In [7]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import r2_score
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error

In [4]:
df = pd.read_csv("/content/AmesHousing.csv")
df

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,2926,923275080,80,RL,37.0,7937,Pave,,IR1,Lvl,...,0,,GdPrv,,0,3,2006,WD,Normal,142500
2926,2927,923276100,20,RL,,8885,Pave,,IR1,Low,...,0,,MnPrv,,0,6,2006,WD,Normal,131000
2927,2928,923400125,85,RL,62.0,10441,Pave,,Reg,Lvl,...,0,,MnPrv,Shed,700,7,2006,WD,Normal,132000
2928,2929,924100070,20,RL,77.0,10010,Pave,,Reg,Lvl,...,0,,,,0,4,2006,WD,Normal,170000


## Exercise 13.2.5: Four Models - Test Set Comparison

Consider four possible models for predicting house prices:

1. Using only the size and number of rooms.
2. Using size, number of rooms, and building type.
3. Using size and building type, and their interaction.
4. Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.

Set up a pipeline for each of these four models.

Then, get predictions on the test set for each of your pipelines, and compute the root mean squared error. Which model performed best?

Note: You should only use the function train_test_split() one time in your code; that is, we should be predicting on the same test set for all four models.

In [6]:
# general train/test split for all models
X = df.drop("SalePrice", axis=1)
y = df["SalePrice"]
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [22]:
#Model 1: Size, rooms
ct1 = ColumnTransformer([
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
], remainder="drop")

Pipe1 = Pipeline([
    ("preprocessing", ct1),
    ("linear_regression model", LinearRegression())
])

Pipe1.fit(X_train, y_train)
pred1 = Pipe1.predict(X_test)
rmse1 = np.sqrt(mean_squared_error(y_test, pred1))
print(f"Model 1 RMSE: {rmse1:.2f}")

Model 1 RMSE: 59746.27


In [23]:
#Model 2: Size, rooms, building type
ct2 = ColumnTransformer([
    ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
], remainder="drop")

Pipe2 = Pipeline([
    ("preprocessing", ct2),
    ("linear_regression model", LinearRegression())
])

Pipe2.fit(X_train, y_train)
pred2 = Pipe2.predict(X_test)
rmse2 = np.sqrt(mean_squared_error(y_test, pred2))
print(f"Model 2 RMSE: {rmse2:.2f}")

Model 2 RMSE: 57617.17


In [24]:
#Model 3: Interaction between size and building type
xtrain3 = X_train[["Gr Liv Area", "Bldg Type"]]
xtest3 = X_test[["Gr Liv Area", "Bldg Type"]]

ct3 = ColumnTransformer([
    ("dummify", OneHotEncoder(sparse_output=False, drop="first"), ["Bldg Type"]),
    ("no change", "passthrough", ["Gr Liv Area"])
])

Pipe3 = Pipeline([
    ("preprocessing", ct3),
    ("interaction", PolynomialFeatures(degree=2, interaction_only=True)),
    ("linear_regression model", LinearRegression())
])

Pipe3.fit(xtrain3, y_train)
pred3 = Pipe3.predict(xtest3)
rmse3 = np.sqrt(mean_squared_error(y_test, pred3))
print(f"Model 3 RMSE: {rmse3:.2f}")

Model 3 RMSE: 56601.72


In [25]:
#Model 4: 5th degree polynomial size, 5th degree polynomial rooms, building type
ct4 = ColumnTransformer([
    ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
    ("size", PolynomialFeatures(degree=5), ["Gr Liv Area"]),
    ("rooms", PolynomialFeatures(degree=5), ["TotRms AbvGrd"])
], remainder="drop")

Pipe4 = Pipeline([
    ("preprocessing", ct4),
    ("linear_regression model", LinearRegression())
])

Pipe4.fit(X_train, y_train)
pred4 = Pipe4.predict(X_test)
rmse4 = np.sqrt(mean_squared_error(y_test, pred4))
print(f"Model 4 RMSE: {rmse4:.2f}")

Model 4 RMSE: 60494.64


In [26]:
#Compare model RMSE
results = pd.DataFrame({
    "Model": ["Model 1: Size + Rooms",
              "Model 2: Size + Rooms + BldgType",
              "Model 3: Size x BldgType",
              "Model 4: 5th Degree Polynomial Size + 5th Degree Polynomial Rooms + BldgType"],
    "RMSE": [rmse1, rmse2, rmse3, rmse4]
})
results.sort_values("RMSE")

Unnamed: 0,Model,RMSE
2,Model 3: Size x BldgType,56601.722189
1,Model 2: Size + Rooms + BldgType,57617.17053
0,Model 1: Size + Rooms,59746.267248
3,Model 4: 5th Degree Polynomial Size + 5th Degr...,60494.635864


Model 3: Size x BldgType performed the best out of all four models as it has the lowest RMSE.

## Exercise 13.3.2: How Many Model Fitting Steps?

Recall that the model fitting step is when coefficients are computed for the linear regression.

How many different model fitting steps occurred when `gscv.fit(X, y)` was run?


In [29]:
from sklearn.model_selection import GridSearchCV

ct_poly = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("polynomial", PolynomialFeatures(), ["Gr Liv Area"])
  ],
  remainder = "drop"
)

lr_pipeline_poly = Pipeline(
  [("preprocessing", ct_poly),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

degrees = {'preprocessing__polynomial__degree': np.arange(1, 10)}

gscv = GridSearchCV(lr_pipeline_poly, degrees, cv = 5, scoring='r2')

gscv_fitted = gscv.fit(X, y)

gscv_fitted.cv_results_

{'mean_fit_time': array([0.02292047, 0.03045945, 0.05675802, 0.04881821, 0.06248155,
        0.02457032, 0.02458339, 0.03252006, 0.02010498]),
 'std_fit_time': array([0.00788022, 0.01595853, 0.01175856, 0.01734774, 0.01753259,
        0.00819294, 0.00562686, 0.01066602, 0.00085389]),
 'mean_score_time': array([0.01445775, 0.02057714, 0.03355985, 0.0262846 , 0.0321712 ,
        0.01214347, 0.018891  , 0.01682734, 0.01379933]),
 'std_score_time': array([0.00459186, 0.0095181 , 0.00674114, 0.00772198, 0.00634146,
        0.00094362, 0.01079385, 0.00551433, 0.00391857]),
 'param_preprocessing__polynomial__degree': masked_array(data=[1, 2, 3, 4, 5, 6, 7, 8, 9],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value=999999),
 'params': [{'preprocessing__polynomial__degree': np.int64(1)},
  {'preprocessing__polynomial__degree': np.int64(2)},
  {'preprocessing__polynomial__degree': np.int64(3)},
  {'preprocessing__polynomial__

- number of unique degree values = 9
- number of folds = 5
- Total model fits = (# of unique degree values) * (# of folds) = # of different model fitting steps
- 9 * 5 = 45 different model fitting steps

Total there are 45 different model fitting steps. For each of the 9 polynomial degrees, the model is fit 5 times(# of k folds).

## Exercise 13.3.3: Grid Search with 100 Model Options

Consider one hundred modeling options for house price:

* House size, trying degrees 1 through 10
* Number of rooms, trying degrees 1 through 10
* Building Type

Hint: The dictionary of possible values that you make to give to GridSearchCV will have two elements instead of one.

Q1: Which model performed the best?

Q2: What downsides do you see of trying all possible model options? How might you go about choosing a smaller number of tuning values to try?

In [36]:
from sklearn.model_selection import GridSearchCV

ct_poly = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("polynomial", PolynomialFeatures(), ["Gr Liv Area"]),
    ("rooms", PolynomialFeatures(), ["TotRms AbvGrd"])
  ],
  remainder = "drop"
)

lr_pipeline_poly = Pipeline(
  [("preprocessing", ct_poly),
  ("linear_regression model", LinearRegression())]
)

degrees = {'preprocessing__rooms__degree': np.arange(1, 11),
           'preprocessing__polynomial__degree': np.arange(1, 11)}

gscv = GridSearchCV(lr_pipeline_poly, degrees, cv = 5, scoring="neg_mean_squared_error")

gscv.fit(X, y)

In [37]:
print("Best parameters:", gscv.best_params_)
print(f"Best RMSE: {np.sqrt(-gscv.best_score_):.2f}")

Best parameters: {'preprocessing__polynomial__degree': np.int64(3), 'preprocessing__rooms__degree': np.int64(1)}
Best RMSE: 52896.32


Answer to Q1: The model that performed the best was the model that used degree 1 for rooms and degree 3 for size.

Answer to Q2: Given there are 5 folds, as well as 10x10 different combinations, this leaves us with 500 models total. Fitting this many models and evaluating each of them would take a large amount of time and resources, and dependent on the number of polynomials used can result in overfitting; However, having access to this many models improves our chances of finding an optimal model.

Given each individual predictor, it would be easier to test individual predictors and small interactions first, before combining the predictors that resulted in the best models. Rather than using large numbers of predictors from the start, identifying singular useful predictors can help narrow the number of models which are best.