Once again consider four modeling options for house price:

Using only the size and number of rooms.
Using size, number of rooms, and building type.
Using size and building type, and their interaction.
Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.
Use cross_val_score with the pipelines you made earlier to find the cross-validated root mean squared error for each model.

Which do you prefer? Does this agree with your conclusion from earlier?

In [15]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.compose import ColumnTransformer

In [24]:
lr = LinearRegression()

from sklearn.model_selection import cross_val_score

X = ames.drop("SalePrice", axis = 1)
y = ames["SalePrice"]

Model 1

In [25]:

ct = ColumnTransformer(
  [
    #("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)

lr_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")


scores = cross_val_score(lr_pipeline_1, X, y, cv=5, scoring='r2')
scores

array([0.49734232, 0.48672143, 0.40178127, 0.53395268, 0.60124607])

In [26]:
scores.mean()

0.504208752508862

Model 2

In [36]:
#dummy

from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
).set_output(transform = "pandas")


lr_pipeline = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
)

#lr_pipeline

scores = cross_val_score(lr_pipeline, X, y, cv=5, scoring='r2')
scores
scores.mean()

0.5334561732637108

MODEL 3

In [37]:
# Step 1: Preprocess: Standardize Gr Liv Area and one-hot encode Bldg Type
ct_pre = ColumnTransformer(
    [
        ("standardize", StandardScaler(), ["Gr Liv Area"]),
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"])
    ],
    remainder="drop"
).set_output(transform="pandas")

ct_inter = ColumnTransformer(
  [
    ("interaction", PolynomialFeatures(interaction_only = True), ["standarize__Gr Liv Area", "dummify_Bldg Type_1Fam"]),
    ("interaction", PolynomialFeatures(interaction_only = True), ["standarize__Gr Liv Area", "dummify_Bldg Type_2fmCon"]),
    ("interaction", PolynomialFeatures(interaction_only = True), ["standarize__Gr Liv Area", "dummify_Bldg Type_Duplex"]),
    ("interaction", PolynomialFeatures(interaction_only = True), ["standarize__Gr Liv Area", "dummify_Bldg Type_Twnhs"]),
    ("interaction", PolynomialFeatures(interaction_only = True), ["standarize__Gr Liv Area", "dummify_Bldg Type_TwnhsE"]),
    ],
  remainder = "drop"
).set_output(transform = "pandas")

# Step 3: Build the pipeline
lr_pipeline = Pipeline(
    [
        ("preprocessing", ct_pre),  # Apply the standardization and one-hot encoding
        ("interaction", PolynomialFeatures(interaction_only=True, include_bias=False)),
        ("linear_regression", LinearRegression())  # Fit a linear regression model
    ]
)

scores = cross_val_score(lr_pipeline, X, y, cv=5, scoring='r2')
scores


array([0.55489258, 0.5421879 , 0.43095771, 0.58264186, 0.61362154])

In [38]:
scores.mean()

0.5448603188171477

MODEL 4: Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.

In [34]:
#Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.

ct_pre = ColumnTransformer(
    [
        ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"]),
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"])
    ],
    remainder="drop"
).set_output(transform="pandas")

ct_degree = ColumnTransformer(
    [
    ("degree", PolynomialFeatures(degree = 5), ["standardize__Gr Liv Area", "standardize__TotRms AbvGrd"]),
    ],
    remainder = "drop"
).set_output(transform = "pandas")

lr_pipeline = Pipeline(
    [
        ("preprocessing", ct_pre),  # Apply the standardization and one-hot encoding
        ("degree", ct_degree),
        ("linear_regression", LinearRegression())  # Fit a linear regression model
    ]
)

scores = cross_val_score(lr_pipeline, X, y, cv=5, scoring='r2')
scores


array([ 0.48950908,  0.45681703, -2.03006451,  0.54602216,  0.47540921])

In [35]:
scores.mean()

-0.012461403864299147

R^2:

Model 1: 0.504208752508862

Model 2: 0.5334561732637108

Model 3: 0.5448603188171477

Model 4: -0.012461403864299147

Model 3 has the highest R^2

Consider one hundred modeling options for house price:

House size, trying degrees 1 through 10
Number of rooms, trying degrees 1 through 10
Building Type
Hint: The dictionary of possible values that you make to give to GridSearchCV will have two elements instead of one.

Q1: Which model performed the best?

Q2: What downsides do you see of trying all possible model options? How might you go about choosing a smaller number of tuning values to try?

In [53]:
from sklearn.model_selection import GridSearchCV

ct_poly = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("polynomial_sqft", PolynomialFeatures(), ["Gr Liv Area"]),
    ("polynomial_rooms", PolynomialFeatures(), ["TotRms AbvGrd"])
  ],
  remainder = "drop"
)

lr_pipeline_poly = Pipeline(
  [("preprocessing", ct_poly),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

degrees = {'preprocessing__polynomial_sqft__degree': np.arange(1, 11),
            'preprocessing__polynomial_rooms__degree': np.arange(1, 11)}

gscv = GridSearchCV(lr_pipeline_poly, degrees, cv = 5, scoring='r2')

In [61]:
gscv_fitted = gscv.fit(X, y)

gscv_fitted.cv_results_

{'mean_fit_time': array([0.02610183, 0.01961937, 0.01910205, 0.01769028, 0.01947575,
        0.01838012, 0.0194943 , 0.05170045, 0.08079143, 0.0254333 ,
        0.02973409, 0.01698742, 0.01874795, 0.0191319 , 0.01775451,
        0.01845202, 0.04181266, 0.04545574, 0.06373434, 0.04492116,
        0.02948103, 0.01721311, 0.02092957, 0.0200223 , 0.01872888,
        0.03421555, 0.03349247, 0.02082009, 0.04859381, 0.02524891,
        0.03266764, 0.03081112, 0.02905812, 0.09429417, 0.08621364,
        0.05726805, 0.04913716, 0.12071638, 0.07423506, 0.02201157,
        0.02267365, 0.01815424, 0.02214355, 0.02623796, 0.04684038,
        0.02285314, 0.041119  , 0.04511476, 0.02529078, 0.03439593,
        0.02195501, 0.01852069, 0.03871398, 0.06497474, 0.09563646,
        0.1534677 , 0.1256557 , 0.0715919 , 0.18901005, 0.1056242 ,
        0.05243945, 0.07458849, 0.12125907, 0.18711953, 0.16648598,
        0.15064607, 0.14142981, 0.05725875, 0.05720367, 0.09707842,
        0.1590744 , 0.09533873,

In [62]:
gscv_fitted.cv_results_['params']
params_df = pd.DataFrame(gscv_fitted.cv_results_['params'])
params_df

Unnamed: 0,preprocessing__polynomial_rooms__degree,preprocessing__polynomial_sqft__degree
0,1,1
1,1,2
2,1,3
3,1,4
4,1,5
...,...,...
95,10,6
96,10,7
97,10,8
98,10,9


In [63]:
results_df = params_df.assign(score = gscv_fitted.cv_results_['mean_test_score'])
results_df.sort_values(by = 'score', ascending = False)


Unnamed: 0,preprocessing__polynomial_rooms__degree,preprocessing__polynomial_sqft__degree,score
2,1,3,0.557641
33,4,4,0.556932
12,2,3,0.556857
43,5,4,0.556414
22,3,3,0.554039
...,...,...,...
89,9,10,-16.188760
99,10,10,-16.188760
90,10,1,-184.221203
91,10,2,-189.473656
