In [45]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error

In [46]:
ames = pd.read_csv("AmesHousing.csv")
ames.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


**13.2.5 Practice Activity - Pipelines**

In [47]:
X = ames[["Gr Liv Area", "TotRms AbvGrd", "Bldg Type"]]
y = ames["SalePrice"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [48]:
# model 1 - size & number of rooms
ct1_std = ColumnTransformer(
  [
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)

lr_pipeline1 = Pipeline(
  [("preprocessing", ct1_std),
  ("linear_regression", LinearRegression())]

).set_output(transform = "pandas")

In [49]:
# model 2 - size, number of rooms, & building type
ct2_dum = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)

lr_pipeline2 = Pipeline(
  [("preprocessing", ct2_dum),
  ("linear_regression", LinearRegression())]
).set_output(transform = "pandas")

In [50]:
# model 3 - size & building type (w/interaction)
ct3_inter = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
        ("pass_size", "passthrough", ["Gr Liv Area"])
    ],
    remainder = "drop",
)

lr_pipeline3 = Pipeline(
    [
        ("preprocessing", ct3_inter),
        ("interact", PolynomialFeatures(degree = 2, include_bias = False, interaction_only = True)),
        ("linear_regression", LinearRegression()),
    ]
).set_output(transform = "pandas")

In [51]:
# model 4 - degree 5 polynomial
ct4_poly = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
        ("polynomial", PolynomialFeatures(degree = 5, include_bias = False), ["Gr Liv Area", "TotRms AbvGrd"])
    ],
    remainder = "drop",
)

lr_pipeline4 = Pipeline(
    [
        ("preprocessing", ct4_poly),
        ("lr", LinearRegression()),
    ]
).set_output(transform = "pandas")

In [52]:
models = {
    "Size & Rooms": lr_pipeline1,
    "Size, Rooms & Bldgtype": lr_pipeline2,
    "Size & Bldgtype (w/interaction)": lr_pipeline3,
    "Deg 5 Polynomial (Size, Rooms & Bldgtype)": lr_pipeline4,
}

In [53]:
rmse_all = pd.DataFrame()
for name, pipe in models.items():
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    rmse_all[name] = [np.sqrt(np.mean(y_test - preds)**2)]

rmse_all

Unnamed: 0,Size & Rooms,"Size, Rooms & Bldgtype",Size & Bldgtype (w/interaction),"Deg 5 Polynomial (Size, Rooms & Bldgtype)"
0,886.854495,889.094725,1088.603214,1691.613767


Model 2 with size, rooms, and building type is the model that performed best because it has the lowest RMSE of 3497.50, which is notably lower than the other 3 models, all of which had RMSE of 3700 or higher.

**13.3.1 Practice Activity - Cross Validation**

In [54]:
from sklearn.model_selection import cross_val_score

X = ames.drop("SalePrice", axis=1)
y = ames["SalePrice"]

scores = pd.DataFrame({
    "lr_pipeline1": cross_val_score(lr_pipeline1, X, y, cv=5, scoring='r2'),
    "lr_pipeline2": cross_val_score(lr_pipeline2, X, y, cv=5, scoring='r2'),
    "lr_pipeline3": cross_val_score(lr_pipeline3, X, y, cv=5, scoring='r2'),
    "lr_pipeline4": cross_val_score(lr_pipeline4, X, y, cv=5, scoring='r2'),
})
scores


Unnamed: 0,lr_pipeline1,lr_pipeline2,lr_pipeline3,lr_pipeline4
0,0.497342,0.531978,0.554927,0.499895
1,0.486721,0.532253,0.542188,0.496663
2,0.401781,0.428295,0.430958,0.081489
3,0.533953,0.565748,0.582642,-0.102651
4,0.601246,0.606138,0.613622,0.509659


In [55]:
scores.mean()

Unnamed: 0,0
lr_pipeline1,0.504209
lr_pipeline2,0.532882
lr_pipeline3,0.544867
lr_pipeline4,0.297011


Based on the cross validation scores, model 3, which uses size and building type including their interaction effect, is the best model due to it having the highest cross validation score of 0.545, which is only marginally higher than model 2. While this does not agree with my conclusion from earlier, models 2 and 3 have very similar cross validation scores of 0.533 and 0.545, respectively, so. I would say this is somewhat congruent with my previous congruent.

**13.3.3 Practice Activity - Tuning**

In [62]:
from sklearn.model_selection import GridSearchCV

size_pipeline = Pipeline([
    ("poly",  PolynomialFeatures(include_bias = False)),
    ("scale", StandardScaler())
])

rooms_pipeline = Pipeline([
    ("poly",  PolynomialFeatures(include_bias = False)),
    ("scale", StandardScaler())
])

bldg_pipeline = Pipeline([
    ("building", OneHotEncoder(sparse_output = False))
])

ct_tune = ColumnTransformer(
    [
        ("building", bldg_pipeline, ["Bldg Type"]),
        ("size", size_pipeline, ["Gr Liv Area"]),
        ("rooms", rooms_pipeline, ["TotRms AbvGrd"]),
    ],
    remainder = "drop"
)

pipeline_tune = Pipeline([
    ("preprocessing", ct_tune),
    ("linear_regression", LinearRegression())
]).set_output(transform = "pandas")

degrees = {
    "preprocessing__size__poly__degree":  np.arange(1, 11),
    "preprocessing__rooms__poly__degree": np.arange(1, 11)
}

gscv = GridSearchCV(pipeline_tune, degrees, cv = 5, scoring = "r2")

In [64]:
gscv_fitted = gscv.fit(X, y)

gscv_fitted.cv_results_

{'mean_fit_time': array([0.03274903, 0.02824445, 0.04108691, 0.04216623, 0.03987947,
        0.05242696, 0.05957179, 0.04390798, 0.02936554, 0.0285574 ,
        0.02713532, 0.02751279, 0.03010745, 0.02823906, 0.03409801,
        0.02862816, 0.02903972, 0.02869391, 0.03078527, 0.02942958,
        0.02835326, 0.02826324, 0.02961049, 0.02758808, 0.02918754,
        0.02854528, 0.02830043, 0.03111439, 0.02932582, 0.02966781,
        0.02956052, 0.05371594, 0.03584123, 0.09306459, 0.04681773,
        0.0735352 , 0.03358903, 0.05158477, 0.04579639, 0.03020601,
        0.02716413, 0.0272676 , 0.0427299 , 0.05799904, 0.02823634,
        0.03180609, 0.0287045 , 0.02969265, 0.03256712, 0.033601  ,
        0.03488016, 0.06497674, 0.05107732, 0.10211859, 0.04358149,
        0.05412207, 0.05122671, 0.07882152, 0.03109112, 0.03032742,
        0.02765169, 0.02912211, 0.02907867, 0.02977514, 0.02890368,
        0.02929945, 0.03725991, 0.03236032, 0.03008142, 0.04483156,
        0.05634246, 0.05285592,

In [66]:
gscv_fitted.cv_results_['mean_test_score']

array([ 5.32882439e-01,  5.37471938e-01,  5.57640613e-01,  5.49491829e-01,
        4.98820364e-01,  4.25486978e-01,  5.41367256e-01, -1.53645583e+00,
       -3.46805500e+01, -5.50168947e+02,  5.32382847e-01,  5.33567353e-01,
        5.56857221e-01,  5.50068971e-01,  5.03751463e-01,  4.38325330e-01,
        5.36559009e-01, -1.63950467e+00, -3.52005583e+01, -5.45226220e+02,
        5.35924169e-01,  5.34134134e-01,  5.54038993e-01,  5.50374031e-01,
        5.14483519e-01,  4.56694730e-01,  5.14777927e-01, -1.83971104e+00,
       -3.62979611e+01, -5.46836978e+02,  5.41528749e-01,  5.35417599e-01,
        5.50392399e-01,  5.56854942e-01,  5.35978602e-01,  5.26237547e-01,
        3.72326556e-01, -3.59948059e+00, -3.99601552e+01, -5.67701874e+02,
        5.41066183e-01,  5.30267305e-01,  5.46549151e-01,  5.55246689e-01,
        5.10664323e-01,  5.00762715e-01,  2.37332026e-01, -4.14234713e+00,
       -4.71072403e+01, -5.84457762e+02,  5.34862257e-01,  5.33313563e-01,
        5.45170737e-01,  

In [82]:
results_table = (
    pd.DataFrame(gscv_fitted.cv_results_)
)

results_table = results_table.sort_values("mean_test_score", ascending = False)
results_table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_preprocessing__rooms__poly__degree,param_preprocessing__size__poly__degree,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,0.041087,0.017031,0.025230,0.011032,1,3,"{'preprocessing__rooms__poly__degree': 1, 'pre...",0.545224,0.534884,0.512144,0.592701,0.603250,0.557641,0.034789,1
12,0.030107,0.003097,0.017689,0.002650,2,3,"{'preprocessing__rooms__poly__degree': 2, 'pre...",0.535670,0.538931,0.512221,0.595945,0.601520,0.556857,0.035455,2
33,0.093065,0.001416,0.034163,0.004664,4,4,"{'preprocessing__rooms__poly__degree': 4, 'pre...",0.538867,0.534870,0.499602,0.603065,0.607871,0.556855,0.042007,3
43,0.057999,0.032064,0.039612,0.028993,5,4,"{'preprocessing__rooms__poly__degree': 5, 'pre...",0.538854,0.533524,0.500126,0.596103,0.607627,0.555247,0.040477,4
22,0.029610,0.004315,0.020397,0.008721,3,3,"{'preprocessing__rooms__poly__degree': 3, 'pre...",0.535909,0.538952,0.504808,0.590477,0.600049,0.554039,0.035846,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39,0.030206,0.001588,0.017071,0.000380,4,10,"{'preprocessing__rooms__poly__degree': 4, 'pre...",0.536415,0.530288,-823.520169,-2016.661436,0.605535,-567.701874,791.666933,96
59,0.030327,0.000717,0.016931,0.000389,6,10,"{'preprocessing__rooms__poly__degree': 6, 'pre...",0.534518,0.533770,-764.168386,-2104.781233,0.600105,-573.456245,820.950421,97
49,0.033601,0.002654,0.017796,0.001229,5,10,"{'preprocessing__rooms__poly__degree': 5, 'pre...",0.536056,0.529148,-859.391983,-2064.565288,0.603259,-584.457762,811.545558,98
89,0.030871,0.000176,0.017240,0.000214,9,10,"{'preprocessing__rooms__poly__degree': 9, 'pre...",0.537211,0.508535,-945.422400,-1997.551436,0.587481,-588.268122,794.196353,99


1. The model that performed best had used "preprocessing__size__poly__degree" to the 3rd degree and "preprocessing__rooms__poly__degree" to the 1st degree, producing a mean test score of 0.558 with the highest test rank.

2. When trying all possible model options, there are some downsides like the time, energy, and cost of running all possible model options. It definitely is a more time-consuming and costly process. You also run the risk of overfitting the model because you might pick a model with too many predictors. You might go about choosing a smaller number of tuning values to try by looking at the shape of the data on a plot, so you know which degrees to try rather than testing such a wide range of values.