In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [2]:
lr = LinearRegression()


In [3]:
ames = pd.read_csv("/Users/addierhee/Downloads/AmesHousing.csv")



Practice Activity:

Consider four possible models for predicting house prices:

Using only the size and number of rooms.
Using size, number of rooms, and building type.
Using size and building type, and their interaction.
Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.
Set up a pipeline for each of these four models.

Then, get predictions on the test set for each of your pipelines, and compute the root mean squared error. Which model performed best?

Note: You should only use the function train_test_split() one time in your code; that is, we should be predicting on the same test set for all three models.

In [4]:
#using only the size and the number of rooms to predict house prices

X = ames.drop("SalePrice", axis = 1)
y = ames["SalePrice"]

X_train, X_test, y_train, y_test = train_test_split(X, y)


from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(
  [
    #("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)


lr_pipeline = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
)

lr_pipeline

In [5]:
lr_pipeline_fitted = lr_pipeline.fit(X_train, y_train)

y_preds = lr_pipeline_fitted.predict(X_test)
r2_score(y_test, y_preds)

0.5661498970163275

In [6]:
from sklearn.metrics import mean_squared_error
import math

rmse = math.sqrt(mean_squared_error(y_test, y_preds))
rmse

54917.61305790573

MODEL 2

In [7]:
#dummy

from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
).set_output(transform = "pandas")


lr_pipeline2 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
)

lr_pipeline2

In [8]:
lr_fitted2 = lr_pipeline2.fit(X_train, y_train)

In [9]:
y_preds = lr_fitted2.predict(X_test)

r2_score(y_test, y_preds)

0.5936156684153276

In [10]:
# prompt: root mean square error using y_test and y_preds
from sklearn.metrics import mean_squared_error
import math

rmse = math.sqrt(mean_squared_error(y_test, y_preds))
rmse

53150.857787949346

MODEL 3

In [11]:
# Step 1: Preprocess: Standardize Gr Liv Area and one-hot encode Bldg Type
ct_pre = ColumnTransformer(
    [
        ("standardize", StandardScaler(), ["Gr Liv Area"]),
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"])
    ],
    remainder="drop"
).set_output(transform="pandas")

ct_inter = ColumnTransformer(
  [
    ("interaction", PolynomialFeatures(interaction_only = True), ["standarize__Gr Liv Area", "dummify_Bldg Type_1Fam"]),
    ("interaction", PolynomialFeatures(interaction_only = True), ["standarize__Gr Liv Area", "dummify_Bldg Type_2fmCon"]),
    ("interaction", PolynomialFeatures(interaction_only = True), ["standarize__Gr Liv Area", "dummify_Bldg Type_Duplex"]),
    ("interaction", PolynomialFeatures(interaction_only = True), ["standarize__Gr Liv Area", "dummify_Bldg Type_Twnhs"]),
    ("interaction", PolynomialFeatures(interaction_only = True), ["standarize__Gr Liv Area", "dummify_Bldg Type_TwnhsE"]),
    ],
  remainder = "drop"
).set_output(transform = "pandas")

# Step 3: Build the pipeline
lr_pipeline = Pipeline(
    [
        ("preprocessing", ct_pre),  # Apply the standardization and one-hot encoding
        ("interaction", PolynomialFeatures(interaction_only=True, include_bias=False)),
        ("linear_regression", LinearRegression())  # Fit a linear regression model
    ]
)

# Apply the pipeline to the training data (assuming X_train is defined)
model3_fitted = lr_pipeline.fit(X_train, y_train)

y_preds = model3_fitted.predict(X_test)

RMSE3 = math.sqrt(mean_squared_error(y_test, y_preds))
RMSE3



53032.58763516105

MODEL 4: Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.

In [12]:
#Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.

ct_pre = ColumnTransformer(
    [
        ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"]),
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"])
    ],
    remainder="drop"
).set_output(transform="pandas")

ct_degree = ColumnTransformer(
    [
    ("degree", PolynomialFeatures(degree = 5), ["standardize__Gr Liv Area", "standardize__TotRms AbvGrd"]),
    ],
    remainder = "drop"
).set_output(transform = "pandas")

lr_pipeline = Pipeline(
    [
        ("preprocessing", ct_pre),  # Apply the standardization and one-hot encoding
        ("degree", ct_degree),
        ("linear_regression", LinearRegression())  # Fit a linear regression model
    ]
)

model4_fitted = lr_pipeline.fit(X_train, y_train)

y_preds = model4_fitted.predict(X_test)

RMSE4 = math.sqrt(mean_squared_error(y_test, y_preds))
RMSE4



53641.60401679239

RMSE:

Model 1: 54917.613

Model 2: 53150.857

Model 3: 53032.587

Model 4: 53641.604

Model 3 has the lowest RMSE, making it the best predictive model.