---
title: "Practice Activity 7.1: Cross-Validation and Tuning"
format: 
  html:
    theme: lux
---

# 7.1
# Part 1

In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Read the Ames housing CSV into a DataFrame
ames = pd.read_csv("/content/AmesHousing (1).csv")

# Response is the sale price; predictors are living area, room count and building type
y = ames['SalePrice']
X = ames[[
    'Gr Liv Area',
    'TotRms AbvGrd',
    'Bldg Type',
]]

# One fixed 80/20 split (seeded) so every model below is scored on the same held-out rows
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.2
)

# Model 1 -- predictors: living area and total rooms, both standardized.
# Columns not listed in the transformer (here, building type) are dropped.
ct_1 = ColumnTransformer(
    [("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])],
    remainder="drop",
)
model_1 = Pipeline(steps=[
    ("preprocessing", ct_1),
    ("linear_regression", LinearRegression()),
])

# Model 2 -- Model 1's standardized numeric predictors plus dummy
# variables for building type.
ct_2 = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
        ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"]),
    ],
    remainder="drop",
)
model_2 = Pipeline(steps=[
    ("preprocessing", ct_2),
    ("linear_regression", LinearRegression()),
])

# Model 3 -- standardized living area, building-type dummies (first level
# dropped), and interaction terms between the preprocessed columns.
ct_3 = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(drop='first', sparse_output=False), ["Bldg Type"]),
        ("standardize", StandardScaler(), ["Gr Liv Area"]),
    ],
    remainder="drop",
)
model_3 = Pipeline(steps=[
    ("preprocessing", ct_3),
    # interaction_only=True adds products of distinct columns, no pure powers
    ("interaction", PolynomialFeatures(include_bias=False, interaction_only=True)),
    ("linear_regression", LinearRegression()),
])

# Model 4 -- building-type dummies plus separate degree-5 polynomial
# expansions of living area and of total rooms (no cross terms between the
# two, because each expansion sees only its own column).
ct_4 = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
        ("poly_size", PolynomialFeatures(include_bias=False, degree=5), ["Gr Liv Area"]),
        ("poly_rooms", PolynomialFeatures(include_bias=False, degree=5), ["TotRms AbvGrd"]),
    ],
    remainder="drop",
)
model_4 = Pipeline(steps=[
    ("preprocessing", ct_4),
    ("linear_regression", LinearRegression()),
])

# Candidate pipelines and their display labels (kept in matching order)
models = [model_1, model_2, model_3, model_4]
model_names = ["Model 1", "Model 2", "Model 3", "Model 4"]
rmse_scores = {}

# Train each pipeline on the training split and record its RMSE on the
# held-out split under the pipeline's label.
for name, model in zip(model_names, models):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    rmse_scores[name] = rmse
    print(f"{name} RMSE: {rmse}")

# The label whose recorded RMSE is smallest
best_model = min(rmse_scores.items(), key=lambda item: item[1])[0]
print(f"\nBest model: {best_model} with RMSE of {rmse_scores[best_model]}")


Model 1 RMSE: 57988.68895994643
Model 2 RMSE: 56501.64843884793
Model 3 RMSE: 55999.430198952774
Model 4 RMSE: 56117.68770884771

Best model: Model 3 with RMSE of 55999.430198952774


# Part 2

In [4]:
from sklearn.model_selection import cross_val_score

# Helper: mean cross-validated RMSE computed from cross_val_score
def rmse_cross_val(model, X, y, cv=5):
    """Return the mean per-fold RMSE of ``model`` under cross-validation.

    Parameters
    ----------
    model : estimator
        Passed unchanged to ``cross_val_score``.
    X, y :
        Predictors and response, passed unchanged to ``cross_val_score``.
    cv : optional
        Forwarded as the ``cv`` argument of ``cross_val_score``. Defaults to
        5, which is the value this helper previously hard-coded.

    Returns
    -------
    float
        Average over folds of ``sqrt(MSE)``. Note this is the mean of the
        per-fold RMSEs, not the square root of the mean MSE.
    """
    # The scorer reports *negative* MSE (higher is better), so flip the sign
    # before taking the square root.
    neg_mse_scores = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_squared_error')
    fold_rmse = np.sqrt(-neg_mse_scores)
    return fold_rmse.mean()

# Score every candidate pipeline with cross-validation on the full data,
# storing the mean RMSE under the pipeline's label.
cross_val_rmse_scores = {}
for name, model in zip(model_names, models):
    rmse = rmse_cross_val(model, X, y)
    cross_val_rmse_scores[name] = rmse
    print(f"{name} Cross-validated RMSE: {rmse}")

# The label with the smallest cross-validated RMSE is the preferred model
best_cv_model = min(cross_val_rmse_scores.items(), key=lambda item: item[1])[0]
print(f"\nPreferred model based on cross-validated RMSE: {best_cv_model} with RMSE of {cross_val_rmse_scores[best_cv_model]}")


Model 1 Cross-validated RMSE: 55806.32634926364
Model 2 Cross-validated RMSE: 54140.66302092876
Model 3 Cross-validated RMSE: 53430.92197532814
Model 4 Cross-validated RMSE: 56303.18380514651

Preferred model based on cross-validated RMSE: Model 3 with RMSE of 53430.92197532814


# Part 3

In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer

# Re-read the Ames housing CSV so this cell does not rely on earlier cells
ames = pd.read_csv("/content/AmesHousing (1).csv")

# Response (sale price) and the three predictors used throughout
y = ames['SalePrice']
X = ames[[
    'Gr Liv Area',
    'TotRms AbvGrd',
    'Bldg Type',
]]

# Preprocessing: dummy-code building type and expand the two numeric
# predictors together with PolynomialFeatures. The degree is left at its
# default here because the grid search sets it.
ct_poly = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
        ("polynomial", PolynomialFeatures(), ["Gr Liv Area", "TotRms AbvGrd"]),
    ],
    remainder="drop",
)

# Preprocessing followed by ordinary least squares; transformers are asked
# to emit pandas output.
lr_pipeline_poly = Pipeline(steps=[
    ("preprocessing", ct_poly),
    ("linear_regression", LinearRegression()),
])
lr_pipeline_poly.set_output(transform="pandas")

# Search grid: polynomial degrees 1 through 9, addressed via the
# step-name__transformer-name__parameter path inside the pipeline
degrees = {'preprocessing__polynomial__degree': np.arange(1, 10)}

# 5-fold cross-validated grid search scored by R-squared, using all cores
gscv = GridSearchCV(
    estimator=lr_pipeline_poly,
    param_grid=degrees,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)
gscv_fitted = gscv.fit(X, y)

# Tabulate the mean cross-validated score for each degree that was searched.
# The degree labels are read back from the fitted search (one entry per
# candidate, in the same order as 'mean_test_score') instead of being typed
# out again, so the table cannot drift out of sync with the parameter grid.
searched_degrees = [
    params['preprocessing__polynomial__degree']
    for params in gscv_fitted.cv_results_['params']
]
cv_results = pd.DataFrame({
    "degrees": searched_degrees,
    "scores": gscv_fitted.cv_results_['mean_test_score']
})

# Best pipeline found by the search and its mean cross-validated score
best_model = gscv_fitted.best_estimator_
best_score = gscv_fitted.best_score_

print("Best Model:", best_model)
print("Best R-squared Score:", best_score)
print("\nCross-validated scores for each degree:\n", cv_results)


Best Model: Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('dummify',
                                                  OneHotEncoder(sparse_output=False),
                                                  ['Bldg Type']),
                                                 ('polynomial',
                                                  PolynomialFeatures(degree=3),
                                                  ['Gr Liv Area',
                                                   'TotRms AbvGrd'])])),
                ('linear_regression', LinearRegression())])
Best R-squared Score: 0.5410026448115971

Cross-validated scores for each degree:
    degrees      scores
0        1    0.532882
1        2    0.531259
2        3    0.541003
3        4    0.530984
4        5    0.399898
5        6   -1.410547
6        7  -20.793747
7        8 -132.190776
8        9 -568.868517


# Question 1
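The best model found by GridSearchCV is a polynomial regression model with a degree of 3 applied to Gr Liv Area and TotRms AbvGrd (cross-validated R-squared of about 0.541). The model includes:

- One-hot encoding for Bldg Type.
- No standard scaling: Gr Liv Area and TotRms AbvGrd enter the polynomial step on their original scale.
- A polynomial transformation with a degree of 3, fit jointly on both variables, so it also includes their interaction terms.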

# Question 2
Trying every model option can take a lot of time and processing power, especially with large datasets and high polynomial degrees. High-degree models also overfit, meaning they do not work well on new data: in the results above, the cross-validated R-squared drops to about 0.40 at degree 5 and falls below -500 by degree 9. To simplify, we could start with a smaller range of degrees (like 1 to 4) and only add complexity if needed. Another approach is using RandomizedSearchCV to sample from the range instead of testing every option, saving time while still finding good results.