<a href="https://www.kaggle.com/code/ollibolli/how-2-find-a-good-lambda?scriptVersionId=91026635" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

### Let's climb "the leaderboards"
i.e. improve our score from our last model

In [1]:
#import things needed for baseline and data 
#don't worry if there are some things you haven't seen yet, we'll go through it.
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LinearRegression, RidgeCV, Ridge
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error as mse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Predicting House Prices 💰🏡💰
### We'll work on the same model, but this time we'll do better. :happysteurer:

In [2]:
#same setup as the previous notebook (see it for line-by-line explanations)
data = fetch_openml(name="house_prices", as_frame=True)
#the feature frame comes back without the target, so attach it as a 'Price' column
ames = data.data
ames['Price'] = data.target
#this time we keep a few more features so we can show some interactions
kept_columns = ['LotArea', 'FullBath', 'Price', 'GrLivArea', 'YearBuilt']
housing = ames[kept_columns]

In [3]:
#inputs: the four predictor columns; target: the sale price
features = ['LotArea', 'FullBath', 'GrLivArea', 'YearBuilt']
X = housing[features]
y = housing['Price']
#hold out 20% of the rows as a test set; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
#candidate regularisation strengths to compare (lambda on the slides, `alpha` in sklearn)
lambdas = [0.1, 3, 5, 10, 100]
#number of cross-validation folds
n_folds = 10
#no random_state here: KFold only uses it when shuffle=True, and recent scikit-learn
#versions raise a ValueError if it is set while shuffle is left at False.
#train_test_split already shuffled the rows, so contiguous folds are fine.
kfold = KFold(n_splits=n_folds)



In [5]:
#manual k-fold cross-validation:
#for every candidate lambda, average the validation MSE over all folds
#and store it in `scores` (maps lambda -> mean validation MSE)
scores = dict()
for lam in lambdas:
    cv_score = 0
    #compute the average cv loss over folds for specific lambda "lam"
    for train_indices, validation_indices in kfold.split(X_train):
        #instantiate a new model/predictor with lambda = lam (here called alpha)
        model = Ridge(alpha = lam)
        #train on set(train minus fold)
        model.fit(X_train.iloc[train_indices], y_train.iloc[train_indices])
        #predict on fold and compute score
        val_pred = model.predict(X_train.iloc[validation_indices])
        #sklearn metrics take (y_true, y_pred) in that order; MSE is symmetric,
        #but keeping the order right matters for metrics that are not
        val_score = mse(y_train.iloc[validation_indices], val_pred)
        #add this fold's share to the running average
        cv_score += val_score / n_folds
    print(f"lambda: {lam} cv score was {cv_score}")
    scores[lam] = cv_score

lambda: 0.1 cv score was 2107715151.95979
lambda: 3 cv score was 2107425723.2078266
lambda: 5 cv score was 2107234346.5931623
lambda: 10 cv score was 2106783079.3001177
lambda: 100 cv score was 2102351968.5125046


In [6]:
#pick the lambda whose averaged cv loss is smallest, then report it
best_lambda = min(scores, key=scores.get)
print(f" Lambda with lowest cv was : {best_lambda}")

 Lambda with lowest cv was : 100


In [7]:
#step 3 on the slides onwards:
#refit on the whole training set using the lambda that won cross-validation
model = Ridge(alpha=best_lambda)
model.fit(X=X_train, y=y_train)
#etc.

Ridge(alpha=100)

# Now sklearn has built-in functions
`RidgeCV` (for Ridge specifically) or `cross_val_score` reduce the above to just one or two lines of code.
I wrote the loop out by hand only to help bring the slides home.

In [8]:
#RidgeCV runs the search over the candidate lambdas for us; fit() returns the fitted estimator
#note: with its default cv=None it uses an efficient leave-one-out scheme rather than
#the 10 folds of the manual loop, so the two searches are similar but not identical
ridge = RidgeCV(alphas=lambdas).fit(X_train, y_train)
print(f"Best Lambda {ridge.alpha_}")

Best Lambda 100.0
