In [5]:
#What is cross-validation?
# In cross-validation, we run our modeling process on different subsets of the data to get multiple measures of model quality.

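# For example, we could begin by dividing the data into 5 pieces, each 20% of the full dataset. In this case, we say that we have broken the data into 5 "folds". We then run one experiment per fold: each fold takes a turn as the validation (holdout) set while the other four folds are used for training, so every row is used for validation exactly once.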

# When should you use cross-validation?
#Cross-validation gives a more accurate measure of model quality, which is especially important if you are making a lot of modeling decisions. However, it can take longer to run, because it estimates multiple models (one for each fold).

#So, given these tradeoffs, when should you use each approach?

#For small datasets, where extra computational burden isn't a big deal, you should run cross-validation.
#For larger datasets, a single validation set is sufficient. Your code will run faster, and you may have enough data that there's little need to re-use some of it for holdout.

In [6]:
import pandas as pd

# Path to the Melbourne dataset CSV, relative to this notebook
melbourne_file_path = '../Intro to Machine Learning/dataset/melb_data.csv'
# Parse the CSV into a DataFrame
data = pd.read_csv(melbourne_file_path)

# Target: the Price column
y = data['Price']

# Predictors: a fixed subset of the available columns
cols_to_use = [
    'Rooms',
    'Distance',
    'Landsize',
    'BuildingArea',
    'YearBuilt',
]
X = data[cols_to_use]

In [7]:
#Then, we define a pipeline that uses an imputer to fill in missing values and a random forest model to make predictions.

#While it's possible to do cross-validation without pipelines, it is quite difficult! Using a pipeline will make the code remarkably straightforward.

from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Two-stage pipeline: fill in missing values first, then fit a random forest
# with a fixed seed so repeated runs build the same trees.
my_pipeline = Pipeline(
    steps=[
        ('preprocessor', SimpleImputer()),
        ('model', RandomForestRegressor(n_estimators=50, random_state=0)),
    ]
)

In [8]:
# We obtain the cross-validation scores with the cross_val_score() function from 
# scikit-learn. We set the number of folds with the cv parameter.

from sklearn.model_selection import cross_val_score

# The 'neg_mean_absolute_error' scorer returns *negative* MAE, so flip the
# sign to get ordinary (positive) MAE values, one per fold.
scores = -cross_val_score(
    estimator=my_pipeline,
    X=X,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=5,  # number of folds
)

print("MAE scores:\n", scores)

MAE scores:
 [301628.7893587  303164.4782723  287298.331666   236061.84754543
 260383.45111427]


In [9]:
# A single number makes it easy to compare alternative models, so collapse
# the per-fold MAE values into their mean.
print("Average MAE score (across experiments):", scores.mean(), sep="\n")

Average MAE score (across experiments):
277707.3795913405
