# Select and Train a Model

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Location of the already-prepared training features (CSV produced by the
# data-preparation step).
data_prep = os.path.join("..", "datasets", "housing", "data_prep", "housing_prep.csv")

# Read the prepared features into a DataFrame and display it.
housing_prepared = pd.read_csv(filepath_or_buffer=data_prep)
housing_prepared

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,population_per_household,bedrooms_per_room,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,0.245020,0.504782,0.725490,0.039731,0.056218,0.019816,0.062920,0.152019,0.024826,0.001128,0.067348,1.0,0.0,0.0,0.0,0.0
1,0.241036,0.479277,0.254902,0.017119,0.017075,0.008492,0.020724,0.408374,0.034653,0.001622,0.043996,1.0,0.0,0.0,0.0,0.0
2,0.712151,0.024442,0.588235,0.049499,0.075548,0.026150,0.085885,0.162908,0.021983,0.001073,0.073633,0.0,0.0,0.0,0.0,1.0
3,0.472112,0.400638,0.470588,0.046828,0.059439,0.040836,0.065534,0.095447,0.029137,0.002771,0.059064,0.0,1.0,0.0,0.0,0.0
4,0.573705,0.179596,0.313725,0.167523,0.245329,0.124891,0.272778,0.174811,0.023976,0.001896,0.070047,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16507,0.619522,0.176408,0.882353,0.032177,0.037693,0.015976,0.038835,0.305603,0.034962,0.001639,0.053591,0.0,1.0,0.0,0.0,0.0
16508,0.676295,0.142402,0.764706,0.030269,0.047036,0.029401,0.047797,0.108157,0.024899,0.002724,0.075265,0.0,1.0,0.0,0.0,0.0
16509,0.791833,0.164718,0.156863,0.123340,0.140142,0.058718,0.142457,0.191197,0.037051,0.001650,0.051403,0.0,1.0,0.0,0.0,0.0
16510,0.631474,0.136026,0.588235,0.049702,0.060889,0.037921,0.066094,0.245693,0.031078,0.002508,0.056546,1.0,0.0,0.0,0.0,0.0


In [3]:
# Load the stratified training set.
trainset_path = os.path.join("..", "datasets", "housing", "train", "housing_strat_train.csv")
strat_train_set = pd.read_csv(trainset_path)

# Predictors: every column except the target and the "id" column.
# A single non-mutating drop keeps this cell safe to re-run.
housing = strat_train_set.drop(columns=["median_house_value", "id"])

# Labels: the value the model has to predict.
housing_labels = strat_train_set["median_house_value"]

# Numeric columns only.
numeric_housing = housing.drop(columns=["ocean_proximity"])

# Categorical column only (kept as a one-column DataFrame).
housing_cat = housing[["ocean_proximity"]]

Next, we train a Linear Regression model and try it out on some instances from the training set.<br/>
To transform those raw instances, we first rebuild and fit the data preparation pipeline:

In [4]:
# Custom Transformer
# Positions of the raw columns used to build the ratio features. They index the
# numeric feature matrix handed to the transformer (assumed to follow the column
# order of `numeric_housing` -- verify if the dataset layout changes).
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features to a 2-D numeric feature matrix.

    Two columns are always appended: rooms per household and population per
    household. A third column, bedrooms per room, is appended when
    ``add_bedrooms_per_room`` is true.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        Whether ``transform`` also appends the bedrooms-per-room ratio.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right.

        Parameters
        ----------
        X : array-like with two dimensions
            Numeric feature matrix; the columns at ``rooms_ix``,
            ``bedrooms_ix``, ``population_ix`` and ``households_ix`` are read.
        y : ignored
            Present only for API compatibility.

        Returns
        -------
        numpy.ndarray
            ``X`` followed by two or three extra columns.
        """
        # Positional slicing (X[:, i]) only works on arrays; coercing here lets
        # the transformer also accept DataFrames and nested lists. An ndarray
        # input is passed through unchanged.
        X = np.asarray(X)

        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        return np.c_[X, rooms_per_household, population_per_household]
        
# Stand-alone transformer instances. The pipelines below build their own
# instances and do not reference these three objects.
imputer = SimpleImputer(strategy="median")
cat_enconder = OneHotEncoder()
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=True)

# Column groups routed to each branch of the ColumnTransformer.
num_attribs = numeric_housing.columns.tolist()
cat_attribs = ["ocean_proximity"]

# Numeric branch: fill missing values with the median, append the ratio
# features, then rescale.
# NOTE: the last step is named 'std_scaler' but it is a MinMaxScaler.
std_num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attrib_adder", CombinedAttributesAdder(add_bedrooms_per_room=True)),
    ("std_scaler", MinMaxScaler()),
])

# Full preparation: numeric branch plus one-hot encoding of the category.
full_pipeline = ColumnTransformer(transformers=[
    ("num", std_num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

# Fit every transformer on the raw predictors so that new raw instances can be
# transformed later.
full_pipeline.fit(housing)

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('num',
                                 Pipeline(memory=None,
                                          steps=[('imputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='median',
                                                                verbose=0)),
                                                 ('attrib_adder',
                                                  CombinedAttributesAdder(add_bedrooms_per_room=True)),
                                                 ('std_scaler',
          

In [5]:
# Train a linear regression model on the prepared features and their labels.
lin_reg = LinearRegression()
lin_reg.fit(X=housing_prepared, y=housing_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [6]:
# Take the first five raw instances together with their true labels.
some_data = housing.head(5)
some_lables = housing_labels.head(5)

# Raw instances go through the fitted preparation pipeline before prediction.
some_data_prepared = full_pipeline.transform(some_data)

print("Predictions:", lin_reg.predict(some_data_prepared))
print("Labels:", list(some_lables))

Predictions: [210644.60459286 317768.80697211 210956.43331178  59218.98886849
 189747.55849879]
Labels: [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]


## Evaluation
We can use scikit-learn's `mean_squared_error()` function to compute the model's MSE on the whole training set, and then take its square root to obtain the RMSE.

In [9]:
# Predict on the same data the model was trained on, then compute the mean
# squared error against the true labels.
housing_predictions = lin_reg.predict(X=housing_prepared)
lin_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
lin_mse

4709829587.97112

In [11]:
# RMSE is the square root of the MSE computed in the previous cell.
lin_rmse = np.sqrt(lin_mse)
lin_rmse

68628.19819848922

The RMSE shows that the model is underfitting the training data. The `median_house_value` typically ranges between \$120,000 and \$265,000, so a typical prediction error of \$68,628 isn't a good score.<br/>

When this happens, it can mean that the features don't provide enough information to make good predictions, or that the model isn't powerful enough.
We have 3 options to improve the result:<br/>
1- Select a more powerful model<br/>
2- Feed the algorithm with better features<br/>
3- Reduce the constraints on the model<br/>

Given that the model isn't regularized, we discard option 3. We could try to add more features or use a more powerful model.