## Predicting Housing Costs
This exercise is based on a tutorial from Packt's advanced machine learning course, available on Udemy (udemy.com).

In [1]:
import os
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso

In [3]:
# Load the Boston housing dataset and separate the feature matrix from the target.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell only runs on an older scikit-learn release.
boston = load_boston()
X, y = boston.data, boston.target


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [4]:
# Scale data and add polynomial features.
# The fitted transformers are kept in named variables (rather than discarded) so
# that predictions can later be mapped back to the target's original units via
# y_scaler.inverse_transform(...) and new samples can be transformed consistently.
# NOTE(review): every transformer here is fit on the full dataset before it is
# split into train and test sets, so test-set statistics leak into the
# preprocessing; consider fitting on the training portion only.
X_scaler = MinMaxScaler()
X_scaled = X_scaler.fit_transform(X)

# MinMaxScaler works on 2-D input, hence the reshape of the target into a single
# column. y remains a column array of shape (n_samples, 1) after this cell.
y_scaler = MinMaxScaler()
y = y_scaler.fit_transform(y.reshape(-1, 1))

# Degree-2 expansion: original features, their squares and pairwise products;
# include_bias=False omits the constant column.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_scaled_poly = poly.fit_transform(X_scaled)

In [9]:
# Split the expanded features and scaled target into training and test sets.
# Rows are shuffled with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled_poly, y, shuffle=True, random_state=314
)

In [11]:
# Baseline: unregularized linear regression on the polynomial features.
# score() is evaluated on both splits to expose any train/test gap.
lr = LinearRegression().fit(X_train, y_train)

print("Training set score: {:.2f}".format(lr.score(X_train, y_train)))
print("test set score: {:.2f}".format(lr.score(X_test, y_test)))

In [14]:
# Ridge regression with scikit-learn's default settings
ridge = Ridge()
ridge.fit(X_train, y_train)

print("Training set score: {:.2f}".format(ridge.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge.score(X_test, y_test)))

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [17]:
# Refit ridge regression with a larger regularization strength (alpha = 10)
ridge = Ridge(alpha=10.0)
ridge.fit(X_train, y_train)
print("Training set score: {:.2f}".format(ridge.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge.score(X_test, y_test)))

Training set score: 0.79
Test set score: 0.64


In [19]:
# Performance suffered with the previous alpha, so retune with weaker
# regularization (alpha = 0.1).
# NOTE(review): alpha is being picked by comparing test-set scores; a validation
# split or cross-validation would avoid tuning against the test set.
ridge = Ridge(alpha=0.1)
ridge.fit(X_train, y_train)
print("Training set score: {:.2f}".format(ridge.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge.score(X_test, y_test)))

Training set score: 0.93
Test set score: 0.77


In [None]:
# Lasso regression with default settings; also report how many coefficients
# the fitted model left non-zero.
lasso = Lasso()
lasso.fit(X_train, y_train)
print("Training set score: {:.2f}".format(lasso.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lasso.score(X_test, y_test)))
print("Number of features used: {}".format(np.count_nonzero(lasso.coef_)))

In [25]:
# The default alpha zeroed out every coefficient, so refit with a much smaller
# alpha and a higher iteration cap (max_iter) for the solver.
lasso = Lasso(alpha=0.0001, max_iter=100000)
lasso.fit(X_train, y_train)
print("Training set score: {:.2f}".format(lasso.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lasso.score(X_test, y_test)))
print("Number of features used: {}".format(np.count_nonzero(lasso.coef_)))

Training set score: 0.92
Test set score: 0.78
Number of features used: 46
