In [74]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt

from sklearn import linear_model, svm, ensemble, metrics
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler

In [75]:
# Load the Boston house-prices dataset bundled with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this notebook needs an older scikit-learn (or a different dataset) to run.
boston = load_boston()

In [76]:
# Sanity-check the dimensions of the feature matrix and the target vector,
# and list the feature names in column order.
boston.data.shape, boston.target.shape, boston.feature_names

((506, 13),
 (506,),
 array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
        'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7'))

In [77]:
# Print the dataset's built-in description, including what each attribute means.
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [78]:
# Range and mean of the target (median home value), to get a feel for its scale.
boston.target.min(), boston.target.max(), boston.target.mean()

(5.0, 50.0, 22.532806324110677)

In [79]:
# Hold out 25% of the samples as a test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target,
                                                    test_size=0.25,
                                                    random_state=33)

In [80]:
# Standardise features and target using statistics learned from the training
# split only; the test split is transformed with those same statistics.
scaler_X = StandardScaler()
scaler_X.fit(X_train)

# The 1-D target is presented to the scaler as a single column.
scaler_y = StandardScaler()
scaler_y.fit(y_train.reshape(-1, 1))

X_train, X_test = scaler_X.transform(X_train), scaler_X.transform(X_test)

# Flatten the scaled targets back to 1-D for the regressors.
y_train = scaler_y.transform(y_train.reshape(-1, 1)).ravel()
y_test = scaler_y.transform(y_test.reshape(-1, 1)).ravel()

In [81]:
# Confirm the shapes after scaling: X_train stays 2-D, y_train is 1-D again.
X_train.shape, y_train.shape

((379, 13), (379,))

In [82]:
def train_and_evaluate(clf, X_train, y_train, k=5):
    """Fit ``clf`` on the training data and print two score summaries.

    The estimator is fitted in place on the full training set, so it can be
    inspected or used for prediction afterwards. Two lines are printed:

    1. ``clf.score`` on the same data it was fitted on.
    2. The mean of ``cross_val_score`` over ``k`` shuffled folds.

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``score``
        Model to train and evaluate.
    X_train : training feature matrix.
    y_train : training targets.
    k : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    print(f'Coefficient of determination on training set: {train_score:.3f}')

    # Shuffled k-fold splitter; the fixed seed keeps the folds reproducible.
    folds = KFold(n_splits=k, shuffle=True, random_state=33)

    fold_scores = cross_val_score(clf, X_train, y_train, cv=folds)
    print(f'Average coefficient of determination using {k}-fold cross validation: {np.mean(fold_scores):.3f}')

## With Linear Regression

In [83]:
# Linear model fitted by stochastic gradient descent with a squared-error loss
# and no regularisation penalty.
# NOTE(review): loss='squared_loss' was renamed to 'squared_error' in scikit-learn 1.0
# (old name removed in 1.2) -- update it when moving to a newer scikit-learn.
clf_sgd = linear_model.SGDRegressor(loss='squared_loss',
                                    penalty=None,
                                    random_state=42)

In [84]:
# Fit the unpenalised SGD regressor and report its training and cross-validated scores.
train_and_evaluate(clf_sgd, X_train, y_train)

Coefficient of determination on training set: 0.750
Average coefficient of determination using 5-fold cross validation: 0.710


In [85]:
# Inspect the learned weights, one per (standardised) feature.
clf_sgd.coef_

array([-0.10270486,  0.09745246, -0.03757737,  0.0981797 , -0.12500087,
        0.34856003, -0.02949558, -0.27259783,  0.13212947, -0.08500994,
       -0.19865895,  0.0552727 , -0.40025048])

In [86]:
# A penalty term is added to the loss to discourage overfitting.
# L2 penalty -- based on the sum of the squared coefficients.
# L1 penalty -- based on the sum of the absolute values of the coefficients.
clf_sgd_1 = linear_model.SGDRegressor(loss='squared_loss',
                                    penalty='l2',
                                    random_state=42)
train_and_evaluate(clf_sgd_1, X_train, y_train)

Coefficient of determination on training set: 0.750
Average coefficient of determination using 5-fold cross validation: 0.710


In [87]:
# ^ Adding the L2 penalty brought no improvement: both scores are unchanged.

## With Support Vector Regressor

In [88]:
# Support vector regression with a linear kernel.
clf_svr = svm.SVR(kernel='linear')
train_and_evaluate(clf_svr, X_train, y_train)

Coefficient of determination on training set: 0.718
Average coefficient of determination using 5-fold cross validation: 0.709


In [89]:
# Support vector regression with a polynomial kernel.
clf_svr_poly = svm.SVR(kernel='poly', gamma='auto')
train_and_evaluate(clf_svr_poly, X_train, y_train)

Coefficient of determination on training set: 0.904
Average coefficient of determination using 5-fold cross validation: 0.776


In [90]:
# Support vector regression with a Radial Basis Function kernel (SVR's default kernel).
# Stored under its own name so the polynomial-kernel model in clf_svr_poly is not
# overwritten and the variable name matches the kernel actually used.
clf_svr_rbf = svm.SVR(kernel='rbf', gamma='auto')
train_and_evaluate(clf_svr_rbf, X_train, y_train)

Coefficient of determination on training set: 0.900
Average coefficient of determination using 5-fold cross validation: 0.834


## With Extra Trees (Extremely Randomized Trees)

In [91]:
# Extra-trees ensemble regressor built from 10 trees; the fixed random_state
# keeps the result reproducible.
clf_et = ensemble.ExtraTreesRegressor(n_estimators=10,
                                      random_state=42)

In [92]:
# Fit the extra-trees regressor and report its training and cross-validated scores.
train_and_evaluate(clf_et, X_train, y_train)

Coefficient of determination on training set: 1.000
Average coefficient of determination using 5-fold cross validation: 0.861


In [93]:
# Rank the features by the importance the fitted extra-trees model assigns them,
# most important first. Each entry is an (importance, feature_name) pair.
sorted(zip(clf_et.feature_importances_, boston.feature_names), 
       key=lambda t: t[0], 
       reverse=True)

[(0.35885071010150504, 'RM'),
 (0.2890206406589639, 'LSTAT'),
 (0.10444180644187656, 'PTRATIO'),
 (0.04696870667466452, 'TAX'),
 (0.04057802602808487, 'DIS'),
 (0.03010238432972301, 'INDUS'),
 (0.029523243152679852, 'NOX'),
 (0.023416634477817246, 'CHAS'),
 (0.021335225479437083, 'CRIM'),
 (0.018985896038400448, 'RAD'),
 (0.01673347719805981, 'AGE'),
 (0.01558540633107056, 'B'),
 (0.004457843087717255, 'ZN')]

In [94]:
def measure_performance(X, y, clf,
                        show_accuracy=True,
                        show_classification_report=True,
                        show_confusion_matrix=True,
                        show_r2_score=False):
    """Print the selected evaluation metrics for ``clf`` on ``(X, y)``.

    Predictions are computed once with ``clf.predict(X)`` and compared with
    ``y`` using whichever metrics are switched on. Nothing is returned.

    Parameters
    ----------
    X : samples to predict on.
    y : true targets for ``X``.
    clf : fitted estimator exposing ``predict``.
    show_accuracy : bool, default True
        Print ``metrics.accuracy_score``.
    show_classification_report : bool, default True
        Print ``metrics.classification_report``.
    show_confusion_matrix : bool, default True
        Print ``metrics.confusion_matrix``.
    show_r2_score : bool, default False
        Print ``metrics.r2_score`` (coefficient of determination).
    """
    predictions = clf.predict(X)

    if show_accuracy:
        accuracy = metrics.accuracy_score(y, predictions)
        print(f'Accuracy: {accuracy:.3f}')

    if show_classification_report:
        # Header is printed before the report is computed, as in the original flow.
        print('Classification report:')
        report = metrics.classification_report(y, predictions)
        print(report)

    if show_confusion_matrix:
        print('Confusion matrix')
        matrix = metrics.confusion_matrix(y, predictions)
        print(matrix)

    if show_r2_score:
        r2 = metrics.r2_score(y, predictions)
        print(f'Coefficient of determination: {r2:.3f}')

In [99]:
# Final check on the held-out test set. Only the coefficient of determination is
# requested; the classification-oriented metrics are switched off for this
# regression model.
measure_performance(X_test, y_test, clf_et, 
                    show_accuracy=False,
                    show_classification_report=False,
                    show_confusion_matrix=False,
                    show_r2_score=True)

Coefficient of determination: 0.803
