In [1]:
from sklearn import datasets
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression

In [2]:
bean = datasets.load_boston()
print bean.DESCR

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [3]:
def load_boston():
    scaler = StandardScaler()
    boston = datasets.load_boston()
    X=boston.data
    y=boston.target
    X = scaler.fit_transform(X)
    return train_test_split(X,y)

In [4]:
X_train, X_test, y_train, y_test = load_boston()

In [5]:
X_train.shape

(379, 13)

In [6]:
clf = LinearRegression()
clf.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [7]:
zip (y_test, clf.predict(X_test))

[(8.0999999999999996, 3.9826348946747281),
 (19.300000000000001, 21.619590785253756),
 (20.100000000000001, 18.649733645690162),
 (19.899999999999999, 19.273369062358615),
 (10.199999999999999, 6.9849506736718681),
 (20.199999999999999, 16.447664846068704),
 (10.9, 14.286754761464804),
 (20.899999999999999, 22.111503182590646),
 (21.199999999999999, 21.755481360843941),
 (13.300000000000001, 13.944505916202559),
 (23.100000000000001, 24.807692093519037),
 (10.4, 7.3610063728028923),
 (23.800000000000001, 24.829465163244443),
 (20.399999999999999, 22.858709520762378),
 (13.199999999999999, 9.0208360572490616),
 (21.600000000000001, 24.959213486627018),
 (21.800000000000001, 21.170500519259395),
 (22.5, 29.647241998781091),
 (21.199999999999999, 21.765933684621128),
 (23.899999999999999, 24.934566392422191),
 (24.699999999999999, 24.650605015732609),
 (22.800000000000001, 26.795863815409895),
 (19.100000000000001, 17.00927215069898),
 (19.0, 14.017439903194166),
 (8.3000000000000007, 13.

So now the fun begins. "Impliment scikit learn's r2 and mse methods to measure the performance of my linear regressor."

I'll start by implementing the r2 method

In [32]:
from sklearn import metrics
rtest = metrics.r2_score(y_test, clf.predict(X_test))
rtrain = metrics.r2_score(y_train, clf.predict(X_train))

print "rtest:",rtest
print "rtrain:", rtrain

rtest: 0.800987911443
rtrain: 0.718528478954


Now I'll implement the mse method

In [35]:
MSE = metrics.mean_squared_error(y_test, clf.predict(X_test))

print "Mean Squared Error:", MSE

Mean Squared Error: 13.9456727168


We can also calculate the RMSE as well, by taking the square root of the MSE

In [36]:
import numpy as np
RMSE = np.sqrt(MSE)

print "RMSE:", RMSE

RMSE: 3.73439054155


Next is: "Impliment either sklearn.linear_model.Ridge or sklearn.linear_model.Lasso."

I think Lasso looks a bit easier, so I'll try it. I'll set alpha to 0.01 to start, as per the Gradient Descent learner.

In [79]:
from sklearn import linear_model

lasso = linear_model.Lasso(alpha = .01)
lasso.fit(X_train, y_train)

Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

Now I can output the test vs train scores

In [80]:
lassotest = lasso.score(X_test, y_test)
lassotrain = lasso.score(X_train, y_train)

print "Lasso test:", lassotest
print "Lasso train:", lassotrain

Lasso test: 0.801266863712
Lasso train: 0.718477882778
