# Boston Housing data
This part of the tutorial loads data about Boston housing and median house prices. The goal is to predict the housing price in each district given a series of features.

Features include race, air quality and plot size.

In [6]:
# Load the data
import warnings
from sklearn.datasets import load_boston

warnings.filterwarnings('ignore')

boston = load_boston()
X_train = boston["data"][:100]
y_train = boston["target"][:100]
X_test = boston["data"][100:]
y_test = boston["target"][100:]

all_features = boston["feature_names"]


Next let's see some of the data. X is a matrix of all feature values. y is a vector of target data.

In [7]:
all_features = boston["feature_names"]
print str(all_features)
print 'y training size is %d' % len(y_train)
print 'X training size is %d x %d' % (len(X_train[0]), len(X_train))

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
y training size is 100
X training size is 13 x 100


# Linear regression

Let's try a couple of models. First a linear regression based on least squares. The variance should be close to 1 if the model is good. Don't be fooled by the R^2 on the training set. It's a line that does not describe the reality of taking on a real set of new random data.

In [8]:
from sklearn import linear_model
from sklearn.metrics import r2_score

clf_linear_simple = linear_model.LinearRegression()

clf_linear_simple.fit (X_train, y_train)
linear_r2 = r2_score(y_train, clf_linear_simple.predict(X_train))
print 'R^2 train is %f # bogus' % linear_r2
linear_test_r2 = r2_score(y_test, clf_linear_simple.predict(X_test))
print 'R^2 test is %f' % linear_test_r2


R^2 train is 0.880705 # bogus
R^2 test is -12.723166


# Random Forest
The random forest optimizes the results under the hood. This is the easy way. This is an overestimate of the R^2 again. This really is random. Try play twice and watch the score.

In [9]:
from sklearn.ensemble import RandomForestRegressor
rf_first = RandomForestRegressor(n_estimators=20, max_depth=4)
rf_first.fit(X_train, y_train)
linear_r2 = r2_score(y_train, clf.predict(X_train))
print('R^2 score: %.2f' % rf_first.score(X_test, y_test))

NameError: name 'clf' is not defined

In [5]:
# LET's plot it
import matplotlib.pyplot as plt
# ALLOW inline graphs
%matplotlib inline

# FOR THE student:
# Look at more graphs of features and find those that look to be the best
# fits

for feature_name in all_features[:2]:
    print 'Feature: %s' % feature_name
    #idx = list(all_features).index('LSTAT')
    idx = list(all_features).index(feature_name)
    plt.scatter([x[idx] for x in X_test], y_test,  color='black')

    plt.xticks(())
    plt.yticks(())

    plt.show()


#housing.main()

NameError: name 'all_features' is not defined

#Feature Selection
We try to find the best features using a series of random sets.

In [152]:
from collections import defaultdict
from sklearn.cross_validation import ShuffleSplit
import numpy as np


X = boston["data"]
Y = boston["target"]
 
rf = RandomForestRegressor()
scores = defaultdict(list)
 
#crossvalidate the scores on a number of different random splits of the data
for train_idx, test_idx in ShuffleSplit(len(X), 100, .3):
    X_train_shuf, X_test_shuf = X[train_idx], X[test_idx]
    Y_train_shuf, Y_test_shuf = Y[train_idx], Y[test_idx]
    r = clf.fit(X_train_shuf, Y_train_shuf)

    acc = r2_score(Y_test, clf.predict(X_test_shuf))
    for i in range(X.shape[1]):
        X_t = X_test_shuf.copy()
        np.random.shuffle(X_t[:, i])
        shuff_acc = r2_score(Y_test_shuf, clf.predict(X_t))

        scores[all_features[i]].append((acc-shuff_acc)/acc)

print "Features sorted by their score:"
sorted_features = sorted([(round(np.mean(score), 4), feat) for
              feat, score in scores.items()], reverse=True)
print sorted_features
good_features = [x[1] for x in sorted_features]

# FOR the student:
# Reduce this amount to remove features

FEATURE_COUNT = 13

print good_features[:FEATURE_COUNT]

# USE good features to train next model
X_selected_train = []
for line in X_train:
    x_out = []
    feature_num = 0
    for feature_name in all_features:
        if feature_name in good_features[:FEATURE_COUNT]:
            x_out.append(line[feature_num])
        feature_num += 1
    X_selected_train.append(x_out)

X_selected_test = []
for line in X_test:
    x_out = []
    feature_num = 0
    for feature_name in all_features:
        if feature_name in good_features[:FEATURE_COUNT]:
            x_out.append(line[feature_num])
        feature_num += 1
    X_selected_test.append(x_out)
    
clf_select = linear_model.LinearRegression()
#print 'Len1 %d' % len(X_selected)
clf_select.fit (X_selected_train, y_train)
print('R^2 selected train linear score: %.2f' % clf_select.score(X_selected_train, y_train))
print('R^2 selected test linear score: %.2f' % clf_select.score(X_selected_test, y_test))

    
# SCALE it
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

clf = linear_model.LinearRegression()

clf.fit(X_train_scaled, y_train)
X_test_scaled = scaler.fit_transform(X_test)

print('R^2 test scaled linear score: %.2f' % clf.score(X_test_scaled, y_test))

X_train_select_scaled = scaler.fit_transform(X_selected_train)
X_test_select_scaled = scaler.fit_transform(X_selected_test)


clf_select_scaled = linear_model.LinearRegression()
clf_select_scaled.fit(X_train_select_scaled, y_train)
print('R^2 test select scaled linear score: %.2f' % clf_select_scaled.score(X_test_select_scaled, y_test))


Features sorted by their score:
[(2.1105, 'AGE'), (2.1087, 'INDUS'), (2.0948, 'CHAS'), (2.0822, 'CRIM'), (2.0819, 'B'), (2.0611, 'ZN'), (1.9517, 'PTRATIO'), (1.9371, 'TAX'), (1.9355, 'NOX'), (1.8634, 'RAD'), (1.8517, 'RM'), (1.7591, 'DIS'), (1.5898, 'LSTAT')]
['AGE', 'INDUS', 'CHAS', 'CRIM', 'B', 'ZN', 'PTRATIO', 'TAX', 'NOX', 'RAD', 'RM', 'DIS', 'LSTAT']
R^2 selected train linear score: 0.88
R^2 selected test linear score: -12.72
R^2 test scaled linear score: 0.50
R^2 test select scaled linear score: 0.50
