# Apply linear regression on the load_boston dataset
# From scikit-learn to predict the house price

## Insight and Visualization on the data

In [1]:
from sklearn.datasets import load_boston
import numpy as np
import matplotlib.pyplot as pl
from sklearn import datasets
from sklearn.tree import DecisionTreeRegressor

In [2]:
%matplotlib inline

### #The features of each sample are stored in the data attribute of the dataset

In [3]:
city_data = datasets.load_boston()

### #Create our client's feature set to predict the house price

In [4]:
CLIENT_FEATURES = [[11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13]]

### #Initialize the housing prices and housing features

In [5]:
housing_prices = city_data.target
housing_features = city_data.data

### #Veiw the data

In [6]:
print("Boston data:", city_data.data)

Boston data: [[6.3200e-03 1.8000e+01 2.3100e+00 ... 1.5300e+01 3.9690e+02 4.9800e+00]
 [2.7310e-02 0.0000e+00 7.0700e+00 ... 1.7800e+01 3.9690e+02 9.1400e+00]
 [2.7290e-02 0.0000e+00 7.0700e+00 ... 1.7800e+01 3.9283e+02 4.0300e+00]
 ...
 [6.0760e-02 0.0000e+00 1.1930e+01 ... 2.1000e+01 3.9690e+02 5.6400e+00]
 [1.0959e-01 0.0000e+00 1.1930e+01 ... 2.1000e+01 3.9345e+02 6.4800e+00]
 [4.7410e-02 0.0000e+00 1.1930e+01 ... 2.1000e+01 3.9690e+02 7.8800e+00]]


In [7]:
print("Boston description:", city_data.DESCR)

Boston description: Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher 

### #The information about the class of each sample is stored in the *target* attribute of the dataset

In [8]:
boston = load_boston()
print(boston.data.shape)

(506, 13)


In [9]:
print(boston.target.shape)

(506,)


In [10]:
print(boston.target)

[24.  21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 15.  18.9 21.7 20.4
 18.2 19.9 23.1 17.5 20.2 18.2 13.6 19.6 15.2 14.5 15.6 13.9 16.6 14.8
 18.4 21.  12.7 14.5 13.2 13.1 13.5 18.9 20.  21.  24.7 30.8 34.9 26.6
 25.3 24.7 21.2 19.3 20.  16.6 14.4 19.4 19.7 20.5 25.  23.4 18.9 35.4
 24.7 31.6 23.3 19.6 18.7 16.  22.2 25.  33.  23.5 19.4 22.  17.4 20.9
 24.2 21.7 22.8 23.4 24.1 21.4 20.  20.8 21.2 20.3 28.  23.9 24.8 22.9
 23.9 26.6 22.5 22.2 23.6 28.7 22.6 22.  22.9 25.  20.6 28.4 21.4 38.7
 43.8 33.2 27.5 26.5 18.6 19.3 20.1 19.5 19.5 20.4 19.8 19.4 21.7 22.8
 18.8 18.7 18.5 18.3 21.2 19.2 20.4 19.3 22.  20.3 20.5 17.3 18.8 21.4
 15.7 16.2 18.  14.3 19.2 19.6 23.  18.4 15.6 18.1 17.4 17.1 13.3 17.8
 14.  14.4 13.4 15.6 11.8 13.8 15.6 14.6 17.8 15.4 21.5 19.6 15.3 19.4
 17.  15.6 13.1 41.3 24.3 23.3 27.  50.  50.  50.  22.7 25.  50.  23.8
 23.8 22.3 17.4 19.1 23.1 23.6 22.6 29.4 23.2 24.6 29.9 37.2 39.8 36.2
 37.9 32.5 26.4 29.6 50.  32.  29.8 34.9 37.  30.5 36.4 31.1 29.1 50.
 33.3 3

### #The names of the classes are stored in the *feature_names* 

In [11]:
print("boston.keys(): \n{}".format(boston.keys()))

boston.keys(): 
dict_keys(['data', 'target', 'feature_names', 'DESCR'])


In [12]:
print(boston.feature_names)

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']


### #Statistical Analysis and Data Exploration

In [16]:
n_samples, n_features = np.shape(housing_features)

In [18]:
print(n_features)
total_houses = n_samples

13


In [19]:
total_features = n_features

In [20]:
minimum_price = np.min(housing_prices)

In [21]:
maximum_price = np.max(housing_prices)

In [22]:
mean_price = np.mean(housing_prices)

In [23]:
median_price = np.median(housing_prices)

In [24]:
std_dev = np.std(housing_prices)

In [29]:
print("Boston Housing dataset statistics (in $1000's):\n")
print("Total number of houses:", total_houses)
print("Total number of features:", total_features)
print("Minimum house price:", minimum_price)
print("Maximum house price:", maximum_price)
print("Standard deviation of house price: {0:.3f}".format(std_dev))

Boston Housing dataset statistics (in $1000's):

Total number of houses: 506
Total number of features: 13
Minimum house price: 5.0
Maximum house price: 50.0
Standard deviation of house price: 9.188


## Imputation of Missing Values 

In [30]:
import numpy as np

In [31]:
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_val_score


In [32]:
rng = np.random.RandomState(0)

In [33]:
dataset = load_boston()
X_full, y_full = dataset.data, dataset.target
n_samples = X_full.shape[0]
n_features = X_full.shape[1]

### #Estimate the score on the entire dataset, with no missing values

In [34]:
estimator = RandomForestRegressor(random_state=0, n_estimators=100)
score = cross_val_score(estimator, X_full, y_full).mean()
print("Score with the entire dataset = %.2f" % score)

Score with the entire dataset = 0.56


In [35]:
missing_rate = 0.75
n_missing_samples = int(np.floor(n_samples * missing_rate))
missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                      dtype=np.bool),
                             np.ones(n_missing_samples,
                                     dtype=np.bool)))
rng.shuffle(missing_samples)
missing_features = rng.randint(0, n_features, n_missing_samples)

### #Estimate the score without the lines containing missing values

In [36]:
X_filtered = X_full[~missing_samples, :]
y_filtered = y_full[~missing_samples]
estimator = RandomForestRegressor(random_state=0, n_estimators=100)
score = cross_val_score(estimator, X_filtered, y_filtered).mean()
print("Score without the samples containing missing values = %.2f" % score)

Score without the samples containing missing values = 0.48


### #Estimate the score after imputation of the missing values

In [37]:
X_missing = X_full.copy()
X_missing[np.where(missing_samples)[0], missing_features] = 0
y_missing = y_full.copy()
estimator = Pipeline([("imputer", Imputer(missing_values=0,
                                          strategy="mean",
                                          axis=0)),
                      ("forest", RandomForestRegressor(random_state=0,
                                                       n_estimators=100))])
score = cross_val_score(estimator, X_missing, y_missing).mean()
print("Score after imputation of the missing values = %.2f" % score)

Score after imputation of the missing values = 0.57


## Selection of a Subset of Features

In [38]:
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import cross_val_score, ShuffleSplit



In [39]:
boston = load_boston()

In [40]:
X = boston["data"]
Y = boston["target"]
names = boston["feature_names"]

In [41]:
    rf = RandomForestRegressor(n_estimators=20, max_depth=4)
    scores = []
    for i in range(X.shape[1]):
        score = cross_val_score(rf, X[:, i:i + 1],
                                Y, scoring="r2", cv=ShuffleSplit(len(X), 3, .3))
        scores.append((round(np.mean(score), 3), names[i]))

In [42]:
print(scores) 

[(0.16, 'CRIM'), (0.215, 'ZN'), (0.361, 'INDUS'), (-0.017, 'CHAS'), (0.382, 'NOX'), (0.491, 'RM'), (0.085, 'AGE'), (-0.013, 'DIS'), (0.152, 'RAD'), (0.323, 'TAX'), (0.385, 'PTRATIO'), (0.118, 'B'), (0.615, 'LSTAT')]


## Model Evaluation with Error Metics

In [43]:
from sklearn.cross_validation import train_test_split

In [44]:
def shuffle_split_data(X, y):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    return X_train, y_train, X_test, y_test

In [45]:
try:
    X_train, y_train, X_test, y_test = shuffle_split_data(housing_features, housing_prices)
    print(shuffle_split_data)
except:
    print("Something went wrong with shuffling and splitting the data.")

<function shuffle_split_data at 0x10a1daf28>
