From the Cleaned & Stratified Training Data
---

In [1]:
import pandas as pd
import numpy as np

In [2]:
strat_train_set = pd.read_csv("strat_train_set.csv")
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy().values

In [3]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-122.23,37.79,40.0,930.0,199.0,564.0,184.0,1.3281,NEAR BAY
1,-118.1,33.98,33.0,1927.0,482.0,1623.0,479.0,3.5268,<1H OCEAN
2,-117.08,32.77,31.0,1070.0,155.0,426.0,153.0,6.1628,NEAR OCEAN
3,-117.13,32.76,22.0,2623.0,732.0,1283.0,718.0,2.1563,NEAR OCEAN
4,-121.38,38.68,35.0,1643.0,298.0,831.0,305.0,4.0673,INLAND


Deal with Numeric and Categorical Data
---

Transformation Pipelines in SKLearn allow us to tranform our data in one swoop.

In [4]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, LabelBinarizer, Imputer

### Allows for the selection of a dataframe by index names

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

- `Imputer` rips out empty values and replaces them with the median of that data set
- `StandardScaler` does the feature scaling:
    - Two types of feature scaling:
        - `MinMaxScaler`: Normalization between M and N (typically 0 and 1, some algorithms require this)
        - `StandardScaler`: Subtracts mean and divides by variance (less affected by outliers)
- `LabelBinarizer` known as _one-hot encoding_, creates a 2D array (number of categories by number of entries) w/ a single 1 in each row to represent the category.
- `FeatureUnion` allows us to lump the 2 pipelines together.

In [6]:
numeric_attributes = list(housing.drop("ocean_proximity", axis=1))
categorical_attributes = ["ocean_proximity"]

numeric_pipeline = Pipeline([
    ('selector', DataFrameSelector(numeric_attributes)),
    ('imputer', Imputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('selector', DataFrameSelector(categorical_attributes)),
    ('binarizer', LabelBinarizer())
])

full_pipeline = FeatureUnion(transformer_list=[
    ("numeric_pipeline", numeric_pipeline),
    ("categorical_pipeline", categorical_pipeline)
])

housing_prepared = full_pipeline.fit_transform(housing)

Training
---

In [7]:
from sklearn.linear_model import LinearRegression

In [8]:
type(housing_prepared), type(housing_labels)

(numpy.ndarray, numpy.ndarray)

In [10]:
#lin_reg = LinearRegression()
#lin_reg.fit(housing_prepared, housing_labels)

In [12]:
#from sklearn.metrics import mean_squared_error
#housing_predictions = lin_reg.predict(housing_prepared)
#lin_mse = mean_squared_error(housing_labels, housing_predictions)
#lin_rmse = np.sqrt(lin_mse)
#lin_rmse

- `LinearRegression` is strongly underfitting data
    - Try another model `DecisionTreeRegressor`

In [13]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)

housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

#### Cross Validation

In [14]:
from sklearn.model_selection import cross_val_score

tree_scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-tree_scores)

def display_cross_val_scores(scores):
    print("Scores: {}".format(scores))
    print("Mean: {}".format(scores.mean()))
    print("StdDev: {}".format(scores.std()))
    
display_cross_val_scores(tree_rmse_scores)    

Scores: [ 60237.91820888  58576.29723038  59397.36086309  61076.21750618
  57358.34985593  60457.30818861  59671.7579223   57816.66111934
  58229.36397361  59228.26049748]
Mean: 59204.9495365805
StdDev: 1143.5380140842085


Linear Cross Validation
---

This one doesn't work. I don't really know why.

In [15]:
# lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
# lin_rmse_scores = np.sqrt(-lin_scores)
# display_cross_val_scores(lin_rmse_scores)

In [16]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)

housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
print(forest_rmse)

forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)

display_cross_val_scores(forest_rmse_scores)

19086.95264
Scores: [ 43371.87152382  44279.94700355  46118.9519665   45674.94064467
  42308.4005317   44915.97417323  42776.28403136  45188.1207885
  45209.10918035  45050.61793221]
Mean: 44489.421777589494
StdDev: 1206.1647822640973


Grid Search
---

In [23]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [2, 10, 30, 50, 70], 'max_features': [2, 4, 6, 8, 10]},
]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')

grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'max_features': [2, 4, 6, 8, 10], 'n_estimators': [2, 10, 30, 50, 70]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [24]:
grid_search.best_params_

{'max_features': 10, 'n_estimators': 70}

In [25]:
for mean_score, params in zip(grid_search.cv_results_["mean_test_score"], grid_search.cv_results_["params"]):
    print(np.sqrt(-mean_score), params)

58953.081171 {'max_features': 2, 'n_estimators': 2}
47842.062994 {'max_features': 2, 'n_estimators': 10}
45432.7590561 {'max_features': 2, 'n_estimators': 30}
45025.1023988 {'max_features': 2, 'n_estimators': 50}
44836.4644038 {'max_features': 2, 'n_estimators': 70}
55778.4679571 {'max_features': 4, 'n_estimators': 2}
45190.2484422 {'max_features': 4, 'n_estimators': 10}
43255.0558091 {'max_features': 4, 'n_estimators': 30}
42963.2154486 {'max_features': 4, 'n_estimators': 50}
42848.0789651 {'max_features': 4, 'n_estimators': 70}
55221.5914941 {'max_features': 6, 'n_estimators': 2}
44998.4767804 {'max_features': 6, 'n_estimators': 10}
43130.6156573 {'max_features': 6, 'n_estimators': 30}
42464.1284392 {'max_features': 6, 'n_estimators': 50}
42340.9011952 {'max_features': 6, 'n_estimators': 70}
52283.7194267 {'max_features': 8, 'n_estimators': 2}
44577.4761436 {'max_features': 8, 'n_estimators': 10}
42969.1539343 {'max_features': 8, 'n_estimators': 30}
42487.5945495 {'max_features': 8, 

In [27]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([  1.12825052e-01,   1.01617089e-01,   4.42278828e-02,
         2.78638464e-02,   2.54687600e-02,   3.60848298e-02,
         2.44652673e-02,   4.39667299e-01,   1.04361114e-02,
         1.69703531e-01,   2.92715211e-05,   1.39393590e-03,
         6.21712359e-03])

In [34]:
sorted(zip(feature_importances, (numeric_attributes + list(categorical_pipeline.classes_))), reverse=True)

[(0.43966729904938739, 'median_income'),
 (0.16970353127143692, 'INLAND'),
 (0.11282505183281684, 'longitude'),
 (0.10161708909186129, 'latitude'),
 (0.04422788276332916, 'housing_median_age'),
 (0.03608482977884369, 'population'),
 (0.027863846431707327, 'total_rooms'),
 (0.025468760016909307, 'total_bedrooms'),
 (0.024465267341276652, 'households'),
 (0.010436111409453309, '<1H OCEAN'),
 (0.0062171235885097424, 'NEAR OCEAN'),
 (0.0013939359033850567, 'NEAR BAY'),
 (2.9271521083280293e-05, 'ISLAND')]

Run the Test Set
---

In [36]:
strat_test_set = pd.read_csv("strat_test_set.csv")

housing_test =  strat_test_set.drop("median_house_value", axis=1)
housing_labels_test = strat_test_set["median_house_value"].copy()

housing_test_prepared = full_pipeline.transform(housing_test)

final_model = grid_search.best_estimator_

final_predictions = final_model.predict(housing_test_prepared)

final_mse = mean_squared_error(housing_labels_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [37]:
print(final_rmse)

41544.5823924
