In [1]:
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(42)

import os
import tarfile
import urllib.request

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()


fetch_housing_data()


import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)


housing = load_housing_data()

housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

housing = strat_train_set.drop("median_house_value", axis=1)  # drop labels for training set
housing_labels = strat_train_set["median_house_value"].copy()

housing_num = housing.drop("ocean_proximity", axis=1)

from sklearn.base import BaseEstimator, TransformerMixin

# column index
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        return self  # nothing else to do

    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

housing_num_tr = num_pipeline.fit_transform(housing_num)

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)

In [8]:
from sklearn.ensemble import RandomForestRegressor

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {"bootstrap": [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}
]
forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [10]:
feature_importances = grid_search.best_estimator_.feature_importances_

In [11]:
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin

def indices_of_top_k(arr, k):
    return np.sort(np.argpartition(np.array(arr), -k)[-k:])

class TopFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, feature_importances, k):
        self.feature_importances = feature_importances
        self.k = k
    def fit(self, X, y=None):
        self.feature_indices_ = indices_of_top_k(self.feature_importances, self.k)
        return self
    def transform(self, X):
        return X[:, self.feature_indices_]

In [3]:
k = 5

In [12]:
top_k_feature_indices = indices_of_top_k(feature_importances, k)
top_k_feature_indices


array([ 0,  7,  8, 10, 13])

In [13]:
np.array(attributes)[top_k_feature_indices]

array(['longitude', 'median_income', 'income_cat', 'pop_per_hhold',
       'INLAND'], dtype='<U18')

In [14]:
sorted(zip(feature_importances, attributes), reverse=True)[:k]

[(0.28109934421321625, 'median_income'),
 (0.14802820617048226, 'income_cat'),
 (0.14108398990246232, 'INLAND'),
 (0.10844432520773334, 'pop_per_hhold'),
 (0.06387033161435096, 'longitude')]

In [15]:
preparation_and_feature_selection_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k))
])

In [16]:
housing_prepared_top_k_features = preparation_and_feature_selection_pipeline.fit_transform(housing)

In [17]:
housing_prepared_top_k_features[0:3]

array([[-1.15604281, -0.61493744, -0.95445595, -0.08649871,  0.        ],
       [-1.17602483,  1.33645936,  1.89030518, -0.03353391,  0.        ],
       [ 1.18684903, -0.5320456 , -0.95445595, -0.09240499,  0.        ]])

In [18]:
housing_prepared[0:3, top_k_feature_indices]

array([[-1.15604281, -0.61493744, -0.95445595, -0.08649871,  0.        ],
       [-1.17602483,  1.33645936,  1.89030518, -0.03353391,  0.        ],
       [ 1.18684903, -0.5320456 , -0.95445595, -0.09240499,  0.        ]])

In [19]:
prepare_select_and_predict_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k)),
    ('svm_reg', SVR(**{'C': 157055.10989448498, 'gamma': 0.26497040005002437, 'kernel': 'rbf'}))
])

In [20]:
prepare_select_and_predict_pipeline.fit(housing, housing_labels)

Pipeline(steps=[('preparation',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('attribs_adder',
                                                                   CombinedAttributesAdder()),
                                                                  ('std_scaler',
                                                                   StandardScaler())]),
                                                  ['longitude', 'latitude',
                                                   'housing_median_age',
                                                   'total_rooms',
                                                   'total_bedrooms',
                                                   'population', 'households',
                    

In [21]:
some_data = housing.iloc[:4]
some_labels = housing_labels.iloc[:4]

print("Predictions:\t", prepare_select_and_predict_pipeline.predict(some_data))
print("Labels:\t\t", list(some_labels))

Predictions:	 [199137.57052869 337892.61001075 174193.25762702  52098.86917316]
Labels:		 [286600.0, 340600.0, 196900.0, 46300.0]


In [22]:
param_grid = [{
    'preparation__num__imputer__strategy': ['mean', 'median', 'most_frequent'],
    'feature_selection__k': list(range(1, len(feature_importances) + 1))
}]

grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=5,
                                scoring='neg_mean_squared_error', verbose=2)
grid_search_prep.fit(housing, housing_labels)

Fitting 5 folds for each of 51 candidates, totalling 255 fits
[CV] END feature_selection__k=1, preparation__num__imputer__strategy=mean; total time=   9.7s
[CV] END feature_selection__k=1, preparation__num__imputer__strategy=mean; total time=   9.8s
[CV] END feature_selection__k=1, preparation__num__imputer__strategy=mean; total time=  10.2s
[CV] END feature_selection__k=1, preparation__num__imputer__strategy=mean; total time=   9.7s
[CV] END feature_selection__k=1, preparation__num__imputer__strategy=mean; total time=   9.5s
[CV] END feature_selection__k=1, preparation__num__imputer__strategy=median; total time=   9.2s
[CV] END feature_selection__k=1, preparation__num__imputer__strategy=median; total time=   9.5s
[CV] END feature_selection__k=1, preparation__num__imputer__strategy=median; total time=   9.5s
[CV] END feature_selection__k=1, preparation__num__imputer__strategy=median; total time=   9.7s
[CV] END feature_selection__k=1, preparation__num__imputer__strategy=median; total t

[CV] END feature_selection__k=6, preparation__num__imputer__strategy=median; total time=  10.9s
[CV] END feature_selection__k=6, preparation__num__imputer__strategy=most_frequent; total time=  10.8s
[CV] END feature_selection__k=6, preparation__num__imputer__strategy=most_frequent; total time=  11.6s
[CV] END feature_selection__k=6, preparation__num__imputer__strategy=most_frequent; total time=  11.7s
[CV] END feature_selection__k=6, preparation__num__imputer__strategy=most_frequent; total time=  10.9s
[CV] END feature_selection__k=6, preparation__num__imputer__strategy=most_frequent; total time=  11.3s
[CV] END feature_selection__k=7, preparation__num__imputer__strategy=mean; total time=  11.8s
[CV] END feature_selection__k=7, preparation__num__imputer__strategy=mean; total time=  11.9s
[CV] END feature_selection__k=7, preparation__num__imputer__strategy=mean; total time=  11.8s
[CV] END feature_selection__k=7, preparation__num__imputer__strategy=mean; total time=  12.2s
[CV] END feat

KeyboardInterrupt: 

In [None]:
grid_search_prep.best_params_