In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
# Font sizes applied to every figure: axis labels at 14pt, tick labels at 12pt.
for rc_group, label_size in (('axes', 14), ('xtick', 12), ('ytick', 12)):
    mpl.rc(rc_group, labelsize=label_size)

In [4]:
# Load the raw housing data. Forward slashes are accepted as path separators
# on Windows, Linux and macOS, whereas a backslash-separated path only
# resolves on Windows.
df = pd.read_csv('datasets/raw/housing.csv')

In [5]:
# Bucket median income into five ordinal categories (labels 1-5) whose inner
# edges sit at the 25th, 50th, 75th and 95th percentiles of the column.
income_quantiles = df["median_income"].quantile([0.25, 0.50, 0.75, 0.95])
income_bin_edges = [0., *income_quantiles, np.inf]
df["income_cat"] = pd.cut(df["median_income"],
                          bins=income_bin_edges,
                          labels=[1, 2, 3, 4, 5])

In [6]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified 80/20 split, stratified on the income_cat column.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(df, df["income_cat"]))

# split() yields *positional* indices, so rows are selected with .iloc;
# label-based .loc only agrees while df carries the default 0..n-1 index.
# drop() returns new frames rather than mutating slices of df in place,
# which avoids SettingWithCopyWarning and keeps this cell re-runnable.
strat_train_set = df.iloc[train_index].drop(columns="income_cat")
strat_test_set = df.iloc[test_index].drop(columns="income_cat")

In [7]:
# Separate the target column from the predictors of the training set.
housing_labels = strat_train_set["median_house_value"].copy()
housing = strat_train_set.drop(columns="median_house_value")

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin

# Positional indices of the columns that CombinedAttributesAdder combines.
col_names = "total_rooms", "total_bedrooms", "population", "households"
rooms_ix, bedrooms_ix, population_ix, households_ix = map(
    housing.columns.get_loc, col_names)

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features to a 2-D array of housing attributes.

    The columns to combine are located through the module-level indices
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``households_ix``.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        When true, a third column (bedrooms / rooms) is appended after
        rooms-per-household and population-per-household.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` with the derived ratio columns appended on the right."""
        rooms = X[:, rooms_ix]
        households = X[:, households_ix]
        derived = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            derived.append(X[:, bedrooms_ix] / rooms)
        return np.c_[(X, *derived)]

# Demonstrate the transformer on the raw values and wrap the result back into
# a DataFrame that keeps the original row index and column names.
extra_columns = ["rooms_per_household", "population_per_household"]
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = pd.DataFrame(
    attr_adder.transform(housing.values),
    index=housing.index,
    columns=[*housing.columns, *extra_columns])
housing_extra_attribs.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,population_per_household
5612,-118.27,33.79,39.0,1417.0,359.0,1450.0,367.0,2.8462,<1H OCEAN,3.861035,3.950954
1174,-121.65,39.53,23.0,1387.0,325.0,640.0,289.0,1.4833,INLAND,4.799308,2.214533
16983,-122.29,37.56,36.0,805.0,140.0,445.0,139.0,5.8221,NEAR BAY,5.791367,3.201439
12262,-116.95,33.78,24.0,3409.0,804.0,1939.0,739.0,1.7303,INLAND,4.612991,2.623816
2223,-119.82,36.82,28.0,2268.0,336.0,752.0,330.0,5.2809,INLAND,6.872727,2.278788


In [9]:
# Split the predictors: the ocean_proximity column on its own, and every
# other column.
housing_cat = housing[["ocean_proximity"]]
housing_num = housing.drop(columns="ocean_proximity")

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Columns routed to each branch of the preprocessing transformer.
num_attribs = list(housing_num.columns)
cat_attribs = ["ocean_proximity"]

# Numeric branch: median imputation -> derived ratio columns -> standardisation.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
housing_num_tr = num_pipeline.fit_transform(housing_num)

# Full preprocessing: numeric branch plus one-hot encoding of the category.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])
housing_prepared = full_pipeline.fit_transform(housing)

In [14]:
# Display the output of full_pipeline.fit_transform(housing).
housing_prepared

array([[ 0.64715606, -0.86150647,  0.82749507, ...,  0.        ,
         0.        ,  0.        ],
       [-1.04020388,  1.82723421, -0.44497999, ...,  0.        ,
         0.        ,  0.        ],
       [-1.35970399,  0.90444342,  0.588906  , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 1.63560951, -0.67882199, -1.08121752, ...,  0.        ,
         0.        ,  0.        ],
       [-1.41961026,  0.95128559,  0.66843569, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.51236695, -0.63197982, -1.47886598, ...,  0.        ,
         0.        ,  0.        ]])

#### Question

Try a Support Vector Machine regressor (`sklearn.svm.SVR`), with various hyperparameters such as `kernel="linear"` (with various values for the `C` hyperparameter) or `kernel="rbf"` (with various values for the `C` and `gamma` hyperparameters). Don't worry about what these hyperparameters mean for now. How does the best `SVR` predictor perform?

In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

# Two hyperparameter families: a linear kernel (3 values of C) and an RBF
# kernel (4 values of C x 4 values of gamma), i.e. 19 candidates in total.
linear_grid = {'kernel': ['linear'], 'C': [10.0, 100.0, 1000.0]}
rbf_grid = {
    'kernel': ['rbf'],
    'C': [1.0, 10.0, 100.0, 1000.0],
    'gamma': [0.01, 0.05, 0.1, 1.0],
}
param_grid = [linear_grid, rbf_grid]

# NOTE(review): this search ranks candidates by median absolute error, while
# the other searches in this notebook use neg_mean_squared_error -- confirm
# the metric is intentional before comparing their results.
svm_reg = SVR()
grid_search = GridSearchCV(estimator=svm_reg, param_grid=param_grid, cv=5,
                           scoring='neg_median_absolute_error', verbose=2)
grid_search.fit(housing_prepared, housing_labels)

Fitting 5 folds for each of 19 candidates, totalling 95 fits
[CV] END ..............................C=10.0, kernel=linear; total time=   7.8s
[CV] END ..............................C=10.0, kernel=linear; total time=   7.8s
[CV] END ..............................C=10.0, kernel=linear; total time=  12.4s
[CV] END ..............................C=10.0, kernel=linear; total time=  13.9s
[CV] END ..............................C=10.0, kernel=linear; total time=  13.4s
[CV] END .............................C=100.0, kernel=linear; total time=  14.5s
[CV] END .............................C=100.0, kernel=linear; total time=  16.3s
[CV] END .............................C=100.0, kernel=linear; total time=  10.9s
[CV] END .............................C=100.0, kernel=linear; total time=   7.5s
[CV] END .............................C=100.0, kernel=linear; total time=   9.2s
[CV] END ............................C=1000.0, kernel=linear; total time=  16.3s
[CV] END ............................C=1000.0, k

In [18]:
# Hyperparameter combination with the best cross-validated score in the SVR grid search.
grid_search.best_params_

{'C': 1000.0, 'gamma': 0.1, 'kernel': 'rbf'}

#### Question

Try replacing `GridSearchCV` with `RandomizedSearchCV`.

In [None]:
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV

# C and gamma are drawn from continuous log-uniform distributions that span
# the same ranges as the previous fixed lists (C: 0.1 to 1000, gamma: 1e-4 to
# 1). With plain lists RandomizedSearchCV merely samples points of a fixed
# grid; continuous distributions let each of the n_iter draws probe a new
# value. The kernel stays a list, which is sampled uniformly.
param_distributions = {
    'C': loguniform(0.1, 1000),
    'gamma': loguniform(0.0001, 1),
    'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
}

svr = SVR()

# random_state fixes the sampled candidates so the search is reproducible.
random_search = RandomizedSearchCV(estimator=svr, param_distributions=param_distributions,
                                   n_iter=10, cv=5, n_jobs=-1, verbose=2, random_state=42,
                                   scoring='neg_mean_squared_error')

random_search.fit(housing_prepared, housing_labels)

In [None]:
# Hyperparameter combination with the best cross-validated score in the randomized search.
random_search.best_params_

#### Question 

Try adding a transformer in the preparation pipeline to select only the most important attributes.

In [44]:
from sklearn.base import BaseEstimator, TransformerMixin

class TopFeatureSelector(BaseEstimator, TransformerMixin):
    """Keep only the ``k`` columns with the largest importance scores.

    Parameters
    ----------
    feature_importances : array-like
        One importance score per input column.
    k : int
        Number of columns to keep; must satisfy
        ``0 <= k <= len(feature_importances)``.
    """

    def __init__(self, feature_importances, k):
        self.feature_importances = feature_importances
        self.k = k

    def identify_top_features(self):
        """Return the sorted column indices of the ``k`` highest scores.

        Raises
        ------
        ValueError
            If ``k`` is negative or exceeds the number of scores.
        """
        importances = np.asarray(self.feature_importances)
        n_features = len(importances)
        if not 0 <= self.k <= n_features:
            raise ValueError(
                f"k must be between 0 and {n_features}, got {self.k!r}")
        if self.k == 0:
            # Guard the slice below: [-0:] would select *every* index
            # instead of none.
            return []
        top_feature_indices = np.argpartition(importances, -self.k)[-self.k:]
        return list(np.sort(top_feature_indices))

    def fit(self, x, y=None):
        """Compute and store the selected column indices; returns ``self``."""
        self.features = self.identify_top_features()
        return self

    def transform(self, x):
        """Return the columns of ``x`` selected during ``fit``."""
        return x[:, self.features]

In [38]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 200-tree forest on the prepared training data; its
# feature_importances_ feed TopFeatureSelector in the cells below.
forest_params = {"n_estimators": 200, "random_state": 42}
random_forest_regressor = RandomForestRegressor(**forest_params)
random_forest_regressor.fit(housing_prepared, housing_labels)

In [45]:
# Number of top-ranked features kept by the selector.
k = 5

# Preprocess, then keep only the k columns the forest ranked highest.
top_k_selector = TopFeatureSelector(random_forest_regressor.feature_importances_, k)
preprocessing_and_feature_selection_pipeline = Pipeline(steps=[
    ('preprocessing', full_pipeline),
    ('feature_selection', top_k_selector),
])

preprocessing_and_feature_selection_pipeline.fit_transform(housing)

array([[ 0.64715606, -0.86150647, -0.53824339,  0.07336569,  0.        ],
       [-1.04020388,  1.82723421, -1.25617039, -0.07640998,  1.        ],
       [-1.35970399,  0.90444342,  1.02935436,  0.00871599,  0.        ],
       ...,
       [ 1.63560951, -0.67882199, -0.88812021, -0.0494909 ,  1.        ],
       [-1.41961026,  0.95128559, -0.01274337, -0.01450361,  0.        ],
       [ 0.51236695, -0.63197982,  3.69393867,  0.03654278,  0.        ]])

#### Question

Try creating a single pipeline that does the full data preparation plus the final prediction.

In [46]:
# Number of top-ranked features passed on to the final regressor.
k = 5

# End-to-end pipeline: preprocessing -> top-k feature selection -> forest.
prediction_steps = [
    ('preprocessing', full_pipeline),
    ('feature_selection',
     TopFeatureSelector(random_forest_regressor.feature_importances_, k)),
    ('forest', RandomForestRegressor(n_estimators=100, random_state=42)),
]
preprocess_and_predict_pipeline = Pipeline(steps=prediction_steps)

In [48]:
# Fit every step of the pipeline on the raw training predictors and labels.
preprocess_and_predict_pipeline.fit(housing, housing_labels)

In [50]:
# Predict on the same rows the pipeline was fitted on (training-set predictions).
preprocess_and_predict_pipeline.predict(housing)

array([170772.  ,  64191.98, 287775.  , ...,  60639.  , 257311.  ,
       499833.99])

#### Question

Automatically explore some preparation options using `GridSearchCV`.

In [58]:
# Make the one-hot encoder tolerate categories that are absent from a CV
# training fold. The option is applied with set_params() so it lands on the
# encoder template that GridSearchCV clones for every fit; assigning it on
# full_pipeline.named_transformers_["cat"] would only change the already
# fitted copy, which cloning discards.
full_pipeline.set_params(cat__handle_unknown='ignore')

# Preparation options to explore: imputation strategy x number of kept features.
param_grid = [{
    'preprocessing__num__imputer__strategy': ['mean', 'median'],
    'feature_selection__k': list(range(1, 5))
}]

grid_search_prep = GridSearchCV(preprocess_and_predict_pipeline, param_grid, cv=5,
                                scoring='neg_mean_squared_error', verbose=2)

grid_search_prep.fit(housing, housing_labels)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END feature_selection__k=1, preprocessing__num__imputer__strategy=mean; total time=   1.6s
[CV] END feature_selection__k=1, preprocessing__num__imputer__strategy=mean; total time=   1.5s
[CV] END feature_selection__k=1, preprocessing__num__imputer__strategy=mean; total time=   1.5s
[CV] END feature_selection__k=1, preprocessing__num__imputer__strategy=mean; total time=   1.5s
[CV] END feature_selection__k=1, preprocessing__num__imputer__strategy=mean; total time=   1.5s
[CV] END feature_selection__k=1, preprocessing__num__imputer__strategy=median; total time=   1.8s
[CV] END feature_selection__k=1, preprocessing__num__imputer__strategy=median; total time=   1.7s
[CV] END feature_selection__k=1, preprocessing__num__imputer__strategy=median; total time=   1.8s
[CV] END feature_selection__k=1, preprocessing__num__imputer__strategy=median; total time=   4.4s
[CV] END feature_selection__k=1, preprocessing__num__imputer__strate

In [59]:
# Preparation options with the best cross-validated score.
grid_search_prep.best_params_

{'feature_selection__k': 4, 'preprocessing__num__imputer__strategy': 'mean'}