## Chapter 2 -  End-to-End Machine Learning Project

## Exercises

In [1]:
import pickle

import pandas as pd
import numpy as np

import joblib
from scipy.stats import expon, reciprocal
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score, GridSearchCV, RandomizedSearchCV

### Ingestion

In [2]:
# Load the engineered features and the target from their separate CSV files.
df_features = pd.read_csv('housing_X_feateng_complete.csv')
df_result = pd.read_csv('housing_y_feateng_complete.csv')
# Combine both frames column-wise; DataFrame.join aligns rows on the index.
# NOTE(review): assumes both files hold the same rows in the same order — confirm.
df = df_features.join(df_result)

In [3]:
# For testing
# display(df.head())

### Train-Test Split

Use a stratified sampling strategy so that the train and test sets preserve the distribution of median-income categories.

In [4]:
# Obtain the column to stratify on: bucket median income into categories
# (income / 1.5, rounded up) and cap every category at 5.0.
income_category = np.ceil(df['median_income'] / 1.5)
# Vectorized cap: keep values <= 5.0 and replace everything else with 5.0
# (this also maps NaN to 5.0, exactly like a per-row `x if x <= 5.0 else 5.0`).
df['p1_median_income_category'] = income_category.where(income_category <= 5.0, 5.0)

# Train Test Split - Stratified strategy (80/20 split, fixed seed)
shuffle_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
# n_splits=1, so the generator yields a single (train, test) pair of
# positional index arrays; take it without materializing a list.
df_splits = next(shuffle_split.split(df, df['p1_median_income_category']))
train_idx, test_idx = df_splits

# Select the rows of each split and remove the column to stratify on.
# drop() returns a new frame, so no in-place mutation or extra copy is needed.
df_strat_train = df.iloc[train_idx].drop(columns='p1_median_income_category')
df_strat_test = df.iloc[test_idx].drop(columns='p1_median_income_category')

# X_train, X_test, y_train, y_test
X_train = df_strat_train.drop(columns='median_house_value')
y_train = df_strat_train['median_house_value'].copy()
X_test = df_strat_test.drop(columns='median_house_value')
y_test = df_strat_test['median_house_value'].copy()

In [5]:
# For testing
# display(X_train.describe())
# display(X_test.describe())

1 - Try a Support Vector Machine regressor (sklearn.svm.SVR), with various hyperparameters such as kernel="linear" (with various values for the C hyperparameter) or kernel="rbf" (with various values for the C and gamma hyperparameters). Don't worry about what these hyperparameters mean for now. How does the best SVR predictor perform?

In [6]:
# Candidate hyperparameters: one sub-grid per kernel.
# The linear kernel only varies C; the RBF kernel varies both C and gamma.
linear_grid = {
    'kernel': ['linear'],
    'C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.0],
}
rbf_grid = {
    'kernel': ['rbf'],
    'C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],
    'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0],
}
param_grid = [linear_grid, rbf_grid]

# Exhaustive 5-fold cross-validated search, scored by negative MSE.
model1 = SVR()
grid_search = GridSearchCV(
    estimator=model1,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='scale', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid=[{'C': [10.0, 30.0, 100.0, 300.0, 1000.0, 3000.0,
                                10000.0, 30000.0],
                          'kernel': ['linear']},
                         {'C': [1.0, 3.0, 10.0, 30.0, 100.0, 300.0, 1000.0],
                          'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0],
                          'kernel': ['rbf']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

Note: Training took about 41 minutes.

In [14]:
# Show the estimator (with its hyperparameters) selected by the grid search.
print(grid_search.best_estimator_)

SVR(C=30000.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)


In [15]:
# Evaluate the estimator selected by the grid search on the held-out test set.
final_model = grid_search.best_estimator_
y_predict_test = final_model.predict(X_test)
final_mse = mean_squared_error(y_test, y_predict_test)
# Take the square root so the error (RMSE) is in the same units as the target.
final_rmse = np.sqrt(final_mse)
print(final_rmse)

68403.56222721853


In ex. 3, the RMSE score to beat is `48300.38643303764`. The best SVR model found by grid search has a test-set RMSE of `68403.56222721853`, so it performs worse than the other models.

The linear kernel is preferred over the RBF kernel here. The selected value of `C` (30000) is the largest value in the grid, so it is worth running the search again with even larger values of `C`.

2 - Try replacing GridSearchCV with RandomizedSearchCV.

In [10]:
# Sampling distributions for the randomized search: the kernel is picked from
# a list, while C and gamma are drawn from continuous scipy distributions.
param_distribs = dict(
    kernel=['linear', 'rbf'],
    C=reciprocal(20, 200000),
    gamma=expon(scale=1.0),
)

# 50 sampled candidates, each scored by 5-fold cross-validation (negative MSE).
# random_state fixes the sampled candidates; verbose=2 logs every fit.
svr_reg = SVR()
rdm_search = RandomizedSearchCV(
    estimator=svr_reg,
    param_distributions=param_distribs,
    n_iter=50,
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=0,
    verbose=2,
)
rdm_search.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] C=3135.3354391012117, gamma=1.2559307629658378, kernel=rbf ......


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  C=3135.3354391012117, gamma=1.2559307629658378, kernel=rbf, total=   8.3s
[CV] C=3135.3354391012117, gamma=1.2559307629658378, kernel=rbf ......


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.3s remaining:    0.0s


[CV]  C=3135.3354391012117, gamma=1.2559307629658378, kernel=rbf, total=   8.1s
[CV] C=3135.3354391012117, gamma=1.2559307629658378, kernel=rbf ......
[CV]  C=3135.3354391012117, gamma=1.2559307629658378, kernel=rbf, total=   8.0s
[CV] C=3135.3354391012117, gamma=1.2559307629658378, kernel=rbf ......
[CV]  C=3135.3354391012117, gamma=1.2559307629658378, kernel=rbf, total=   8.0s
[CV] C=3135.3354391012117, gamma=1.2559307629658378, kernel=rbf ......
[CV]  C=3135.3354391012117, gamma=1.2559307629658378, kernel=rbf, total=   8.0s
[CV] C=54052.08688038619, gamma=1.8789640641973517, kernel=rbf .......
[CV]  C=54052.08688038619, gamma=1.8789640641973517, kernel=rbf, total=  31.2s
[CV] C=54052.08688038619, gamma=1.8789640641973517, kernel=rbf .......
[CV]  C=54052.08688038619, gamma=1.8789640641973517, kernel=rbf, total=  25.6s
[CV] C=54052.08688038619, gamma=1.8789640641973517, kernel=rbf .......
[CV]  C=54052.08688038619, gamma=1.8789640641973517, kernel=rbf, total=  28.1s
[CV] C=54052.0868

[CV]  C=21651.72452650127, gamma=0.11194616148396284, kernel=rbf, total=   7.9s
[CV] C=21651.72452650127, gamma=0.11194616148396284, kernel=rbf ......
[CV]  C=21651.72452650127, gamma=0.11194616148396284, kernel=rbf, total=   8.0s
[CV] C=21651.72452650127, gamma=0.11194616148396284, kernel=rbf ......
[CV]  C=21651.72452650127, gamma=0.11194616148396284, kernel=rbf, total=   8.0s
[CV] C=21651.72452650127, gamma=0.11194616148396284, kernel=rbf ......
[CV]  C=21651.72452650127, gamma=0.11194616148396284, kernel=rbf, total=   8.0s
[CV] C=21651.72452650127, gamma=0.11194616148396284, kernel=rbf ......
[CV]  C=21651.72452650127, gamma=0.11194616148396284, kernel=rbf, total=   8.0s
[CV] C=228.69277486944384, gamma=1.488254838726735, kernel=rbf .......
[CV]  C=228.69277486944384, gamma=1.488254838726735, kernel=rbf, total=   8.1s
[CV] C=228.69277486944384, gamma=1.488254838726735, kernel=rbf .......
[CV]  C=228.69277486944384, gamma=1.488254838726735, kernel=rbf, total=   8.1s
[CV] C=228.69277

[CV]  C=137033.60093579383, gamma=1.0578264077757573, kernel=linear, total=  34.0s
[CV] C=88.36251475805093, gamma=1.0587427249064156, kernel=linear ....
[CV]  C=88.36251475805093, gamma=1.0587427249064156, kernel=linear, total=   5.6s
[CV] C=88.36251475805093, gamma=1.0587427249064156, kernel=linear ....
[CV]  C=88.36251475805093, gamma=1.0587427249064156, kernel=linear, total=   5.8s
[CV] C=88.36251475805093, gamma=1.0587427249064156, kernel=linear ....
[CV]  C=88.36251475805093, gamma=1.0587427249064156, kernel=linear, total=   5.7s
[CV] C=88.36251475805093, gamma=1.0587427249064156, kernel=linear ....
[CV]  C=88.36251475805093, gamma=1.0587427249064156, kernel=linear, total=   5.8s
[CV] C=88.36251475805093, gamma=1.0587427249064156, kernel=linear ....
[CV]  C=88.36251475805093, gamma=1.0587427249064156, kernel=linear, total=   6.4s
[CV] C=908.8753995450545, gamma=0.6437809962046902, kernel=linear ....
[CV]  C=908.8753995450545, gamma=0.6437809962046902, kernel=linear, total=   6.2s

[CV]  C=374.0784700353696, gamma=0.5348843862443556, kernel=rbf, total=   9.1s
[CV] C=147.33928098674735, gamma=0.8328435182783498, kernel=linear ...
[CV]  C=147.33928098674735, gamma=0.8328435182783498, kernel=linear, total=   6.3s
[CV] C=147.33928098674735, gamma=0.8328435182783498, kernel=linear ...
[CV]  C=147.33928098674735, gamma=0.8328435182783498, kernel=linear, total=   6.3s
[CV] C=147.33928098674735, gamma=0.8328435182783498, kernel=linear ...
[CV]  C=147.33928098674735, gamma=0.8328435182783498, kernel=linear, total=   6.6s
[CV] C=147.33928098674735, gamma=0.8328435182783498, kernel=linear ...
[CV]  C=147.33928098674735, gamma=0.8328435182783498, kernel=linear, total=   6.4s
[CV] C=147.33928098674735, gamma=0.8328435182783498, kernel=linear ...
[CV]  C=147.33928098674735, gamma=0.8328435182783498, kernel=linear, total=   6.3s
[CV] C=230.45596806057443, gamma=0.740758951587447, kernel=rbf .......
[CV]  C=230.45596806057443, gamma=0.740758951587447, kernel=rbf, total=   9.1s
[

[CV]  C=7189.758461461739, gamma=2.2215069574902953, kernel=rbf, total=  10.1s
[CV] C=7189.758461461739, gamma=2.2215069574902953, kernel=rbf .......
[CV]  C=7189.758461461739, gamma=2.2215069574902953, kernel=rbf, total=  10.1s
[CV] C=1228.942348968392, gamma=1.873459921263142, kernel=rbf ........
[CV]  C=1228.942348968392, gamma=1.873459921263142, kernel=rbf, total=   9.4s
[CV] C=1228.942348968392, gamma=1.873459921263142, kernel=rbf ........
[CV]  C=1228.942348968392, gamma=1.873459921263142, kernel=rbf, total=   9.4s
[CV] C=1228.942348968392, gamma=1.873459921263142, kernel=rbf ........
[CV]  C=1228.942348968392, gamma=1.873459921263142, kernel=rbf, total=   9.3s
[CV] C=1228.942348968392, gamma=1.873459921263142, kernel=rbf ........
[CV]  C=1228.942348968392, gamma=1.873459921263142, kernel=rbf, total=   9.6s
[CV] C=1228.942348968392, gamma=1.873459921263142, kernel=rbf ........
[CV]  C=1228.942348968392, gamma=1.873459921263142, kernel=rbf, total=   9.4s
[CV] C=58.322701751587935,

[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed: 55.2min finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                                 epsilon=0.1, gamma='scale', kernel='linear',
                                 max_iter=-1, shrinking=True, tol=0.001,
                                 verbose=False),
                   iid='deprecated', n_iter=50, n_jobs=None,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x11ea76f28>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x11ea762b0>,
                                        'kernel': ['linear', 'rbf']},
                   pre_dispatch='2*n_jobs', random_state=0, refit=True,
                   return_train_score=False, scoring='neg_mean_squared_error',
                   verbose=2)

Note: training took 57 minutes

In [16]:
# Report the selected estimator and its hyperparameters, separated by a blank line.
print(rdm_search.best_estimator_, rdm_search.best_params_, sep='\n\n')
# Optional: uncomment to also list the cross-validated RMSE of every candidate.
# print()
# cvs = rdm_search.cv_results_
# for mean_score, params in zip(cvs['mean_test_score'], cvs['params']):
#     print('{:.1f}'.format(np.sqrt(-mean_score)), params)

SVR(C=161015.56330174036, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma=0.6323366026653814, kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

{'C': 161015.56330174036, 'gamma': 0.6323366026653814, 'kernel': 'rbf'}


In [21]:
# Evaluate the estimator selected by the randomized search on the held-out test set.
# NOTE: this rebinds final_model / final_rmse from the grid-search evaluation cell.
final_model = rdm_search.best_estimator_
y_predict_test = final_model.predict(X_test)
final_mse = mean_squared_error(y_test, y_predict_test)
# Take the square root so the error (RMSE) is in the same units as the target.
final_rmse = np.sqrt(final_mse)
print(final_rmse)

55732.58742879294


The best model found earlier with grid search had a test-set RMSE of `68403.56222721853`. The model found with randomized search has an RMSE of `55732.58742879294`, which is better. It looks like the RBF kernel is only preferred when the value of `C` is large (here `C` ≈ 161016).

**References:**

Python for Data Analysis, 2nd Edition, McKinney (2017)