# Block 6 Exercise 2: finding the best parameters for predicting the fare of taxi rides
We return to our Random Forest Regression and want to automatically optimize all free parameters ...

In [1]:
import pandas as pd
import numpy as np
import folium


In [2]:
#check if notebook runs in colab
import sys
IN_COLAB = 'google.colab' in sys.modules
print('running in Colab:',IN_COLAB)
path='..'
if IN_COLAB:
  #in colab, we need to clone the data from the repo
  !git clone https://github.com/keuperj/DataScienceSS20.git
  path='DataScienceSS20'

running in Colab: True
Cloning into 'DataScienceSS20'...
remote: Enumerating objects: 586, done.[K
remote: Total 586 (delta 0), reused 0 (delta 0), pack-reused 586[K
Receiving objects: 100% (586/586), 130.66 MiB | 23.00 MiB/s, done.
Resolving deltas: 100% (240/240), done.
Checking out files: 100% (219/219), done.


In [4]:
# we load the data we have saved after wrangling and pre-processing in block I
X=pd.read_csv(path+'/DATA/train_cleaned.csv')
drop_columns=['Unnamed: 0','Unnamed: 0.1','Unnamed: 0.1.1','key','pickup_datetime','pickup_date','pickup_latitude_round3','pickup_longitude_round3','dropoff_latitude_round3','dropoff_longitude_round3']
X=X.drop(drop_columns,axis=1)
X=pd.get_dummies(X)# one hot coding
#generate labels
y=X['fare_amount']
X=X.drop(['fare_amount'],axis=1)

### Scikit Optimize
Scikit Optimize (https://scikit-optimize.github.io/stable/index.html) is a AutoML toolbox wrapped around Scikit-Learn. It allows us to use state-of-the-art automatic hyper-parameter optimization on top of our learning algorithms.   



In [5]:
# install 
!pip install scikit-optimize

Collecting scikit-optimize
[?25l  Downloading https://files.pythonhosted.org/packages/5c/87/310b52debfbc0cb79764e5770fa3f5c18f6f0754809ea9e2fc185e1b67d3/scikit_optimize-0.7.4-py2.py3-none-any.whl (80kB)
[K     |████                            | 10kB 18.4MB/s eta 0:00:01[K     |████████▏                       | 20kB 1.9MB/s eta 0:00:01[K     |████████████▎                   | 30kB 2.5MB/s eta 0:00:01[K     |████████████████▎               | 40kB 2.0MB/s eta 0:00:01[K     |████████████████████▍           | 51kB 2.4MB/s eta 0:00:01[K     |████████████████████████▌       | 61kB 2.7MB/s eta 0:00:01[K     |████████████████████████████▌   | 71kB 3.0MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 2.6MB/s 
[?25hCollecting pyaml>=16.9
  Downloading https://files.pythonhosted.org/packages/15/c4/1310a054d33abc318426a956e7d6df0df76a6ddfa9c66f6310274fb75d42/pyaml-20.4.0-py2.py3-none-any.whl
Installing collected packages: pyaml, scikit-optimize
Successfully installed 

### E 2.1 Bayesian Optimization of a Random Forest Regression Model
use Bayesian Optimization with Cross-Validation (https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html#skopt.BayesSearchCV) to find the best regression model. Compare
* linear regression (https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression) 
* Random Forest regression (https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor)
* and SVM regression (https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html#sklearn.svm.SVR)

NOTES: this can become quite compute intensive! Hence,
* use a smaller subset of the training data to run the experiments 
* think about the range of your parameters (e.g. larger number of trees in RF or high C-values in SMV will make models expensive)
* optimize only the following parameters per model type:
    * linear: no parameters to optimize
    * RF: #trees and depth
    * SVM: C and gamma (use RBF kernel)
* parallelize -> n_jobs
* use CoLab to rum the job for up to 12h 


In [36]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [13]:
# simple model only using Random Forest

X, y = load_iris(True)
X_train, X_test, y_train, y_test = train_test_split(X, y,train_size=0.75,random_state=0)

X_train=np.array(X_train)
X_test=np.array(X_test)
y_train=np.array(y_train)
y_test=np.array(y_test)

opt = BayesSearchCV(
    SVC(),
    {
        'C': Real(1e-6, 1e+6, prior='log-uniform'),
        'gamma': Real(1e-6, 1e+1, prior='log-uniform'),
        'degree': Integer(1,8),
        'kernel': Categorical(['linear', 'poly', 'rbf']),
     },
     n_iter=32,
     random_state=0
     )

opt.fit(X_train,y_train)



BayesSearchCV(cv=None, error_score='raise',
              estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                            class_weight=None, coef0=0.0,
                            decision_function_shape='ovr', degree=3,
                            gamma='scale', kernel='rbf', max_iter=-1,
                            probability=False, random_state=None,
                            shrinking=True, tol=0.001, verbose=False),
              fit_params=None, iid=True, n_iter=32, n_jobs=1, n_points=1,
              optimizer_kwargs=No...
              refit=True, return_train_score=False, scoring=None,
              search_spaces={'C': Real(low=1e-06, high=1000000.0, prior='log-uniform', transform='identity'),
                             'degree': Integer(low=1, high=8, prior='uniform', transform='identity'),
                             'gamma': Real(low=1e-06, high=10.0, prior='log-uniform', transform='identity'),
                             'kernel': Categorical(cate

In [15]:
opt.best_score_

0.9821428571428571

In [16]:
opt.best_params_

OrderedDict([('C', 1.3361910455737007),
             ('degree', 5),
             ('gamma', 0.11283439533114079),
             ('kernel', 'linear')])

In [38]:
pipe = Pipeline([
    ('model', SVR()) #just put one model as placeholder
])

In [46]:
# Linear Regression

from sklearn.linear_model import LinearRegression
lin_regr = {
  'model': Categorical([LinearRegression()]),
}


In [66]:
# Random Forest Regression

from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
#X, y = make_regression(n_features=4, n_informative=2, random_state=0, shuffle=False)
rf_regr = {
    'model': Categorical([RandomForestRegressor()]),
    'model__max_depth':2, 
    'model__random_state':0,
    'model__n_estimators': Integer(10, 100, 'log-uniform'),
}

In [67]:
# SVM Regression

from sklearn.preprocessing import StandardScaler
n_samples, n_features = 10, 5
rng = np.random.RandomState(0)
y_train = rng.randn(n_samples)
X_train = rng.randn(n_samples, n_features)
svm_regr = {
    'model':Categorical([SVR()]),
    'model__C':1.0, 
    'model__epsilon':0.2,
    'model': Categorical([SVR()]),
    'model__C': Real(1e-6, 1e+6, prior='log-uniform'),
    'model__gamma': Real(1e-6, 1e+1, prior='log-uniform'),
}

In [69]:
# model comparison
opt = BayesSearchCV(
    pipe,
    [(svm_regr, 10), (rf_regr, 10), (lin_regr,1)], 
    cv=5,
    n_iter=32, 
      random_state=0,
    #scoring='neg_mean_squared_error'
    )


ValueError: ignored