In [2]:
import numpy as np
from sklearn import datasets
from sklearn.linear_model import Ridge
from sklearn.grid_search import GridSearchCV
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline



In [3]:
dataset = datasets.load_diabetes()

In [55]:
dataset

{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
          0.01990842, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
         -0.06832974, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
          0.00286377, -0.02593034],
        ..., 
        [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
         -0.04687948,  0.01549073],
        [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
          0.04452837, -0.02593034],
        [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
         -0.00421986,  0.00306441]]),
 'target': array([ 151.,   75.,  141.,  206.,  135.,   97.,  138.,   63.,  110.,
         310.,  101.,   69.,  179.,  185.,  118.,  171.,  166.,  144.,
          97.,  168.,   68.,   49.,   68.,  245.,  184.,  202.,  137.,
          85.,  131.,  283.,  129.,   59.,  341.,   87.,   65.,  102.,
         265.,  276.,  252.,   90.,  100.,   55.,   61.,   92.,  259.,

In [5]:
dataset['data'].shape

(442, 10)

In [6]:
alphas = np.array([100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0])

In [7]:
alphas

array([  1.00000000e+02,   1.00000000e+01,   1.00000000e+00,
         1.00000000e-01,   1.00000000e-02,   1.00000000e-03,
         1.00000000e-04,   0.00000000e+00])

In [14]:
model = Ridge()  ## instantiate model as object
model

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [53]:
grid = GridSearchCV(estimator=model,
                   param_grid={'alpha':alphas,
                               'fit_intercept': [True, False], ## can put other parameters into this dictionary
                               'normalize': [True,False]},
                    verbose=10   ## gives more detail in output
                    )



In [54]:
grid.fit(dataset.data, dataset.target)
print(grid)    ## tells what all of the parameters of the grid are

Fitting 3 folds for each of 32 candidates, totalling 96 fits
[CV] normalize=True, alpha=100.0, fit_intercept=True .................
[CV]  normalize=True, alpha=100.0, fit_intercept=True, score=0.020139 -   0.0s
[CV] normalize=True, alpha=100.0, fit_intercept=True .................
[CV]  normalize=True, alpha=100.0, fit_intercept=True, score=0.026828 -   0.0s
[CV] normalize=True, alpha=100.0, fit_intercept=True .................
[CV]  normalize=True, alpha=100.0, fit_intercept=True, score=0.023940 -   0.0s
[CV] normalize=False, alpha=100.0, fit_intercept=True ................
[CV]  normalize=False, alpha=100.0, fit_intercept=True, score=0.011642 -   0.0s
[CV] normalize=False, alpha=100.0, fit_intercept=True ................
[CV]  normalize=False, alpha=100.0, fit_intercept=True, score=0.018250 -   0.0s
[CV] normalize=False, alpha=100.0, fit_intercept=True ................
[CV]  normalize=False, alpha=100.0, fit_intercept=True, score=0.013910 -   0.0s
[CV] normalize=True, alpha=100.0, fi

[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done   7 tasks       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  12 tasks       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  17 tasks       | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done  24 tasks       | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done  31 tasks       | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done  40 tasks       | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done  60 tasks       | elapsed:    0.2s


[CV]  normalize=True, alpha=0.001, fit_intercept=False, score=-3.285892 -   0.0s
[CV] normalize=True, alpha=0.001, fit_intercept=False ................
[CV]  normalize=True, alpha=0.001, fit_intercept=False, score=-3.620773 -   0.0s
[CV] normalize=False, alpha=0.001, fit_intercept=False ...............
[CV]  normalize=False, alpha=0.001, fit_intercept=False, score=-4.092060 -   0.0s
[CV] normalize=False, alpha=0.001, fit_intercept=False ...............
[CV]  normalize=False, alpha=0.001, fit_intercept=False, score=-3.285892 -   0.0s
[CV] normalize=False, alpha=0.001, fit_intercept=False ...............
[CV]  normalize=False, alpha=0.001, fit_intercept=False, score=-3.620773 -   0.0s
[CV] normalize=True, alpha=0.0001, fit_intercept=True ................
[CV]  normalize=True, alpha=0.0001, fit_intercept=True, score=0.469346 -   0.0s
[CV] normalize=True, alpha=0.0001, fit_intercept=True ................
[CV]  normalize=True, alpha=0.0001, fit_intercept=True, score=0.487330 -   0.0s
[CV] n

[Parallel(n_jobs=1)]: Done  71 tasks       | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done  84 tasks       | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done  96 out of  96 | elapsed:    0.3s finished


In [46]:
print(grid.best_score_) ## returns the score of the best model

0.488914116941


In [47]:
print(grid.best_estimator_.alpha)  ## returns the best alpha

0.1


In [48]:
grid.best_estimator_.fit_intercept

True

In [49]:
grid.get_params

<bound method GridSearchCV.get_params of GridSearchCV(cv=None, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'normalize': [True, False], 'alpha': array([  1.00000e+02,   1.00000e+01,   1.00000e+00,   1.00000e-01,
         1.00000e-02,   1.00000e-03,   1.00000e-04,   0.00000e+00]), 'fit_intercept': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=10)>

In [50]:
best_model = grid.best_estimator_

In [51]:
best_model.fit(dataset.data, dataset.target)

Ridge(alpha=0.10000000000000001, copy_X=True, fit_intercept=True,
   max_iter=None, normalize=True, random_state=None, solver='auto',
   tol=0.001)

In [52]:
best_model.score(dataset.data, dataset.target) ## fits best model to dataset

0.51256297679609986

In [41]:
best_model.predict(dataset.data)  ## these are the predictions for the best model

array([ 205.80757936,   68.34874929,  176.57699985,  166.49798892,
        128.40964698,  106.25462462,   74.50309843,  119.33026934,
        158.9085443 ,  213.58399963,   97.54754149,   96.22454284,
        115.01053514,  164.31651367,  102.90296779,  176.68742712,
        211.37648874,  182.72995776,  147.79266309,  123.76461899,
        120.08956875,   86.48753745,  113.73716038,  255.06633073,
        165.23635806,  147.34683255,   97.04138567,  178.90987852,
        128.86078344,  184.57407373,  158.77444088,   69.48060683,
        260.69924608,  112.04487131,   78.67580976,   87.11642963,
        207.71318951,  157.39968146,  241.01705926,  136.76582605,
        153.46414027,   74.10155505,  145.34461129,   77.9830855 ,
        220.5638027 ,  125.06029696,  142.24244785,  109.17211779,
         73.72726192,  189.3090673 ,  157.87301908,  169.24945771,
        134.09305973,  157.71893916,  139.35332196,   72.92995428,
        207.41134515,   80.0683223 ,  103.00604013,  134.50634