In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
from sklearn.datasets import fetch_california_housing

In [3]:
# Load the California housing dataset; later cells read cali.data, cali.feature_names and cali.target.
cali = fetch_california_housing()

In [4]:
# Wrap the raw feature matrix in a DataFrame labelled with the dataset's feature names.
df = pd.DataFrame(data=cali.data, columns=cali.feature_names)

In [5]:
# Preview the first rows to confirm the feature columns loaded as expected.
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [6]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV, Ridge, Lasso, ElasticNet
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.preprocessing import StandardScaler

In [7]:
# Hold out a test split, using only two features (HouseAge, AveBedrms) as predictors.
# A fixed random_state makes the split -- and every statistic fitted on it -- reproducible
# under Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    df[['HouseAge', 'AveBedrms']], cali.target, random_state=42
)

In [8]:
sscaler = StandardScaler()
sscaler.fit(X_train) # learns per-column mean_, var_ and scale_ (std dev) from the training split only

StandardScaler(copy=True, with_mean=True, with_std=True)

In [10]:
sscaler.mean_ # per-column training mean (HouseAge, AveBedrms); subtracted from each column to center it

array([28.56401809,  1.09641356])

In [11]:
sscaler.var_ # per-column training variance; scale_ is its square root

array([159.08776215,   0.20676351])

In [12]:
sscaler.scale_ # per-column training standard deviation; each centered column is divided by this

array([12.61299973,  0.45471256])

In [13]:
# Standardize the training features with the statistics the scaler learned from X_train.
X_train_scaled = sscaler.transform(X_train)

In [14]:
# Apply the same training-set statistics to the test split (transform only, no refit).
X_test_scaled = sscaler.transform(X_test)

In [15]:
# One cross-validated estimator per penalty type, all with default settings:
# Ridge, Lasso and ElasticNet.
ridgecv, lassocv, enetcv = RidgeCV(), LassoCV(), ElasticNetCV()

In [16]:
# Fit on the standardized training features; RidgeCV selects alpha_ from its `alphas` grid while fitting.
ridgecv.fit(X_train_scaled, y_train)

RidgeCV(alphas=array([ 0.1,  1. , 10. ]), cv=None, fit_intercept=True,
    gcv_mode=None, normalize=False, scoring=None, store_cv_values=False)

In [17]:
# NOTE(review): the value shown below (10.0) is the largest entry of the default grid (0.1, 1, 10),
# so a wider `alphas` grid may select a different value -- worth confirming.
ridgecv.alpha_ # Ridge penalty strength selected by RidgeCV

10.0

In [18]:
# Build a plain Ridge with the alpha RidgeCV selected rather than a hard-coded copy of it,
# so this cell stays correct if the train/test split or the alpha grid changes.
ridge_best = Ridge(alpha=ridgecv.alpha_)

In [23]:
# Cross-validate the tuned Ridge model. `scoring` accepts any scorer name, e.g. 'r2',
# 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_median_absolute_error'
# (error metrics are negated so that a higher score is always better).
# return_train_score=True is needed on scikit-learn >= 0.21 for 'train_score' to appear in the result.
# NOTE(review): X_train_scaled was standardized with statistics from all training rows, so each
# fold's validation rows influenced the scaling; a Pipeline(scaler + model) on X_train avoids that.
cross_validate(ridge_best, X_train_scaled, y_train, scoring='r2', cv=5, return_train_score=True)



{'fit_time': array([0.00314403, 0.00241899, 0.00268626, 0.00219274, 0.00226283]),
 'score_time': array([0.00070906, 0.00067592, 0.00071192, 0.00054026, 0.00068712]),
 'test_score': array([0.01020229, 0.0080187 , 0.0138612 , 0.01231318, 0.0155302 ]),
 'train_score': array([0.01378763, 0.01384988, 0.01270479, 0.01319786, 0.01220097])}

In [24]:
# Display every string accepted by the `scoring=` parameter.
# Import the submodule explicitly: a bare `import sklearn` does not guarantee sklearn.metrics is loaded.
import sklearn.metrics

# `SCORERS` was removed in scikit-learn 1.3; prefer get_scorer_names() when the installed
# version provides it and fall back to the legacy dict otherwise.
if hasattr(sklearn.metrics, "get_scorer_names"):
    scorer_names = sklearn.metrics.get_scorer_names()
else:
    scorer_names = sklearn.metrics.SCORERS.keys()
scorer_names

dict_keys(['explained_variance', 'r2', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'accuracy', 'roc_auc', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'brier_score_loss', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted'])