# Lab - Regularization

## Week 4 Monday 31st October

In [28]:
## TASK: Regularized regression
## FUNCTIONS: Ridge, RidgeCV, Lasso, LassoCV
## DOCUMENTATION: http://scikit-learn.org/stable/modules/linear_model.html
## DATA: Crime (n=319 non-null, p=122, type=regression)
## DATA DICTIONARY: http://archive.ics.uci.edu/ml/datasets/Communities+and+Crime

## This data set contains data on violent crimes within a community.

########## Prepare data ##########
# read in data, remove categorical features, remove rows with missing values
import pandas as pd
crime = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data', header=None, na_values=['?'])
crime = crime.iloc[:, 5:]
crime.dropna(inplace=True)
crime.head()

# define X and y
X = crime.iloc[:, :-1] #selection of columns, all the columns except the last one.
y = crime.iloc[:, -1]  #use the last column for y

# split into train/test
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


In [29]:
# How many columns are in X?

X.shape
len(X.columns)

122

In [31]:
########## Linear Regression Model Without Regularization ##########
# linear regression
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(X_train, y_train)
print lm.coef_
# What are these numbers?
#outputs are the betas
print lm.intercept_

[ -3.66188167e+00   6.98124465e-01  -2.61955467e-01  -2.85270027e-01
  -1.64740837e-01   2.46972333e-01  -1.09290051e+00  -5.96857796e-01
   1.11200239e+00  -7.21968931e-01   4.27346598e+00  -2.28040268e-01
   8.04875769e-01  -2.57934732e-01  -2.63458023e-01  -1.04616958e+00
   6.07784197e-01   7.73552561e-01   5.96468029e-02   6.90215922e-01
   2.16759430e-02  -4.87802949e-01  -5.18858404e-01   1.39478815e-01
  -1.24417942e-01   3.15003821e-01  -1.52633736e-01  -9.65003927e-01
   1.17142163e+00  -3.08546690e-02  -9.29085548e-01   1.24654586e-01
   1.98104506e-01   7.30804821e-01  -1.77337294e-01   8.32927588e-02
   3.46045601e-01   5.01837338e-01   1.57062958e+00  -4.13478807e-01
   1.39350802e+00  -3.49428114e+00   7.09577818e-01  -8.32141352e-01
  -1.39984927e+00   1.02482840e+00   2.13855006e-01  -6.18937325e-01
   5.28954490e-01   7.98294890e-02   5.93688560e-02  -1.68582667e-01
   7.31264051e-01  -1.39635208e+00   2.38507704e-01   5.50621439e-01
  -5.61447867e-01   6.18989764e-01

In [32]:
# make predictions and evaluate
import numpy as np
from sklearn import metrics
preds = lm.predict(X_test)
print 'RMSE (no regularization) =', np.sqrt(metrics.mean_squared_error(y_test, preds))

RMSE (no regularization) = 0.233813676495


In [35]:
########## Ridge Regression Model ##########
# ridge regression (alpha must be positive, larger means more regularization)
from sklearn.linear_model import Ridge
rreg = Ridge(alpha=0.1, normalize=True)
rreg.fit(X_train, y_train)
rreg.coef_
preds = rreg.predict(X_test)
print 'RMSE (Ridge reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))

print len(rreg.coef_)
# Is this model better? Why?

RMSE (Ridge reg.) = 0.164279068049
122


In [41]:
# use RidgeCV to select best alpha
from sklearn.linear_model import RidgeCV
alpha_range = 10.**np.arange(-2, 3)
#CV = indicates cross validation
rregcv = RidgeCV(normalize=True, scoring='mean_squared_error', alphas=alpha_range, cv =5)
rregcv.fit(X_train, y_train)
print rregcv.coef_
rregcv.alpha_
preds = rregcv.predict(X_test)
print 'RMSE (Ridge CV reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))
# What is the range of alpha values we are searching over?

[-0.00482658 -0.00351569  0.09529245 -0.0823936   0.00660951  0.00856796
  0.00346223 -0.00328266 -0.00123173  0.00902257 -0.00402771  0.00441976
 -0.01782851 -0.02577192 -0.02310209 -0.0520702   0.00543002  0.04366596
 -0.01337381 -0.01310416 -0.01011989  0.00281832 -0.01427201 -0.02025864
  0.00054812  0.02985664  0.01234123 -0.01895408  0.03093172 -0.01803204
 -0.00276177 -0.01203998  0.0423571  -0.02145591 -0.05248999  0.01987457
  0.00313914 -0.00854706  0.04677342  0.02470129  0.03504253  0.04250225
  0.01025187 -0.06264545 -0.06657827 -0.05464855 -0.05457575  0.00249165
  0.00280561 -0.00408854  0.08426757 -0.00455953 -0.00733961 -0.01298227
  0.0064014   0.00840808  0.00040815  0.00587591  0.01376979  0.01271106
 -0.00248731  0.00426768  0.0224114   0.0113809  -0.00924104  0.00358868
 -0.01503612 -0.01466317  0.02776629  0.04172352 -0.0053995   0.02570923
 -0.02142532 -0.01396027  0.02635     0.00272066  0.00415215  0.02936341
  0.00829837 -0.01217689 -0.0108787  -0.00649211 -0

In [43]:
########## Lasso Regression Model ##########
# lasso (alpha must be positive, larger means more regularization)
#using absolute values instead of squares for lasso regression
from sklearn.linear_model import Lasso
las = Lasso(alpha=0.01, normalize=True)
las.fit(X_train, y_train)
print las.coef_
preds = las.predict(X_test)
print 'RMSE (Lasso reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))

[ 0.          0.          0.         -0.03974695  0.          0.          0.
  0.          0.         -0.          0.          0.         -0.         -0.
 -0.         -0.         -0.          0.         -0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.          0.
  0.          0.          0.         -0.          0.         -0.         -0.
  0.          0.         -0.          0.          0.          0.          0.
  0.         -0.         -0.27503063 -0.         -0.         -0.         -0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.         -0.          0.          0.
  0.          0.          0.          0.         -0.          0.          0.
 -0.          0.         -0.         -0.          0.          0.         -0.
  0.          0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.          0.          0.         -0.          0.          0.

In [45]:
# try a smaller alpha
las = Lasso(alpha=0.0001, normalize=True)
las.fit(X_train, y_train)
print las.coef_
preds = las.predict(X_test)
print 'RMSE (Lasso reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))


[ 0.          0.          0.         -0.00418625 -0.          0.09573169
 -0.         -0.         -0.          0.          0.         -0.0308013   0.
 -0.09352469 -0.08937229 -0.58628915  0.          0.16706299 -0.06130851
  0.11966402 -0.         -0.         -0.10046533 -0.01816658 -0.
  0.19148624  0.0281072  -0.21042443  0.         -0.06733447 -0.30656472
  0.          0.12856465 -0.         -0.18235778  0.05534065  0.14885766
  0.          0.          0.         -0.07395512 -0.          0.16423116
 -0.         -0.3780258  -0.         -0.         -0.          0.03341259
 -0.          0.22040155 -0.02724104  0.         -0.10195256  0.06214765
  0.0274531   0.          0.          0.          0.         -0.          0.
  0.          0.          0.          0.         -0.2690561   0.          0.
  0.35611039  0.04332233  0.04845138  0.0257534   0.12236177 -0.00563988
  0.          0.          0.         -0.         -0.         -0.0261296  -0.
 -0.36526572  0.          0.          0.389

In [49]:
# use LassoCV to select best alpha (tries 100 alphas by default)
from sklearn.linear_model import LassoCV
alpha_range = 0.01**np.arange(-2, 3)
lascv = LassoCV(normalize=True, alphas=alpha_range)
lascv.fit(X_train, y_train)
lascv.alpha_
lascv.coef_
preds = lascv.predict(X_test)
print 'RMSE (Lasso CV reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))
lascv.coef_

RMSE (Lasso CV reg.) = 0.164502413721


array([ 0.        ,  0.        ,  0.        , -0.00418625, -0.        ,
        0.09573169, -0.        , -0.        , -0.        ,  0.        ,
        0.        , -0.0308013 ,  0.        , -0.09352469, -0.08937229,
       -0.58628915,  0.        ,  0.16706299, -0.06130851,  0.11966402,
       -0.        , -0.        , -0.10046533, -0.01816658, -0.        ,
        0.19148624,  0.0281072 , -0.21042443,  0.        , -0.06733447,
       -0.30656472,  0.        ,  0.12856465, -0.        , -0.18235778,
        0.05534065,  0.14885766,  0.        ,  0.        ,  0.        ,
       -0.07395512, -0.        ,  0.16423116, -0.        , -0.3780258 ,
       -0.        , -0.        , -0.        ,  0.03341259, -0.        ,
        0.22040155, -0.02724104,  0.        , -0.10195256,  0.06214765,
        0.0274531 ,  0.        ,  0.        ,  0.        ,  0.        ,
       -0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        , -0.2690561 ,  0.        ,  0.        ,  0.35

### Lookup [Elastic Net](http://scikit-learn.org/stable/modules/linear_model.html#elastic-net) and complete the following.



1. What is elastic net?
2. How does it work?
3. Run elastic net on the above dataset

In [57]:
from sklearn.linear_model import ElasticNet

ENet = ElasticNet(alpha=0.001, normalize = True)
ENet.fit(X_train, y_train)
ENet.coef_

ENetpreds = ENet.predict(X_test)
print 'RMSE (Lasso reg.) =', np.sqrt(metrics.mean_squared_error(y_test, ENetpreds))


RMSE (Lasso reg.) = 0.163708913832


In [63]:
from sklearn.linear_model import ElasticNetCV
alpha_range = 0.01**np.arange(-2, 3)
ENetCV = ElasticNetCV(n_alphas= 1000, normalize = True)
ENetCV.fit(X_train, y_train)
print ENetCV.coef_
print ENetCV.alpha_

ENetCVpreds = ENetCV.predict(X_test)
print 'RMSE (Lasso reg.) =', np.sqrt(metrics.mean_squared_error(y_test, ENetCVpreds))


[ 0.          0.          0.0667676  -0.1336344   0.          0.          0.
  0.          0.          0.          0.          0.         -0.         -0.
 -0.         -0.13697437  0.          0.03328816 -0.         -0.         -0.
 -0.         -0.         -0.         -0.          0.         -0.          0.
  0.07126256  0.          0.         -0.          0.03817361 -0.
 -0.02827555  0.          0.         -0.          0.05533971  0.
  0.04954886  0.06513415  0.         -0.09542832 -0.11277412 -0.04011909
 -0.05687071 -0.         -0.          0.          0.06050503  0.          0.
  0.          0.          0.          0.          0.          0.          0.
 -0.          0.          0.          0.          0.          0.          0.
 -0.          0.00266879  0.0784738  -0.          0.00032215 -0.         -0.
  0.00376984  0.          0.          0.          0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.          0.         -0.
 -0.          0.02320405  0