# Lab - Regularization

## Week 4 Monday 31st October

In [11]:
## TASK: Regularized regression
## FUNCTIONS: Ridge, RidgeCV, Lasso, LassoCV
## DOCUMENTATION: http://scikit-learn.org/stable/modules/linear_model.html
## DATA: Crime (n=319 non-null, p=122, type=regression)
## DATA DICTIONARY: http://archive.ics.uci.edu/ml/datasets/Communities+and+Crime

## This data set contains data on violent crimes within a community.

########## Prepare data ##########
# read in data, remove categorical features, remove rows with missing values
import pandas as pd
crime = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data', header=None, na_values=['?'])
crime = crime.iloc[:, 5:]
crime.dropna(inplace=True)
crime.head()

# dropna (inplace=True) drops  null values
# define X and y
X = crime.iloc[:, :-1]
y = crime.iloc[:, -1]
#crime.iloc--> choosing x and y 

# split into train/test
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

#traintest split the data into four columns -> test, train, x and y


In [12]:
# How many columns are in X?
X.shape()

TypeError: 'tuple' object is not callable

In [14]:
########## Linear Regression Model Without Regularization ##########
# linear regression
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(X_train, y_train)
lm.coef_
# What are these numbers?

#Answer- The beta values
len(lm.coef_)



122

In [15]:
# make predictions and evaluate
import numpy as np
from sklearn import metrics
preds = lm.predict(X_test)
print 'RMSE (no regularization) =', np.sqrt(metrics.mean_squared_error(y_test, preds))
# RMSE is root mean square error.

RMSE (no regularization) = 0.233813676495


In [16]:
########## Ridge Regression Model ##########
# ridge regression (alpha must be positive, larger means more regularization)
from sklearn.linear_model import Ridge
rreg = Ridge(alpha=0.1, normalize=True)
rreg.fit(X_train, y_train)
rreg.coef_
preds = rreg.predict(X_test)
print 'RMSE (Ridge reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))
# Is this model better? Why?
#The effect of the insignificant betas has been reduced

RMSE (Ridge reg.) = 0.164279068049


In [34]:
# use RidgeCV to select best alpha
from sklearn.linear_model import RidgeCV
# CV is cross validation. it divided data into K sets.If k=5 , it is 5 models. You test the data 5 times and avergae it out.
alpha_range = 10.**np.arange(-2, 3)
rregcv = RidgeCV(normalize=True, scoring='mean_squared_error', alphas=alpha_range)
rregcv.fit(X_train, y_train)
rregcv.alpha_
preds = rregcv.predict(X_test)
print 'RMSE (Ridge CV reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))
# What is the range of alpha values we are searching over?
print alpha_range
rregcv.alpha_

np.arange(-2,3)
print rregcv.coef_

RMSE (Ridge CV reg.) = 0.163129782343
[  1.00000000e-02   1.00000000e-01   1.00000000e+00   1.00000000e+01
   1.00000000e+02]
[ -1.35479199e-03   3.66493623e-03   5.67246153e-02  -6.65596102e-02
   7.50154730e-03   3.73570277e-03   1.48458510e-02  -8.26212596e-03
  -9.14751985e-04   8.17782143e-03  -9.58221848e-04   3.99333039e-03
  -2.04774531e-02  -3.80310378e-02  -7.06311041e-02  -9.07995340e-02
   3.92365601e-03   3.68491166e-02  -2.35269424e-02  -1.36618143e-02
  -9.83437557e-03   1.30086791e-02  -3.43297706e-02  -5.04638755e-02
  -9.82883411e-04   7.47392898e-02   2.63572032e-02  -1.07987605e-02
   3.16035521e-02  -2.17283831e-02  -4.45588182e-03  -1.06490401e-02
   4.42829964e-02  -3.72944143e-02  -6.18713730e-02   3.20124805e-02
   5.85549588e-03  -1.23569409e-02   6.53560040e-02   3.46461301e-02
   6.00524147e-02   6.39805254e-02   2.58651194e-02  -6.73126020e-02
  -7.02669216e-02  -5.05555985e-02  -6.41318316e-02   8.24959798e-03
   9.27945661e-03   2.77399795e-03   5.2665016

In [37]:
########## Lasso Regression Model ##########
# lasso (alpha must be positive, larger means more regularization)
from sklearn.linear_model import Lasso
las = Lasso(alpha=10
            , normalize=True)
las.fit(X_train, y_train)
print las.coef_
preds = las.predict(X_test)
print 'RMSE (Lasso reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))

[ 0.  0.  0. -0. -0.  0.  0.  0.  0. -0.  0.  0. -0. -0. -0. -0.  0.  0.
 -0. -0. -0. -0. -0. -0. -0. -0. -0.  0.  0.  0.  0. -0.  0. -0. -0.  0.
  0. -0.  0.  0.  0.  0.  0. -0. -0. -0. -0. -0. -0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0. -0.  0.  0.  0.  0.  0.  0. -0.  0.  0. -0.  0.
 -0. -0.  0.  0. -0.  0.  0. -0. -0. -0. -0. -0. -0. -0.  0. -0.  0.  0.
  0.  0.  0. -0.  0.  0.  0.  0. -0.  0.  0.  0.  0.  0. -0. -0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. -0.  0.  0.  0.]
RMSE (Lasso reg.) = 0.242253664949


In [29]:
# try a smaller alpha
las = Lasso(alpha=.001, normalize=True)
las.fit(X_train, y_train)
las.coef_
preds = las.predict(X_test)
print 'RMSE (Lasso reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))
las.coef_

RMSE (Lasso reg.) = 0.160039024044


AttributeError: 'Lasso' object has no attribute 'alpha_'

In [31]:
# use LassoCV to select best alpha (tries 100 alphas by default)
from sklearn.linear_model import LassoCV
lascv = LassoCV(normalize=True, alphas=alpha_range,cv=5)
lascv.fit(X_train, y_train)
lascv.alpha_
lascv.coef_
preds = lascv.predict(X_test)
print 'RMSE (Lasso CV reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))
lascv.alpha_
lascv.coef_

RMSE (Lasso CV reg.) = 0.198165225429


0.01

In [None]:
# Some takeaways

# Ridge regression

### Lookup [Elastic Net](http://scikit-learn.org/stable/modules/linear_model.html#elastic-net) and complete the following.



1. What is elastic net?
2. How does it work?
3. Run elastic net on the above dataset