# Lab - Regularization

## Week 4 Monday 31st October

In [1]:
## TASK: Regularized regression
## FUNCTIONS: Ridge, RidgeCV, Lasso, LassoCV
## DOCUMENTATION: http://scikit-learn.org/stable/modules/linear_model.html
## DATA: Crime (n=319 non-null, p=122, type=regression)
## DATA DICTIONARY: http://archive.ics.uci.edu/ml/datasets/Communities+and+Crime

## This data set contains data on violent crimes within a community.

########## Prepare data ##########
import pandas as pd

# train_test_split lives in sklearn.model_selection from scikit-learn 0.18 on,
# and sklearn.cross_validation was removed in 0.20. Try the new location first
# and fall back to the old one so the cell runs on either side of the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Read in data (no header row; '?' marks a missing value), remove the first
# five columns (categorical features), then remove rows with missing values.
# dropna() is chained instead of called with inplace=True so the frame is
# never mutated after it was sliced; run crime.head() in its own cell to
# preview the result.
crime = (
    pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data', header=None, na_values=['?'])
    .iloc[:, 5:]
    .dropna()
)

# define X and y: the last column is the target, everything before it is a feature
X = crime.iloc[:, :-1]
y = crime.iloc[:, -1]

# split into train/test (fixed random_state keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


In [5]:
# How many feature columns does X have?
# (second entry of the frame's (rows, columns) shape)
X.shape[1]

122

In [10]:
########## Linear Regression Model Without Regularization ##########
from sklearn.linear_model import LinearRegression

# Plain least-squares baseline. fit() returns the estimator itself, so the
# model can be built and trained in one expression.
lm = LinearRegression().fit(X_train, y_train)

# What are the numbers stored in lm.coef_?
# Count how many of them there are:
len(lm.coef_)

122

In [11]:
# make predictions and evaluate
import numpy as np
from sklearn import metrics
preds = lm.predict(X_test)
print 'RMSE (no regularization) =', np.sqrt(metrics.mean_squared_error(y_test, preds))

RMSE (no regularization) = 0.233813676495


In [14]:
########## Ridge Regression Model ##########
# ridge regression (alpha must be positive, larger means more regularization)
from sklearn.linear_model import Ridge
rreg = Ridge(alpha=0.1, normalize=True)
rreg.fit(X_train, y_train)
rreg.coef_
preds = rreg.predict(X_test)
print 'RMSE (Ridge reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))
# Is this model better? Why?

RMSE (Ridge reg.) = 0.164279068049


In [16]:
# use RidgeCV to select best alpha CROSS VALIDATION, try a variety of alphas and pick the best one
from sklearn.linear_model import RidgeCV
alpha_range = 10.**np.arange(-2, 3)
rregcv = RidgeCV(normalize=True, scoring='mean_squared_error', alphas=alpha_range)
#rregcv = RidgeCV(normalize=True, scoring='mean_squared_error', alphas=alpha_range, cv=5) Specify 5 different alphas
rregcv.fit(X_train, y_train)
rregcv.alpha_
preds = rregcv.predict(X_test)
print 'RMSE (Ridge CV reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))
# What is the range of alpha values we are searching over?
#alpha_range -> show range of alphas used
#rregcv.alpha_ -> tells you which alpha was best

RMSE (Ridge CV reg.) = 0.163129782343


1.0

In [17]:
########## Lasso Regression Model ##########
# lasso (alpha must be positive, larger means more regularization)
from sklearn.linear_model import Lasso
las = Lasso(alpha=0.01, normalize=True)
las.fit(X_train, y_train)
las.coef_
preds = las.predict(X_test)
print 'RMSE (Lasso reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))

RMSE (Lasso reg.) = 0.198165225429


In [18]:
# try a smaller alpha
las = Lasso(alpha=0.0001, normalize=True)
las.fit(X_train, y_train)
las.coef_
preds = las.predict(X_test)
print 'RMSE (Lasso reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))


RMSE (Lasso reg.) = 0.164502413721


In [35]:
# use LassoCV to select best alpha (tries 100 alphas by default)
from sklearn.linear_model import LassoCV
alpha_range = 0.01**np.arange(-2, 3)
lascv = LassoCV(normalize=True, alphas=alpha_range)
#lascv = LassoCV(normalize=True, alphas=alpha_range, cv=5)
lascv.fit(X_train, y_train)
lascv.alpha_
lascv.coef_
preds = lascv.predict(X_test)
print 'RMSE (Lasso CV reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))
lascv.alpha_
#lascv.coef_

RMSE (Lasso CV reg.) = 0.164502413721


0.0001

### Look up [Elastic Net](http://scikit-learn.org/stable/modules/linear_model.html#elastic-net) and complete the following.



1. What is elastic net?
2. How does it work?
3. Run elastic net on the above dataset

In [61]:
from sklearn.linear_model import ElasticNet
enet = ElasticNet(alpha=0.01, l1_ratio=0.6)
enet.fit(X_train, y_train)
enet.coef_
preds = enet.predict(X_test)
print 'RMSE (Elastic Net reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))
enet.coef_

RMSE (Elastic Net reg.) = 0.160764933029


array([ 0.        ,  0.        ,  0.09691076, -0.21215794,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        , -0.        , -0.        , -0.        ,
       -0.        ,  0.        ,  0.05902445, -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
        0.        , -0.        ,  0.        ,  0.07594937,  0.        ,
        0.        , -0.        ,  0.        , -0.        , -0.        ,
        0.        ,  0.        , -0.        ,  0.04466404,  0.        ,
        0.        ,  0.01490726,  0.        , -0.06309985, -0.1350178 ,
       -0.03912442, -0.        , -0.        , -0.        ,  0.        ,
        0.14438606,  0.        , -0.        , -0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
       -0.        ,  0.        ,  0.        ,  0.        , -0.        ,
       -0.        ,  0.        , -0.        ,  0.00923399,  0.  