# Lab - Regularization

## Week 4 Monday 31st October

In [9]:
## TASK: Regularized regression
## FUNCTIONS: Ridge, RidgeCV, Lasso, LassoCV
## DOCUMENTATION: http://scikit-learn.org/stable/modules/linear_model.html
## DATA: Crime (n=319 non-null, p=122, type=regression)
## DATA DICTIONARY: http://archive.ics.uci.edu/ml/datasets/Communities+and+Crime

## This data set contains data on violent crimes within a community.

########## Prepare data ##########
# Read the raw file (no header row, '?' marks a missing value), drop the
# first five columns (the categorical features), then drop every row that
# still has a missing value. Built as one chained expression so re-running
# the cell always starts from the raw file and gives the same result.
import pandas as pd
CRIME_URL = 'http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data'
crime = (
    pd.read_csv(CRIME_URL, header=None, na_values=['?'])
    .iloc[:, 5:]
    .dropna()
)

# define X and y: every column except the last is a feature, the last is the target
X = crime.iloc[:, :-1]
y = crime.iloc[:, -1]

# split into train/test
# train_test_split lives in sklearn.model_selection in newer scikit-learn
# releases; fall back to the old sklearn.cross_validation location otherwise.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


In [12]:
# How many columns are in X?
# X.shape is (rows, columns), so the second entry is the column count.
X.shape[1]

122

In [15]:
########## Linear Regression Model Without Regularization ##########
# Plain linear regression: nothing penalises the size of the coefficients.
from sklearn.linear_model import LinearRegression
# Build the estimator and fit it to the training data in one step
# (fit() hands back the fitted estimator).
lm = LinearRegression().fit(X_train, y_train)
# What are these numbers? They are the fitted betas/coefficients.
lm.coef_

array([ -3.66188167e+00,   6.98124465e-01,  -2.61955467e-01,
        -2.85270027e-01,  -1.64740837e-01,   2.46972333e-01,
        -1.09290051e+00,  -5.96857796e-01,   1.11200239e+00,
        -7.21968931e-01,   4.27346598e+00,  -2.28040268e-01,
         8.04875769e-01,  -2.57934732e-01,  -2.63458023e-01,
        -1.04616958e+00,   6.07784197e-01,   7.73552561e-01,
         5.96468029e-02,   6.90215922e-01,   2.16759430e-02,
        -4.87802949e-01,  -5.18858404e-01,   1.39478815e-01,
        -1.24417942e-01,   3.15003821e-01,  -1.52633736e-01,
        -9.65003927e-01,   1.17142163e+00,  -3.08546690e-02,
        -9.29085548e-01,   1.24654586e-01,   1.98104506e-01,
         7.30804821e-01,  -1.77337294e-01,   8.32927588e-02,
         3.46045601e-01,   5.01837338e-01,   1.57062958e+00,
        -4.13478807e-01,   1.39350802e+00,  -3.49428114e+00,
         7.09577818e-01,  -8.32141352e-01,  -1.39984927e+00,
         1.02482840e+00,   2.13855006e-01,  -6.18937325e-01,
         5.28954490e-01,

In [16]:
# make predictions and evaluate
import numpy as np
from sklearn import metrics
preds = lm.predict(X_test)
# Root mean squared error: square the errors, find their average, take the square root.
rmse_lm = np.sqrt(metrics.mean_squared_error(y_test, preds))
# Single-argument parenthesised print: same output as the Python 2 print
# statement, and it also runs on Python 3.
print('RMSE (no regularization) = %s' % rmse_lm)

RMSE (no regularization) = 0.233813676495


In [17]:
########## Ridge Regression Model ##########
# ridge regression (alpha must be positive; a larger alpha means more
# regularization, i.e. a stronger shrinking effect on the coefficients)
from sklearn.linear_model import Ridge
rreg = Ridge(alpha=0.1, normalize=True)
rreg.fit(X_train, y_train)
preds = rreg.predict(X_test)
rmse_ridge = np.sqrt(metrics.mean_squared_error(y_test, preds))
# Single-argument parenthesised print: same output as the Python 2 print
# statement, and it also runs on Python 3.
print('RMSE (Ridge reg.) = %s' % rmse_ridge)
# Is this model better? Why?
# Yes - the RMSE drops by a big jump, meaning the features that were causing
# the model not to fit as well have been suppressed.

RMSE (Ridge reg.) = 0.164279068049


In [18]:
len(rreg.coef_)  # count the ridge coefficients: stays the same, as ridge only reduces the coefficient (effect) of features, it does not remove features

122

In [22]:
# use RidgeCV to select the best alpha - including cross validation
from sklearn.linear_model import RidgeCV
# What is the range of alpha values we are searching over?
# Five powers of ten, 10**-2 through 10**2: 0.01, 0.1, 1, 10, 100.
alpha_range = 10.**np.arange(-2, 3)
# cv=5 is k-fold cross validation with k=5 - 25 models, as 5 alphas & 5 CV folds.
# NOTE(review): newer scikit-learn releases name this scorer
# 'neg_mean_squared_error' - confirm against the installed version.
rregcv = RidgeCV(normalize=True, scoring='mean_squared_error', alphas=alpha_range, cv=5)
rregcv.fit(X_train, y_train)
preds = rregcv.predict(X_test)
rmse_ridge_cv = np.sqrt(metrics.mean_squared_error(y_test, preds))
# Single-argument parenthesised print: same output as the Python 2 print
# statement, and it also runs on Python 3.
print('RMSE (Ridge CV reg.) = %s' % rmse_ridge_cv)
# The alpha that cross validation selected (displayed as the cell output).
rregcv.alpha_

RMSE (Ridge CV reg.) = 0.160913596522


10.0

In [23]:
########## Lasso Regression Model ##########
# lasso (alpha must be positive, larger means more regularization); it can
# take coefficients all the way to zero
from sklearn.linear_model import Lasso
las = Lasso(alpha=0.01, normalize=True)
las.fit(X_train, y_train)
preds = las.predict(X_test)
rmse_lasso = np.sqrt(metrics.mean_squared_error(y_test, preds))
# Single-argument parenthesised print: same output as the Python 2 print
# statement, and it also runs on Python 3.
print('RMSE (Lasso reg.) = %s' % rmse_lasso)
# Compare to ridge regression - ridge was better here: its RMSE is lower (closer to 0).

RMSE (Lasso reg.) = 0.198165225429


In [28]:
# try a smaller alpha
# (need a greater alpha in ridge to achieve the same thing as a small alpha in lasso)
las = Lasso(alpha=0.0001, normalize=True)
las.fit(X_train, y_train)
preds = las.predict(X_test)
rmse_lasso_small = np.sqrt(metrics.mean_squared_error(y_test, preds))
# Single-argument parenthesised print: same output as the Python 2 print
# statement, and it also runs on Python 3.
print('RMSE (Lasso reg.) = %s' % rmse_lasso_small)
# The stray `lascv.alpha_` lookup was removed: `lascv` is never defined in
# this notebook, so the cell failed with NameError on a fresh kernel.
las.coef_  # notice a number of coefficients have been reduced to 0

RMSE (Lasso reg.) = 0.164502413721


array([ 0.        ,  0.        ,  0.        , -0.00418625, -0.        ,
        0.09573169, -0.        , -0.        , -0.        ,  0.        ,
        0.        , -0.0308013 ,  0.        , -0.09352469, -0.08937229,
       -0.58628915,  0.        ,  0.16706299, -0.06130851,  0.11966402,
       -0.        , -0.        , -0.10046533, -0.01816658, -0.        ,
        0.19148624,  0.0281072 , -0.21042443,  0.        , -0.06733447,
       -0.30656472,  0.        ,  0.12856465, -0.        , -0.18235778,
        0.05534065,  0.14885766,  0.        ,  0.        ,  0.        ,
       -0.07395512, -0.        ,  0.16423116, -0.        , -0.3780258 ,
       -0.        , -0.        , -0.        ,  0.03341259, -0.        ,
        0.22040155, -0.02724104,  0.        , -0.10195256,  0.06214765,
        0.0274531 ,  0.        ,  0.        ,  0.        ,  0.        ,
       -0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        , -0.2690561 ,  0.        ,  0.        ,  0.35

In [None]:
#use gridsearch

### Look up [Elastic Net](http://scikit-learn.org/stable/modules/linear_model.html#elastic-net) and complete the following.



1. What is elastic net?
2. How does it work?
3. Run elastic net on the above dataset