# Machine Learning: Regression (Module 2, week 5) - LASSO Regression
Keywords: LASSO regression, L1 Penalty, Coordinate Descent

In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import linear_model

In [8]:
# Explicit column -> dtype mapping handed to pd.read_csv, grouped by dtype so the
# schema can be scanned at a glance.
dtype_dict = {
    # floating-point columns
    'bathrooms': float,
    'bedrooms': float,
    'floors': float,
    'lat': float,
    'long': float,
    'price': float,
    'sqft_living': float,
    'sqft_living15': float,
    'sqft_lot15': float,
    # integer columns
    'condition': int,
    'grade': int,
    'sqft_above': int,
    'sqft_basement': int,
    'sqft_lot': int,
    'view': int,
    'waterfront': int,
    'yr_built': int,
    'yr_renovated': int,
    # columns kept as raw strings
    'date': str,
    'id': str,
    'zipcode': str,
}

In [9]:
# Load the full house-sales table, forcing the column dtypes declared in dtype_dict.
sales = pd.read_csv(
    'kc_house_data.csv',
    dtype=dtype_dict,
)

In [10]:
from math import log, sqrt

# Square-root terms of the two area columns.
for column in ('sqft_living', 'sqft_lot'):
    sales[column + '_sqrt'] = sales[column].apply(sqrt)

# Squared terms of the two count-like columns.
for column in ('bedrooms', 'floors'):
    sales[column + '_square'] = sales[column] * sales[column]

In [11]:
from sklearn import linear_model  # using scikit-learn

# Candidate inputs for the LASSO fit: raw columns plus the engineered
# square-root / squared terms added to `sales` earlier.
all_features = [
    'bedrooms', 'bedrooms_square',
    'bathrooms',
    'sqft_living', 'sqft_living_sqrt',
    'sqft_lot', 'sqft_lot_sqrt',
    'floors', 'floors_square',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
]

# NOTE(review): `normalize=` was deprecated in scikit-learn 1.0 and removed in 1.2;
# this cell needs an older scikit-learn to run as written.
model_all = linear_model.Lasso(normalize=True, alpha=5e2)  # set parameters

# Kept as the cell's last expression so the fitted estimator's repr is displayed.
model_all.fit(sales[all_features], sales['price'])  # learn weights

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [13]:
# Show the learned weights of model_all, which was fit on sales[all_features];
# exact zeros are the features the L1 penalty dropped.
model_all.coef_

array([     0.        ,      0.        ,      0.        ,    134.43931396,
            0.        ,      0.        ,      0.        ,      0.        ,
            0.        ,      0.        ,  24750.00458561,      0.        ,
        61749.10309071,      0.        ,      0.        ,     -0.        ,
            0.        ])

### Features from LASSO:

In [33]:
def f(x):
    """Return the feature name stored at position ``x`` of ``all_features``."""
    return all_features[x]


# np.nonzero returns one index array per dimension; take the only one.
features = np.nonzero(model_all.coef_)[0]

# Names of the features whose weight in model_all is nonzero.
np.array(list(map(f, features)))

array(['sqft_living', 'view', 'grade'], 
      dtype='|S11')

In [34]:
# Load the three pre-split files (train / validation / test), all parsed with
# the same dtype mapping as the full table.
training = pd.read_csv(
    'wk3_kc_house_train_data.csv', dtype=dtype_dict)
validation = pd.read_csv(
    'wk3_kc_house_valid_data.csv', dtype=dtype_dict)
testing = pd.read_csv(
    'wk3_kc_house_test_data.csv', dtype=dtype_dict)

In [35]:
def _add_engineered_features(frame):
    """Add the four derived feature columns to ``frame`` in place.

    Columns added (in this order): ``sqft_living_sqrt``, ``sqft_lot_sqrt``,
    ``bedrooms_square`` and ``floors_square``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``sqft_living``, ``sqft_lot``, ``bedrooms`` and ``floors``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, for convenience.
    """
    # Imported locally so this cell does not depend on an earlier cell's import.
    from math import sqrt

    frame['sqft_living_sqrt'] = frame['sqft_living'].apply(sqrt)
    frame['sqft_lot_sqrt'] = frame['sqft_lot'].apply(sqrt)
    frame['bedrooms_square'] = frame['bedrooms'] * frame['bedrooms']
    frame['floors_square'] = frame['floors'] * frame['floors']
    return frame


# Apply the identical transformation to every split so the model sees the same
# feature definitions at train, validation and test time.
for _split in (testing, training, validation):
    _add_engineered_features(_split)

In [40]:
# 13 candidate L1 penalties, evenly spaced on a log10 scale from 1e1 to 1e7.
penalties = np.logspace(start=1, stop=7, num=13)

In [65]:
# For each candidate penalty: fit LASSO on the training split, then report the
# residual sum of squares (RSS) of its predictions on the validation split.
for l1_penalty in penalties:
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(training[all_features], training['price'])  # learn weights
    validation_residuals = validation['price'] - model.predict(validation[all_features])
    validation_rss = np.sum(validation_residuals ** 2)
    # One pre-formatted argument inside parentheses prints the same text under
    # Python 2 and is valid syntax under Python 3.
    print("RSS for validation set with l1=%s: %s" % (l1_penalty, validation_rss))

RSS for validation set with l1=10.0: 3.982133273e+14
RSS for validation set with l1=31.6227766017: 3.99041900253e+14
RSS for validation set with l1=100.0: 4.29791604073e+14
RSS for validation set with l1=316.227766017: 4.63739831045e+14
RSS for validation set with l1=1000.0: 6.45898733634e+14
RSS for validation set with l1=3162.27766017: 1.22250685943e+15
RSS for validation set with l1=10000.0: 1.22250685943e+15
RSS for validation set with l1=31622.7766017: 1.22250685943e+15
RSS for validation set with l1=100000.0: 1.22250685943e+15
RSS for validation set with l1=316227.766017: 1.22250685943e+15
RSS for validation set with l1=1000000.0: 1.22250685943e+15
RSS for validation set with l1=3162277.66017: 1.22250685943e+15
RSS for validation set with l1=10000000.0: 1.22250685943e+15


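The l1=10.0 penalty has the lowest RSS on the validation set (about 3.98e+14), so it is the penalty used for the test-set evaluation below.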

In [46]:
# penalties[0] (10.0) is the penalty with the lowest validation RSS in the sweep
# printed above; the index is read off that output rather than computed.
best_penalty = penalties[0]

# Refit on the training split with that penalty and score on the held-out test split.
model = linear_model.Lasso(alpha=best_penalty, normalize=True)
model.fit(training[all_features], training['price'])  # learn weights
test_residuals = testing['price'] - model.predict(testing[all_features])
test_rss = np.sum(test_residuals ** 2)
# One pre-formatted argument inside parentheses prints the same text under
# Python 2 and is valid syntax under Python 3.
print("RSS for test set with l1=%s: %s" % (best_penalty, test_rss))

RSS for test set with l1=10.0: 9.84674025527e+13


In [47]:
# Number of nonzero weights in the current `model`: nonzero feature coefficients
# plus one more if the intercept is nonzero.
np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)

15

### Find a sparse model with 7 nonzero coefficients (the intercept is included in the count)

In [56]:
# NOTE: despite its name, `non_zeros` collects the l1 penalties that were tried
# (one per iteration); a later cell indexes it to pick penalty bounds, so the
# name and contents are kept as they are.
non_zeros = []
# Parallel to `non_zeros`: the nonzero-weight count obtained for each penalty,
# kept so the bracketing penalties can be looked up instead of read by eye.
nonzero_counts = []
max_nonzeros = 7  # target sparsity; not referenced inside this loop

# Coarse sweep: 20 penalties spaced evenly on a log10 scale from 1e1 to 1e4.
for l1_penalty in np.logspace(1, 4, num=20):
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(training[all_features], training['price'])
    # Count nonzero feature coefficients plus the intercept (if nonzero).
    non_zero = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
    # Single pre-formatted string reproduces the spacing of the original
    # comma-separated Python 2 print and is also valid under Python 3.
    print("non zero for l1_penalty= %s :  %s" % (l1_penalty, non_zero))
    non_zeros.append(l1_penalty)
    nonzero_counts.append(non_zero)

non zero for l1_penalty= 10.0 :  15
non zero for l1_penalty= 14.3844988829 :  15
non zero for l1_penalty= 20.6913808111 :  15
non zero for l1_penalty= 29.7635144163 :  15
non zero for l1_penalty= 42.8133239872 :  13
non zero for l1_penalty= 61.5848211066 :  12
non zero for l1_penalty= 88.586679041 :  11
non zero for l1_penalty= 127.42749857 :  10
non zero for l1_penalty= 183.298071083 :  7
non zero for l1_penalty= 263.665089873 :  6
non zero for l1_penalty= 379.269019073 :  6
non zero for l1_penalty= 545.559478117 :  6
non zero for l1_penalty= 784.759970351 :  5
non zero for l1_penalty= 1128.83789168 :  3
non zero for l1_penalty= 1623.77673919 :  3
non zero for l1_penalty= 2335.72146909 :  2
non zero for l1_penalty= 3359.81828628 :  1
non zero for l1_penalty= 4832.93023857 :  1
non zero for l1_penalty= 6951.92796178 :  1
non zero for l1_penalty= 10000.0 :  1


In [60]:
# Bracket the target sparsity using the coarse sweep printed above:
#   index 7 -> 127.42749857  (last penalty that still gave more than 7 nonzeros)
#   index 9 -> 263.665089873 (first penalty that gave fewer than 7 nonzeros)
# `non_zeros` holds the penalties tried, in sweep order.
l1_penalty_min, l1_penalty_max = non_zeros[7], non_zeros[9]

### 13. Quiz Question: What values did you find for l1_penalty_min and l1_penalty_max?

In [62]:
# Lower end of the penalty range searched in the narrow sweep below.
l1_penalty_min

127.42749857031335

In [63]:
# Upper end of the penalty range searched in the narrow sweep below.
l1_penalty_max

263.66508987303581

In [74]:
# Narrow sweep: 20 evenly spaced penalties between l1_penalty_min and
# l1_penalty_max. For each one, report validation RSS, the nonzero-weight count
# and which features kept a nonzero coefficient.
rsss = []  # validation RSS per penalty, in sweep order
for l1_penalty in np.linspace(l1_penalty_min, l1_penalty_max, 20):
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(training[all_features], training['price'])
    # Count nonzero feature coefficients plus the intercept (if nonzero).
    non_zero = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
    rss = np.sum((validation['price'] - model.predict(validation[all_features]))**2)
    rsss.append(rss)
    features = np.nonzero(model.coef_)[0]
    # Index all_features directly instead of going through a helper defined in
    # a much earlier cell, so this cell only needs all_features to exist.
    features_as_strings = np.array([all_features[xi] for xi in features])
    # One pre-formatted argument inside parentheses prints the same text under
    # Python 2 and is valid syntax under Python 3.
    print("l1=%s RSS = %s, non zero=%s, features= %s" % (l1_penalty, rss, non_zero, features_as_strings))

l1=127.42749857 RSS = 4.35374677103e+14, non zero=10, features= ['bedrooms' 'bathrooms' 'sqft_living' 'sqft_lot_sqrt' 'floors_square'
 'waterfront' 'view' 'grade' 'yr_built']
l1=134.597898113 RSS = 4.37009229124e+14, non zero=10, features= ['bedrooms' 'bathrooms' 'sqft_living' 'sqft_lot_sqrt' 'floors_square'
 'waterfront' 'view' 'grade' 'yr_built']
l1=141.768297655 RSS = 4.38236128387e+14, non zero=8, features= ['bathrooms' 'sqft_living' 'sqft_lot_sqrt' 'waterfront' 'view' 'grade'
 'yr_built']
l1=148.938697197 RSS = 4.391589378e+14, non zero=8, features= ['bathrooms' 'sqft_living' 'sqft_lot_sqrt' 'waterfront' 'view' 'grade'
 'yr_built']
l1=156.109096739 RSS = 4.40037365263e+14, non zero=7, features= ['bathrooms' 'sqft_living' 'waterfront' 'view' 'grade' 'yr_built']
l1=163.279496282 RSS = 4.40777489642e+14, non zero=7, features= ['bathrooms' 'sqft_living' 'waterfront' 'view' 'grade' 'yr_built']
l1=170.449895824 RSS = 4.4156669809e+14, non zero=7, features= ['bathrooms' 'sqft_living' 'wa

In [70]:
# Validation RSS at sweep position 4 (l1=156.109096739), the first penalty in
# the narrow sweep output above that reports non zero=7. The index is read off
# that printed output, not computed.
rsss[4]

440037365263316.8