In [1]:
import pandas as pd

dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)

In [2]:
from math import log, sqrt
sales['sqft_living_sqrt'] = sales['sqft_living'].apply(sqrt)
sales['sqft_lot_sqrt'] = sales['sqft_lot'].apply(sqrt)
sales['bedrooms_square'] = sales['bedrooms']*sales['bedrooms']
sales['floors_square'] = sales['floors']*sales['floors']

In [3]:
sales.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,sqft_living_sqrt,sqft_lot_sqrt,bedrooms_square,floors_square
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1.0,0,0,...,0,98178,47.5112,-122.257,1340.0,5650.0,34.351128,75.166482,9.0,1.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2.0,0,0,...,1991,98125,47.721,-122.319,1690.0,7639.0,50.695167,85.099941,9.0,4.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1.0,0,0,...,0,98028,47.7379,-122.233,2720.0,8062.0,27.748874,100.0,4.0,1.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1.0,0,0,...,0,98136,47.5208,-122.393,1360.0,5000.0,44.271887,70.710678,16.0,1.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1.0,0,0,...,0,98074,47.6168,-122.045,1800.0,7503.0,40.987803,89.88882,9.0,1.0


In [4]:
from sklearn import linear_model  # using scikit-learn

all_features = ['bedrooms', 'bedrooms_square',
            'bathrooms',
            'sqft_living', 'sqft_living_sqrt',
            'sqft_lot', 'sqft_lot_sqrt',
            'floors', 'floors_square',
            'waterfront', 'view', 'condition', 'grade',
            'sqft_above',
            'sqft_basement',
            'yr_built', 'yr_renovated']

model_all = linear_model.Lasso(alpha=5e2, normalize=True) # set parameters
model_all.fit(sales[all_features], sales['price']) # learn weights

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [5]:
model_all.coef_

array([     0.        ,      0.        ,      0.        ,    134.43931396,
            0.        ,      0.        ,      0.        ,      0.        ,
            0.        ,      0.        ,  24750.00458561,      0.        ,
        61749.10309071,      0.        ,      0.        ,     -0.        ,
            0.        ])

* sqft_living
* view
* grade

In [7]:
testing = pd.read_csv('wk3_kc_house_test_data.csv', dtype=dtype_dict)
training = pd.read_csv('wk3_kc_house_train_data.csv', dtype=dtype_dict)
validation = pd.read_csv('wk3_kc_house_valid_data.csv', dtype=dtype_dict)

In [35]:
testing['sqft_living_sqrt'] = testing['sqft_living'].apply(sqrt)
testing['sqft_lot_sqrt'] = testing['sqft_lot'].apply(sqrt)
testing['bedrooms_square'] = testing['bedrooms']*testing['bedrooms']
testing['floors_square'] = testing['floors']*testing['floors']

training['sqft_living_sqrt'] = training['sqft_living'].apply(sqrt)
training['sqft_lot_sqrt'] = training['sqft_lot'].apply(sqrt)
training['bedrooms_square'] = training['bedrooms']*training['bedrooms']
training['floors_square'] = training['floors']*training['floors']

validation['sqft_living_sqrt'] = validation['sqft_living'].apply(sqrt)
validation['sqft_lot_sqrt'] = validation['sqft_lot'].apply(sqrt)
validation['bedrooms_square'] = validation['bedrooms']*validation['bedrooms']
validation['floors_square'] = validation['floors']*validation['floors']

In [36]:
import numpy as np
l1_penalties = np.logspace(1, 7, num=13)

In [48]:
for l1_penalty in l1_penalties:
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(training[all_features], training['price']) # learn weights
    predictions = model.predict(validation[all_features])
    RSS = np.sum((predictions - validation['price']) ** 2)
    print 'RSS: ' + str(RSS) + ' l1_penalty: ' + str(l1_penalty)

RSS: 3.982133273e+14 l1_penalty: 10.0
RSS: 3.96831833944e+14 l1_penalty: 14.3844988829
RSS: 3.96210901853e+14 l1_penalty: 20.6913808111
RSS: 3.98215534575e+14 l1_penalty: 29.7635144163
RSS: 4.0687725852e+14 l1_penalty: 42.8133239872
RSS: 4.24647490491e+14 l1_penalty: 61.5848211066
RSS: 4.27906308934e+14 l1_penalty: 88.586679041
RSS: 4.35374677103e+14 l1_penalty: 127.42749857
RSS: 4.43107216261e+14 l1_penalty: 183.298071083
RSS: 4.54176669663e+14 l1_penalty: 263.665089873
RSS: 4.78132980832e+14 l1_penalty: 379.269019073
RSS: 5.31397181867e+14 l1_penalty: 545.559478117
RSS: 5.94043306274e+14 l1_penalty: 784.759970351
RSS: 6.74059169986e+14 l1_penalty: 1128.83789168
RSS: 8.02609410823e+14 l1_penalty: 1623.77673919
RSS: 1.06125525287e+15 l1_penalty: 2335.72146909
RSS: 1.22250685943e+15 l1_penalty: 3359.81828628
RSS: 1.22250685943e+15 l1_penalty: 4832.93023857
RSS: 1.22250685943e+15 l1_penalty: 6951.92796178
RSS: 1.22250685943e+15 l1_penalty: 10000.0


In [49]:
model = linear_model.Lasso(alpha=20.6913808111, normalize=True)
model.fit(training[all_features], training['price']) # learn weights

Lasso(alpha=20.6913808111, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [50]:
model.coef_

array([ -1.67201164e+04,   2.46161126e+02,   4.67505582e+04,
         5.20597513e+02,  -3.47045876e+04,   5.16827492e-01,
        -5.31184505e+02,   0.00000000e+00,   5.18627794e+03,
         6.09205096e+05,   3.92492514e+04,   2.12654824e+04,
         1.26929364e+05,   0.00000000e+00,   0.00000000e+00,
        -3.31337004e+03,   7.23386017e+00])

In [51]:
np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)


15

In [52]:
max_nonzeros = 7

In [53]:
l1_penalties = np.logspace(1, 4, num=20)

In [54]:
nonzeros = list()
for l1_penalty in l1_penalties:
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(training[all_features], training['price']) # learn weights
    predictions = model.predict(validation[all_features])
    RSS = np.sum((predictions - validation['price']) ** 2)
    print '===================='
    print 'l1_penalty: ' + str(l1_penalty)
    print 'RSS: ' + str(RSS)
    print 'nonzeros: ' + str(np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_))
    nonzeros.append(np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_))


l1_penalty: 10.0
RSS: 3.982133273e+14
nonzeros: 15
l1_penalty: 14.3844988829
RSS: 3.96831833944e+14
nonzeros: 15
l1_penalty: 20.6913808111
RSS: 3.96210901853e+14
nonzeros: 15
l1_penalty: 29.7635144163
RSS: 3.98215534575e+14
nonzeros: 15
l1_penalty: 42.8133239872
RSS: 4.0687725852e+14
nonzeros: 13
l1_penalty: 61.5848211066
RSS: 4.24647490491e+14
nonzeros: 12
l1_penalty: 88.586679041
RSS: 4.27906308934e+14
nonzeros: 11
l1_penalty: 127.42749857
RSS: 4.35374677103e+14
nonzeros: 10
l1_penalty: 183.298071083
RSS: 4.43107216261e+14
nonzeros: 7
l1_penalty: 263.665089873
RSS: 4.54176669663e+14
nonzeros: 6
l1_penalty: 379.269019073
RSS: 4.78132980832e+14
nonzeros: 6
l1_penalty: 545.559478117
RSS: 5.31397181867e+14
nonzeros: 6
l1_penalty: 784.759970351
RSS: 5.94043306274e+14
nonzeros: 5
l1_penalty: 1128.83789168
RSS: 6.74059169986e+14
nonzeros: 3
l1_penalty: 1623.77673919
RSS: 8.02609410823e+14
nonzeros: 3
l1_penalty: 2335.72146909
RSS: 1.06125525287e+15
nonzeros: 2
l1_penalty: 3359.81828628
RSS:

In [55]:
l1_penalty_min = 183.298071083
l1_penalty_max = 10000.0

In [56]:
nonzeros_2 = list()
for l1_penalty in np.linspace(l1_penalty_min,l1_penalty_max,20):
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(training[all_features], training['price']) # learn weights
    predictions = model.predict(validation[all_features])
    RSS = np.sum((predictions - validation['price']) ** 2)
    print '===================='
    print 'l1_penalty: ' + str(l1_penalty)
    print 'RSS: ' + str(RSS)
    print 'nonzeros: ' + str(np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_))
    nonzeros_2.append(np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_))

l1_penalty: 183.298071083
RSS: 4.43107216261e+14
nonzeros: 7
l1_penalty: 699.966593658
RSS: 5.73440541659e+14
nonzeros: 5
l1_penalty: 1216.63511623
RSS: 6.93148642947e+14
nonzeros: 3
l1_penalty: 1733.30363881
RSS: 8.37930324858e+14
nonzeros: 3
l1_penalty: 2249.97216138
RSS: 1.02773787124e+15
nonzeros: 2
l1_penalty: 2766.64068396
RSS: 1.22250685943e+15
nonzeros: 1
l1_penalty: 3283.30920653
RSS: 1.22250685943e+15
nonzeros: 1
l1_penalty: 3799.97772911
RSS: 1.22250685943e+15
nonzeros: 1
l1_penalty: 4316.64625168
RSS: 1.22250685943e+15
nonzeros: 1
l1_penalty: 4833.31477425
RSS: 1.22250685943e+15
nonzeros: 1
l1_penalty: 5349.98329683
RSS: 1.22250685943e+15
nonzeros: 1
l1_penalty: 5866.6518194
RSS: 1.22250685943e+15
nonzeros: 1
l1_penalty: 6383.32034198
RSS: 1.22250685943e+15
nonzeros: 1
l1_penalty: 6899.98886455
RSS: 1.22250685943e+15
nonzeros: 1
l1_penalty: 7416.65738713
RSS: 1.22250685943e+15
nonzeros: 1
l1_penalty: 7933.3259097
RSS: 1.22250685943e+15
nonzeros: 1
l1_penalty: 8449.99443228


In [57]:
    model = linear_model.Lasso(alpha=183.298071083, normalize=True)
    model.fit(training[all_features], training['price']) # learn weights


Lasso(alpha=183.298071083, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [58]:
model.coef_

array([ -0.00000000e+00,  -0.00000000e+00,   4.84964317e+03,
         1.65210126e+02,   0.00000000e+00,  -0.00000000e+00,
        -0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         4.84780808e+05,   4.14997727e+04,   0.00000000e+00,
         1.13406888e+05,   0.00000000e+00,   0.00000000e+00,
        -2.41386679e+03,   0.00000000e+00])

In [59]:
all_features

['bedrooms',
 'bedrooms_square',
 'bathrooms',
 'sqft_living',
 'sqft_living_sqrt',
 'sqft_lot',
 'sqft_lot_sqrt',
 'floors',
 'floors_square',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated']

* bathrooms
* sqft_living
* waterfront
* view
* grade
* yr_built