In [1]:
import pandas as pd
import numpy as np

In [2]:
# Column-name -> dtype mapping handed to pd.read_csv, grouped by target type.
dtype_dict = {}
# Identifier-like columns are kept as text.
dtype_dict.update(dict.fromkeys(['id', 'date', 'zipcode'], str))
# Columns parsed as floating point.
dtype_dict.update(dict.fromkeys(
    ['price', 'bedrooms', 'bathrooms', 'floors',
     'sqft_living', 'sqft_living15', 'sqft_lot15',
     'lat', 'long'],
    float))
# Columns parsed as integers.
dtype_dict.update(dict.fromkeys(
    ['sqft_lot', 'sqft_above', 'sqft_basement',
     'waterfront', 'view', 'condition', 'grade',
     'yr_built', 'yr_renovated'],
    int))

In [3]:
# Load the complete house-sales table, parsing each column with the dtype
# declared in dtype_dict.
sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)

In [4]:
# Preview the first rows to sanity-check the parsed columns.
sales.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900,3,1.0,1180,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000,3,2.25,2570,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000,2,1.0,770,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000,4,3.0,1960,5000,1,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000,3,2.0,1680,8080,1,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [5]:
# Import kept unchanged: other cells may rely on these names being in scope.
from math import log, sqrt

# Derived columns that give the linear model non-linear views of the raw inputs.
# np.sqrt / ** operate on the whole column at once instead of calling a Python
# function once per row as Series.apply(sqrt) does.
sales['sqft_living_sqrt'] = np.sqrt(sales['sqft_living'])
sales['sqft_lot_sqrt'] = np.sqrt(sales['sqft_lot'])
sales['bedrooms_square'] = sales['bedrooms'] ** 2
sales['floors_square'] = sales['floors'] ** 2

In [6]:
from sklearn import linear_model  # using scikit-learn

# Every candidate input: raw columns plus the derived square / square-root columns.
all_features = [
    'bedrooms',
    'bedrooms_square',
    'bathrooms',
    'sqft_living',
    'sqft_living_sqrt',
    'sqft_lot',
    'sqft_lot_sqrt',
    'floors',
    'floors_square',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
]

# Lasso with a fixed L1 penalty of 500, fitted on the full sales table.
model_all = linear_model.Lasso(normalize=True, alpha=500.0)
# Kept as the cell's last expression so the fitted estimator is displayed.
model_all.fit(X=sales[all_features], y=sales['price'])

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [10]:
# Show the fitted intercept and the coefficient array of model_all
# (one coefficient per column passed to fit, i.e. per entry of all_features).
model_all.intercept_, model_all.coef_

(-218136.21403515921,
 array([     0.        ,      0.        ,      0.        ,    134.43931396,
             0.        ,      0.        ,      0.        ,      0.        ,
             0.        ,      0.        ,  24750.00458561,      0.        ,
         61749.10309071,      0.        ,      0.        ,     -0.        ,
             0.        ]))

In [11]:
# Load the three pre-split tables (test, train, validation) in one statement,
# all parsed with the shared dtype mapping.
testing, training, validation = (
    pd.read_csv('wk3_kc_house_test_data.csv', dtype=dtype_dict),
    pd.read_csv('wk3_kc_house_train_data.csv', dtype=dtype_dict),
    pd.read_csv('wk3_kc_house_valid_data.csv', dtype=dtype_dict),
)

In [12]:
def add_derived_features(frame):
    """Add the square-root and squared feature columns to `frame` in place.

    Columns added (in this order): 'sqft_living_sqrt', 'sqft_lot_sqrt',
    'bedrooms_square', 'floors_square'. The input must already contain
    'sqft_living', 'sqft_lot', 'bedrooms' and 'floors'.

    Returns the same DataFrame object for convenience.
    """
    # Vectorized column operations; no per-row Python calls.
    frame['sqft_living_sqrt'] = np.sqrt(frame['sqft_living'])
    frame['sqft_lot_sqrt'] = np.sqrt(frame['sqft_lot'])
    frame['bedrooms_square'] = frame['bedrooms'] ** 2
    frame['floors_square'] = frame['floors'] ** 2
    return frame


# Apply the identical transformation to every split so the three tables
# cannot drift apart through copy-paste edits.
for split_frame in (testing, training, validation):
    add_derived_features(split_frame)

In [13]:
# Preview the candidate penalties: 13 values log-spaced from 1e1 to 1e7.
np.logspace(1, 7, num=13)

array([  1.00000000e+01,   3.16227766e+01,   1.00000000e+02,
         3.16227766e+02,   1.00000000e+03,   3.16227766e+03,
         1.00000000e+04,   3.16227766e+04,   1.00000000e+05,
         3.16227766e+05,   1.00000000e+06,   3.16227766e+06,
         1.00000000e+07])

In [14]:
# Candidate L1 penalties for the first search: 13 values log-spaced from 1e1 to 1e7.
l1_penalty_list = np.logspace(1, 7, num=13)

In [21]:
# Fit one lasso per candidate penalty on the training split and measure the
# residual sum of squares (RSS) on the validation split.
# The RSS values are stored (in l1_penalty_list order) as well as printed, so
# the best penalty can be looked up instead of being read off the output.
validation_rss = []
for l1_penalty in l1_penalty_list:
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)  # set parameters
    model.fit(training[all_features], training['price'])  # learn weights
    pred = model.predict(validation[all_features])
    # np.sum on plain arrays is vectorized and, like the built-in sum, lets a
    # NaN propagate instead of silently skipping it.
    rss = np.sum((pred - validation['price'].values) ** 2)
    validation_rss.append(rss)
    print(rss)

3.982133273e+14
3.99041900253e+14
4.29791604073e+14
4.63739831045e+14
6.45898733634e+14
1.22250685943e+15
1.22250685943e+15
1.22250685943e+15
1.22250685943e+15
1.22250685943e+15
1.22250685943e+15
1.22250685943e+15
1.22250685943e+15


In [22]:
# Chosen by hand from the validation RSS values printed by the search loop:
# the smallest RSS belongs to the first penalty in the list.
best_l1_penalty = l1_penalty_list[0]

In [23]:
# Display the selected penalty value.
best_l1_penalty

10.0

In [24]:
# Refit on the training split with the selected penalty, then report the
# residual sum of squares (RSS) on the held-out test split.
model = linear_model.Lasso(alpha=best_l1_penalty, normalize=True)  # set parameters
model.fit(training[all_features], training['price'])  # learn weights
pred = model.predict(testing[all_features])
# Vectorized sum over plain arrays instead of Python-level iteration over a Series.
test_rss = np.sum((pred - testing['price'].values) ** 2)
print(test_rss)

9.84674025527e+13


In [25]:
# Number of nonzero weights in the current model; the intercept adds one more
# to the count when it is nonzero.
np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)

15

In [28]:
# Show the coefficient array and the intercept of the current model.
model.coef_, model.intercept_

(array([ -1.61445628e+04,   3.73245384e+02,   5.08412433e+04,
          6.17853560e+02,  -4.44113549e+04,   7.85623065e-01,
         -7.01194765e+02,  -0.00000000e+00,   5.01420046e+03,
          6.19488752e+05,   3.80418557e+04,   2.49987718e+04,
          1.28716235e+05,   0.00000000e+00,   0.00000000e+00,
         -3.29383118e+03,   1.00573209e+01]), 6630155.6686283741)

In [42]:
# Coarse search for the penalty range in which the model has exactly
# `max_nonzeros` nonzero weights (intercept included in the count).
max_nonzeros = 7
l1_penalty_list = np.logspace(1, 4, num=20)
res = []  # [l1_penalty, nonzero-weight count] pairs, in search order

for l1_penalty in l1_penalty_list:
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)  # set parameters
    model.fit(training[all_features], training['price'])  # learn weights
    n_features = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
    print('%s %s' % (l1_penalty, n_features))
    res.append([l1_penalty, n_features])

# Penalties whose models are denser / sparser than the target.
too_dense = [x[0] for x in res if x[1] > max_nonzeros]
too_sparse = [x[0] for x in res if x[1] < max_nonzeros]
if not too_dense or not too_sparse:
    # Without models on both sides of the target there is no bracket to refine.
    raise ValueError(
        'l1_penalty_list does not bracket max_nonzeros=%d; widen the search range'
        % max_nonzeros)

# Bracket for the fine search: largest penalty that is still too dense and
# smallest penalty that is already too sparse.
l1_penalty_min = max(too_dense)
l1_penalty_max = min(too_sparse)

10.0 15
14.3844988829 15
20.6913808111 15
29.7635144163 15
42.8133239872 13
61.5848211066 12
88.586679041 11
127.42749857 10
183.298071083 7
263.665089873 6
379.269019073 6
545.559478117 6
784.759970351 5
1128.83789168 3
1623.77673919 3
2335.72146909 2
3359.81828628 1
4832.93023857 1
6951.92796178 1
10000.0 1


In [43]:
# Display the bracket found by the coarse search (upper bound first).
l1_penalty_max, l1_penalty_min

(263.66508987303581, 127.42749857031335)

In [46]:
# Fine search: 20 evenly spaced penalties inside the bracket. For each one,
# fit on the training split and record the validation RSS together with the
# nonzero-weight count (intercept included).
# Results are stored as (rss, l1_penalty, n_features) tuples as well as
# printed, so the chosen penalty can be taken from data rather than retyped.
narrow_search = []
for l1_penalty in np.linspace(l1_penalty_min, l1_penalty_max, 20):
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)  # set parameters
    model.fit(training[all_features], training['price'])  # learn weights
    pred = model.predict(validation[all_features])
    n_features = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
    # Vectorized sum over plain arrays instead of Python-level iteration over a Series.
    rss = np.sum((pred - validation['price'].values) ** 2)
    narrow_search.append((rss, l1_penalty, n_features))
    print('%s %s %s' % (rss, l1_penalty, n_features))

4.35374677103e+14 127.42749857 10
4.37009229124e+14 134.597898113 10
4.38236128387e+14 141.768297655 8
4.391589378e+14 148.938697197 8
4.40037365263e+14 156.109096739 7
4.40777489642e+14 163.279496282 7
4.4156669809e+14 170.449895824 7
4.42406413189e+14 177.620295366 7
4.43296716874e+14 184.790694908 7
4.44239780526e+14 191.961094451 7
4.45230739843e+14 199.131493993 7
4.46268896865e+14 206.301893535 6
4.47112919435e+14 213.472293077 6
4.47998187852e+14 220.64269262 6
4.48924706673e+14 227.813092162 6
4.498924759e+14 234.983491704 6
4.50901498778e+14 242.153891246 6
4.51952426655e+14 249.324290789 6
4.53043924368e+14 256.494690331 6
4.54176669663e+14 263.665089873 6


In [47]:
# Hand-copied from the fine-search output: the penalty with the lowest
# validation RSS among those that left exactly 7 nonzero weights.
# NOTE(review): this literal appears to be the printed (12-significant-digit)
# value, not the exact np.linspace entry -- confirm that is acceptable.
opt_l1_penalty = 156.109096739

In [48]:
# Refit on the training split with the penalty chosen for the sparse model.
model = linear_model.Lasso(normalize=True, alpha=opt_l1_penalty)
# Kept as the cell's last expression so the fitted estimator is displayed.
model.fit(X=training[all_features], y=training['price'])

Lasso(alpha=156.109096739, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [52]:
# Print the intercept, then one (coefficient, feature name) tuple per feature.
print(model.intercept_)
# The model was fitted on training[all_features], so its coefficients line up
# with all_features directly; there is no need to slice the DataFrame just to
# iterate over its column labels.
for value in zip(model.coef_, all_features):
    print(value)

4422190.27912
(-0.0, 'bedrooms')
(-0.0, 'bedrooms_square')
(10610.890284463205, 'bathrooms')
(163.38025164760865, 'sqft_living')
(0.0, 'sqft_living_sqrt')
(-0.0, 'sqft_lot')
(-0.0, 'sqft_lot_sqrt')
(0.0, 'floors')
(0.0, 'floors_square')
(506451.68711510196, 'waterfront')
(41960.043554856777, 'view')
(0.0, 'condition')
(116253.55369973967, 'grade')
(0.0, 'sqft_above')
(0.0, 'sqft_basement')
(-2612.2348803597311, 'yr_built')
(0.0, 'yr_renovated')
