In [18]:
import pandas as pd 
import sklearn.linear_model
from math import log, sqrt
import numpy as np
import sys

In [4]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

sales = pd.read_csv('../kc_house_data.csv', dtype=dtype_dict)

In [5]:
sales['sqft_living_sqrt'] = sales['sqft_living'].apply(sqrt)
sales['sqft_lot_sqrt'] = sales['sqft_lot'].apply(sqrt)
sales['bedrooms_square'] = sales['bedrooms']*sales['bedrooms']
sales['floors_square'] = sales['floors']*sales['floors']

In [6]:
all_features = ['bedrooms', 'bedrooms_square',
            'bathrooms',
            'sqft_living', 'sqft_living_sqrt',
            'sqft_lot', 'sqft_lot_sqrt',
            'floors', 'floors_square',
            'waterfront', 'view', 'condition', 'grade',
            'sqft_above',
            'sqft_basement',
            'yr_built', 'yr_renovated']

model_all = linear_model.Lasso(alpha=5e2, normalize=True) # set parameters
model_all.fit(sales[all_features], sales['price']) # learn weights

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=True, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [9]:
model_all.coef_

array([    0.        ,     0.        ,     0.        ,   134.43931396,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        , 24750.00458561,     0.        ,
       61749.10309071,     0.        ,     0.        ,    -0.        ,
           0.        ])

In [10]:
testing = pd.read_csv('../wk3_kc_house_test_data.csv', dtype=dtype_dict)
training = pd.read_csv('../wk3_kc_house_train_data.csv', dtype=dtype_dict)
validation = pd.read_csv('../wk3_kc_house_valid_data.csv', dtype=dtype_dict)

In [11]:
testing['sqft_living_sqrt'] = testing['sqft_living'].apply(sqrt)
testing['sqft_lot_sqrt'] = testing['sqft_lot'].apply(sqrt)
testing['bedrooms_square'] = testing['bedrooms']*testing['bedrooms']
testing['floors_square'] = testing['floors']*testing['floors']

training['sqft_living_sqrt'] = training['sqft_living'].apply(sqrt)
training['sqft_lot_sqrt'] = training['sqft_lot'].apply(sqrt)
training['bedrooms_square'] = training['bedrooms']*training['bedrooms']
training['floors_square'] = training['floors']*training['floors']

validation['sqft_living_sqrt'] = validation['sqft_living'].apply(sqrt)
validation['sqft_lot_sqrt'] = validation['sqft_lot'].apply(sqrt)
validation['bedrooms_square'] = validation['bedrooms']*validation['bedrooms']
validation['floors_square'] = validation['floors']*validation['floors']

In [21]:
minRSS = sys.maxsize
bestL1 = 0

for i in np.logspace(1, 7, num=13):
    model = linear_model.Lasso(alpha = i, normalize=True)
    model.fit(training[all_features], training['price'])
    
    RSS = ((model.predict(validation[all_features]) - validation.price) ** 2).sum()
    print ("RSS: ", RSS, " for L1: ", i)
    
    if RSS < minRSS:
        minRSS = RSS
        bestL1 = i

print ('\nMinimum RSS:', minRSS, ' for L1: ', bestL1)

RSS:  398213327300134.9  for L1:  10.0
RSS:  399041900253346.9  for L1:  31.622776601683793
RSS:  429791604072559.6  for L1:  100.0
RSS:  463739831045121.1  for L1:  316.22776601683796
RSS:  645898733633800.6  for L1:  1000.0
RSS:  1222506859427163.0  for L1:  3162.2776601683795
RSS:  1222506859427163.0  for L1:  10000.0
RSS:  1222506859427163.0  for L1:  31622.776601683792
RSS:  1222506859427163.0  for L1:  100000.0
RSS:  1222506859427163.0  for L1:  316227.7660168379
RSS:  1222506859427163.0  for L1:  1000000.0
RSS:  1222506859427163.0  for L1:  3162277.6601683795
RSS:  1222506859427163.0  for L1:  10000000.0

Minimum RSS: 398213327300134.9  for L1:  10.0


In [22]:
model = linear_model.Lasso(alpha=bestL1, normalize=True)
model.fit(training[all_features], training['price'])

Lasso(alpha=10.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=True, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [23]:
print (model.coef_)

[-1.61445628e+04  3.73245384e+02  5.08412433e+04  6.17853560e+02
 -4.44113549e+04  7.85623065e-01 -7.01194765e+02 -0.00000000e+00
  5.01420046e+03  6.19488752e+05  3.80418557e+04  2.49987718e+04
  1.28716235e+05  0.00000000e+00  0.00000000e+00 -3.29383118e+03
  1.00573209e+01]


In [24]:
print (model.intercept_)

6630155.66862836


In [25]:
np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)

15

In [26]:
max_nonzeros = 7
L1_range = np.logspace(1, 4, num=20)
print (L1_range)

[   10.            14.38449888    20.69138081    29.76351442
    42.81332399    61.58482111    88.58667904   127.42749857
   183.29807108   263.66508987   379.26901907   545.55947812
   784.75997035  1128.83789168  1623.77673919  2335.72146909
  3359.81828628  4832.93023857  6951.92796178 10000.        ]


In [27]:
l1_penalty_min = L1_range[0]
l1_penalty_max = L1_range[19]

for L1 in L1_range:
    model = linear_model.Lasso(alpha=L1, normalize=True)
    model.fit(training[all_features], training['price']) 
    
    nonZeroes = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
    
    if (nonZeroes > max_nonzeros) and (L1 > l1_penalty_min) :
        l1_penalty_min = L1
    
    
    if (nonZeroes < max_nonzeros) and (L1 < l1_penalty_max):
        l1_penalty_max = L1
    
print ('l1_penalty_min: ', l1_penalty_min)
print ('l1_penalty_max: ', l1_penalty_max)

l1_penalty_min:  127.42749857031335
l1_penalty_max:  263.6650898730358


In [28]:
L1_narrow_interval = np.linspace(l1_penalty_min,l1_penalty_max,20)
print (L1_narrow_interval)

[127.42749857 134.59789811 141.76829765 148.9386972  156.10909674
 163.27949628 170.44989582 177.62029537 184.79069491 191.96109445
 199.13149399 206.30189354 213.47229308 220.64269262 227.81309216
 234.9834917  242.15389125 249.32429079 256.49469033 263.66508987]


In [29]:
minRSS = sys.maxsize
bestL1 = 0

for L1 in L1_narrow_interval:
    model = linear_model.Lasso(alpha=L1, normalize=True)
    model.fit(validation[all_features], validation['price'])
   
    RSS = ((model.predict(validation[all_features]) - validation.price) ** 2).sum()
    print ("RSS: ", RSS, " for L1: ", L1)
     
    if RSS < minRSS:
        minRSS = RSS
        bestL1 = L1
        
print ('\nMinimum RSS:', minRSS, ' for L1: ', bestL1)

RSS:  439071925352512.9  for L1:  127.42749857031335
RSS:  441128710027315.1  for L1:  134.5978981125619
RSS:  442825187166451.4  for L1:  141.76829765481045
RSS:  443877431690778.6  for L1:  148.938697197059
RSS:  444981125900466.7  for L1:  156.10909673930755
RSS:  446136908849758.06  for L1:  163.2794962815561
RSS:  447344709605348.5  for L1:  170.44989582380464
RSS:  448566985859169.9  for L1:  177.6202953660532
RSS:  449776911889217.56  for L1:  184.79069490830176
RSS:  450854996344142.1  for L1:  191.96109445055032
RSS:  451974123723515.1  for L1:  199.13149399279888
RSS:  453134295270741.0  for L1:  206.3018935350474
RSS:  454335490310594.25  for L1:  213.47229307729594
RSS:  455577721405391.25  for L1:  220.6426926195445
RSS:  456860991130727.4  for L1:  227.81309216179307
RSS:  458185301192966.94  for L1:  234.98349170404163
RSS:  459550648291129.0  for L1:  242.1538912462902
RSS:  460956444625907.9  for L1:  249.32429078853872
RSS:  462403851116601.44  for L1:  256.4946903307

In [30]:
model = linear_model.Lasso(alpha=bestL1, normalize=True)
model.fit(validation[all_features], validation['price']) 

print (all_features)
print ()
print (model.coef_)
print ()
print (model.intercept_)

['bedrooms', 'bedrooms_square', 'bathrooms', 'sqft_living', 'sqft_living_sqrt', 'sqft_lot', 'sqft_lot_sqrt', 'floors', 'floors_square', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated']

[-3.29055642e+03 -0.00000000e+00  1.18432899e+04  1.44136981e+02
  0.00000000e+00 -0.00000000e+00 -2.74611191e+01  0.00000000e+00
  0.00000000e+00  4.70965287e+05  4.93452902e+04  0.00000000e+00
  1.20591515e+05  0.00000000e+00  0.00000000e+00 -2.50635813e+03
  4.18993990e+00]

4230438.4260399155
