In [1]:
import pandas as pd
import numpy as np

In [25]:
sales = pd.read_csv('kc_house_data 2.csv')

In [26]:
from math import log, sqrt

## Simple feature engineering 

In [27]:
sales['sqft_living_sqrt'] = sales['sqft_living'].apply(sqrt)
sales['sqft_lot_sqrt'] = sales['sqft_lot'].apply(sqrt)
sales['bedrooms_square'] = sales['bedrooms']*sales['bedrooms']

# In the dataset, 'floors' was defined with type string, 
# so we'll convert them to float, before creating a new feature.
sales['floors'] = sales['floors'].astype(float) 
sales['floors_square'] = sales['floors']*sales['floors']

In [28]:
sales.head(5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,sqft_living_sqrt,sqft_lot_sqrt,bedrooms_square,floors_square
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,0,98178,47.5112,-122.257,1340,5650,34.351128,75.166482,9,1.0
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,1991,98125,47.721,-122.319,1690,7639,50.695167,85.099941,9,4.0
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,0,98028,47.7379,-122.233,2720,8062,27.748874,100.0,4,1.0
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,0,98136,47.5208,-122.393,1360,5000,44.271887,70.710678,16,1.0
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,0,98074,47.6168,-122.045,1800,7503,40.987803,89.88882,9,1.0


## Learn regression weights with L1 penalty 

In [56]:
all_features = ['bedrooms', 'bedrooms_square',
            'bathrooms',
            'sqft_living', 'sqft_living_sqrt',
            'sqft_lot', 'sqft_lot_sqrt',
            'floors', 'floors_square',
            'waterfront', 'view', 'condition', 'grade',
            'sqft_above',
            'sqft_basement',
            'yr_built', 'yr_renovated']

In [32]:
from sklearn import linear_model 
penalty = 1e10 #penalty too big 
model_all = linear_model.Lasso(alpha = 100000000)
model_all.fit(sales[all_features], sales['price'])

Lasso(alpha=100000000, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [33]:
 print (pd.Series(model_all.coef_,index=all_features))

bedrooms              0.000000
bedrooms_square       0.000000
bathrooms             0.000000
sqft_living         161.142560
sqft_living_sqrt      0.000000
sqft_lot              0.118888
sqft_lot_sqrt         0.000000
floors                0.000000
floors_square         0.000000
waterfront            0.000000
view                  0.000000
condition             0.000000
grade                 0.000000
sqft_above            0.000000
sqft_basement         0.000000
yr_built             -0.000000
yr_renovated          0.000000
dtype: float64


In [34]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [49]:
train_valid, test = train_test_split(sales, test_size=.1, random_state = 1)
training , validation = train_test_split(train_valid,test_size=.5 , random_state = 1)

In [73]:
validation.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,sqft_living_sqrt,sqft_lot_sqrt,bedrooms_square,floors_square
1512,5255690060,20150318T000000,413000.0,5,2.5,2900,8711,1.0,0,0,...,0,98011,47.7752,-122.197,2340,8869,53.851648,93.332738,25,1.0
8718,8029650040,20140519T000000,373000.0,3,2.5,1670,3565,2.0,0,0,...,0,98072,47.7623,-122.161,1510,3770,40.865633,59.707621,9,4.0
21430,2838000180,20150220T000000,700000.0,3,2.5,2230,4006,2.0,0,0,...,0,98133,47.73,-122.335,2230,4180,47.222876,63.29297,9,4.0
3181,2329810590,20140806T000000,285000.0,3,2.5,1940,9874,2.0,0,0,...,0,98042,47.3794,-122.113,1860,8875,44.045431,99.368003,9,4.0
4021,2473371780,20140924T000000,359950.0,5,2.25,2450,9432,2.0,0,0,...,0,98058,47.4519,-122.13,2310,9100,49.497475,97.118484,25,4.0


In [69]:
import numpy as np
import pprint 

validation_rss={}
for l1_penalty in np.logspace(1, 7, num=13):
    model = linear_model.Lasso(alpha = l1_penalty, normalize=True)
    model.fit(training[all_features], training['price'])
    prediction = model.predict(validation[all_features])
    residuals = validation['price'] - prediction
    rss = sum(residuals**2)
    validation_rss[l1_penalty] = rss
#     ret = [rss]
#     ret.extend([model.intercept_])
#     ret.extend(model.coef_)
#     print(ret) 


[416119312130937.94, 6321858.6162638394, -15949.528622527692, 404.28392348826179, 51782.196054080341, 548.98229807789153, -38956.357150676558, 0.37173712082637389, -478.83643808923892, -0.0, 7341.5012050050645, 449594.44896866375, 38178.599702421401, 25891.824832001286, 127794.40943459232, 0.0, 8.3955639718310717, -3200.796405360029, 12.491310357365588]
[426336112192818.19, 6021042.6099796165, -17599.287394410811, 155.10498674676472, 45376.776016495925, 352.45499596861561, -19142.591803944135, 0.0, -233.30433584762517, 0.0, 6577.7317348750485, 438144.00108264352, 40515.196382407252, 18377.841928199738, 123581.24360750279, 0.0, 2.3019911109983378, -3256.3592607581572, 5.854013394692573]
[459629571807217.75, 5007429.085606155, -7543.1647228553002, -0.0, 27592.640903786309, 156.49029411557879, -0.0, -0.0, -106.79158321578888, 0.0, 2391.6177920733048, 401681.77976130217, 42106.145772834323, 4811.2676824490272, 117837.65728450517, 0.0, 0.0, -2920.4137588430294, 0.0]
[509831182055762.31, 234

In [67]:
print (min(validation_rss.items(), key=lambda x: x[1]) )

(10.0, 416119312130937.94)


In [70]:
model_test = linear_model.Lasso(alpha = 10.0, normalize=True)
model_test.fit(training[all_features], training['price'])
prediction_test = model.predict(test[all_features])
residuals_test = test['price']-prediction_test
rss_test = sum(residuals_test**2)

In [71]:
print (rss_test)
print (prediction_test)

3.47133389363e+14
[ 538291.44236504  538291.44236504  538291.44236504 ...,  538291.44236504
  538291.44236504  538291.44236504]


In [72]:
print (pd.Series(model_test.coef_,index=all_features))

bedrooms            -15949.528623
bedrooms_square        404.283923
bathrooms            51782.196054
sqft_living            548.982298
sqft_living_sqrt    -38956.357151
sqft_lot                 0.371737
sqft_lot_sqrt         -478.836438
floors                  -0.000000
floors_square         7341.501205
waterfront          449594.448969
view                 38178.599702
condition            25891.824832
grade               127794.409435
sqft_above               0.000000
sqft_basement            8.395564
yr_built             -3200.796405
yr_renovated            12.491310
dtype: float64


## Limit the number of nonzero weights

In [74]:
max_nonzeros = 7

In [84]:
#possible l1 penalty values 
l1_penalty_values = np.logspace(2, 3, num=20)
print (l1_penalty_values)

[  100.           112.88378917   127.42749857   143.84498883   162.37767392
   183.29807108   206.91380811   233.57214691   263.66508987   297.63514416
   335.98182863   379.26901907   428.13323987   483.29302386   545.55947812
   615.84821107   695.19279618   784.75997035   885.86679041  1000.        ]


In [98]:
coef_dict= pd.Series()
for l1_penalty in l1_penalty_values:
    model_non = linear_model.Lasso(alpha = l1_penalty, normalize=True)
    model_non.fit(training[all_features], training['price'])
    coef_dict[l1_penalty]=model_non.coef_
#    print (pd.Series(model_non.coef_,index=all_features))

In [100]:
coef_dict[100.000000]

array([ -7.54316472e+03,  -0.00000000e+00,   2.75926409e+04,
         1.56490294e+02,  -0.00000000e+00,  -0.00000000e+00,
        -1.06791583e+02,   0.00000000e+00,   2.39161779e+03,
         4.01681780e+05,   4.21061458e+04,   4.81126768e+03,
         1.17837657e+05,   0.00000000e+00,   0.00000000e+00,
        -2.92041376e+03,   0.00000000e+00])