In [109]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('seaborn')
from sklearn.linear_model import Lasso

In [110]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

house_data = pd.read_csv('kc_house_data.csv',dtype=dtype_dict)
print(house_data.shape)

(21613, 21)


In [111]:
house_data['sqft_living_sqrt'] = np.sqrt(house_data['sqft_living'])
house_data['sqft_lot_sqrt'] = np.sqrt(house_data['sqft_lot'])
house_data['bedrooms_square'] = np.square(house_data['bedrooms'])
house_data['floors_square'] = np.square(house_data['floors'])
house_data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,sqft_living_sqrt,sqft_lot_sqrt,bedrooms_square,floors_square
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1.0,0,0,...,0,98178,47.5112,-122.257,1340.0,5650.0,34.351128,75.166482,9.0,1.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2.0,0,0,...,1991,98125,47.721,-122.319,1690.0,7639.0,50.695167,85.099941,9.0,4.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1.0,0,0,...,0,98028,47.7379,-122.233,2720.0,8062.0,27.748874,100.0,4.0,1.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1.0,0,0,...,0,98136,47.5208,-122.393,1360.0,5000.0,44.271887,70.710678,16.0,1.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1.0,0,0,...,0,98074,47.6168,-122.045,1800.0,7503.0,40.987803,89.88882,9.0,1.0


In [112]:
all_features = ['bedrooms', 'bedrooms_square',
            'bathrooms',
            'sqft_living', 'sqft_living_sqrt',
            'sqft_lot', 'sqft_lot_sqrt',
            'floors', 'floors_square',
            'waterfront', 'view', 'condition', 'grade',
            'sqft_above',
            'sqft_basement',
            'yr_built', 'yr_renovated']

# Applying Lasso Regression

In [113]:
model = Lasso(alpha=5e2,normalize = True)

In [114]:
X = np.array(house_data[all_features])
Y = np.array(house_data['price']).reshape(-1,1)
print(X.shape,Y.shape)
model.fit(X,Y)

(21613, 17) (21613, 1)


Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=True, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [115]:
model.coef_

array([    0.        ,     0.        ,     0.        ,   134.43931396,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        , 24750.00458561,     0.        ,
       61749.10309071,     0.        ,     0.        ,    -0.        ,
           0.        ])

# Choosing good L1 penalty(alpha) value

In [116]:
testing = pd.read_csv('wk3_kc_house_test_data.csv', dtype=dtype_dict)
training = pd.read_csv('wk3_kc_house_train_data.csv', dtype=dtype_dict)
validation = pd.read_csv('wk3_kc_house_valid_data.csv', dtype=dtype_dict)

In [117]:
testing['sqft_living_sqrt'] = np.sqrt(testing['sqft_living'])
testing['sqft_lot_sqrt'] = np.sqrt(testing['sqft_lot'])
testing['bedrooms_square'] = np.square(testing['bedrooms'])
testing['floors_square'] = np.square(testing['floors'])

training['sqft_living_sqrt'] = np.sqrt(training['sqft_living'])
training['sqft_lot_sqrt'] = np.sqrt(training['sqft_lot'])
training['bedrooms_square'] = np.square(training['bedrooms'])
training['floors_square'] = np.square(training['floors'])

validation['sqft_living_sqrt'] = np.sqrt(validation['sqft_living'])
validation['sqft_lot_sqrt'] = np.sqrt(validation['sqft_lot'])
validation['bedrooms_square'] = np.square(validation['bedrooms'])
validation['floors_square'] = np.square(validation['floors'])

In [87]:
l1_penalty = np.logspace(1,7,num=13)

In [88]:
l1_penalty

array([1.00000000e+01, 3.16227766e+01, 1.00000000e+02, 3.16227766e+02,
       1.00000000e+03, 3.16227766e+03, 1.00000000e+04, 3.16227766e+04,
       1.00000000e+05, 3.16227766e+05, 1.00000000e+06, 3.16227766e+06,
       1.00000000e+07])

In [138]:
rss = {}
non_zeros = {}
for i in np.logspace(1,7,num=13):
    model = Lasso(alpha=i, normalize=True)
    X_train = np.array(training[all_features])
    Y_train = np.array(training['price']).reshape(-1,1)
    X_val = np.array(validation[all_features])
    Y_val = np.array(validation['price']).reshape(-1,1)
    
    model.fit(X_train,Y_train)
    non_zeros[str(i)] = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
    rss[str(i)] = np.sum(np.square(( model.predict(X_val) - Y_val  )))

In [139]:
rss

{'10.0': 2.066303599497208e+19,
 '31.622776601683793': 2.0106031260560675e+19,
 '100.0': 1.9298508281926636e+19,
 '316.22776601683796': 1.7639721942371875e+19,
 '1000.0': 1.4481232585702595e+19,
 '3162.2776601683795': 1.1778853590580718e+19,
 '10000.0': 1.1778853590580718e+19,
 '31622.776601683792': 1.1778853590580718e+19,
 '100000.0': 1.1778853590580718e+19,
 '316227.7660168379': 1.1778853590580718e+19,
 '1000000.0': 1.1778853590580718e+19,
 '3162277.6601683795': 1.1778853590580718e+19,
 '10000000.0': 1.1778853590580718e+19}

In [140]:
non_zeros

{'10.0': 15,
 '31.622776601683793': 15,
 '100.0': 11,
 '316.22776601683796': 6,
 '1000.0': 4,
 '3162.2776601683795': 1,
 '10000.0': 1,
 '31622.776601683792': 1,
 '100000.0': 1,
 '316227.7660168379': 1,
 '1000000.0': 1,
 '3162277.6601683795': 1,
 '10000000.0': 1}

In [142]:
model.predict(X_val)

array([542734.9516443, 542734.9516443, 542734.9516443, ...,
       542734.9516443, 542734.9516443, 542734.9516443])

In [144]:
Y_val

array([[221900.],
       [538000.],
       [180000.],
       ...,
       [360000.],
       [400000.],
       [400000.]])

In [141]:
np.min(list(rss.values()))

1.1778853590580718e+19

# Choosing model with limited no of features

In [33]:
max_nonzero = 7

In [45]:
non_zeros = {}
for i in np.logspace(1, 4, num=20):
    model = Lasso(alpha=i, normalize=True)
    X_train = np.array(training[all_features])
    Y_train = np.array(training['price']).reshape(-1,1)

    model.fit(X_train,Y_train)
    count = 0
    if(model.intercept_ != 0):
        count += 1
    count += np.count_nonzero(model.coef_)
    non_zeros[str(i)] = count
    

In [46]:
non_zeros

{'10.0': 15,
 '14.38449888287663': 15,
 '20.6913808111479': 15,
 '29.76351441631318': 15,
 '42.81332398719393': 13,
 '61.58482110660264': 12,
 '88.58667904100822': 11,
 '127.42749857031335': 10,
 '183.29807108324357': 7,
 '263.6650898730358': 6,
 '379.26901907322497': 6,
 '545.5594781168514': 6,
 '784.7599703514607': 5,
 '1128.8378916846884': 3,
 '1623.776739188721': 3,
 '2335.7214690901214': 2,
 '3359.818286283781': 1,
 '4832.930238571752': 1,
 '6951.927961775606': 1,
 '10000.0': 1}

In [47]:
l1_penalty_min = 127.42749857031335

In [48]:
l1_penalty_max = 263.6650898730358

In [54]:

rss = {}
non_zeros = {}
for i in np.linspace(l1_penalty_min,l1_penalty_max,20):
    model = Lasso(alpha=i, normalize=True)
    X_train = np.array(training[all_features])
    Y_train = np.array(training['price']).reshape(-1,1)
    X_val = np.array(validation[all_features])
    Y_val = np.array(validation['price']).reshape(-1,1)
    
    model.fit(X_train,Y_train)
    count = 0
    if(model.intercept_ != 0):
        count += 1
    count += np.count_nonzero(model.coef_)
    non_zeros[str(i)] = count
    rss[str(i)] = np.sum(np.square(Y_val - model.predict(X_val)))
    
    

In [55]:
non_zeros

{'127.42749857031335': 10,
 '134.5978981125619': 10,
 '141.76829765481045': 8,
 '148.938697197059': 8,
 '156.10909673930755': 7,
 '163.2794962815561': 7,
 '170.44989582380464': 7,
 '177.6202953660532': 7,
 '184.79069490830176': 7,
 '191.96109445055032': 7,
 '199.13149399279888': 7,
 '206.3018935350474': 6,
 '213.47229307729594': 6,
 '220.6426926195445': 6,
 '227.81309216179307': 6,
 '234.98349170404163': 6,
 '242.1538912462902': 6,
 '249.32429078853872': 6,
 '256.49469033078725': 6,
 '263.6650898730358': 6}

In [56]:
rss

{'127.42749857031335': 1.9062915849580188e+19,
 '134.5978981125619': 1.9003350142621778e+19,
 '141.76829765481045': 1.894402443547697e+19,
 '148.938697197059': 1.8884840610155766e+19,
 '156.10909673930755': 1.8826306254270337e+19,
 '163.2794962815561': 1.8768284170526065e+19,
 '170.44989582380464': 1.8710769218261656e+19,
 '177.6202953660532': 1.865374149010483e+19,
 '184.79069490830176': 1.859720014731415e+19,
 '191.96109445055032': 1.8541125856423203e+19,
 '199.13149399279888': 1.8485564115784245e+19,
 '206.3018935350474': 1.8430705449263784e+19,
 '213.47229307729594': 1.8376255341417996e+19,
 '220.6426926195445': 1.8322202053106878e+19,
 '227.81309216179307': 1.826854621244579e+19,
 '234.98349170404163': 1.8215287819434514e+19,
 '242.1538912462902': 1.816242739384397e+19,
 '249.32429078853872': 1.8110072709701638e+19,
 '256.49469033078725': 1.8058006604087265e+19,
 '263.6650898730358': 1.8006337459331027e+19}

In [58]:
np.min(list(rss.values())[4:11])

1.8485564115784245e+19

In [59]:
model = Lasso(alpha=199.13149399279888, normalize=True)
X_train = np.array(training[all_features])
Y_train = np.array(training['price']).reshape(-1,1)

model.fit(X_train,Y_train)
print(model.coef_ ,model.intercept_)

[-0.00000000e+00 -0.00000000e+00  1.49058857e+03  1.66278312e+02
  0.00000000e+00 -0.00000000e+00 -0.00000000e+00  0.00000000e+00
  0.00000000e+00  4.72160342e+05  4.12319841e+04  0.00000000e+00
  1.11748709e+05  0.00000000e+00  0.00000000e+00 -2.29831279e+03
  0.00000000e+00] [3851757.14100311]


In [60]:
all_features

['bedrooms',
 'bedrooms_square',
 'bathrooms',
 'sqft_living',
 'sqft_living_sqrt',
 'sqft_lot',
 'sqft_lot_sqrt',
 'floors',
 'floors_square',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated']

# Questions
3. Quiz Question: Which features have been chosen by LASSO, i.e. which features were assigned nonzero weights?
##### ANS: sqft_living, waterfront,condition
6. Quiz Question: Which was the best value for the l1_penalty, i.e. which value of l1_penalty produced the lowest RSS on VALIDATION data?
##### ANS:  3.16227766e+03
8. Quiz Question: Using the best L1 penalty, how many nonzero weights do you have? Count the number of nonzero coefficients first, and add 1 if the intercept is also nonzero.
##### ANS: 1
13. Quiz Question: What values did you find for l1_penalty_min and l1_penalty_max?
##### ANS: 127.42749857031335 , 263.6650898730358
15. Quiz Question: What value of l1_penalty in our narrow range has the lowest RSS on the VALIDATION set and has sparsity equal to ‘max_nonzeros’?
##### ANS: 199.13149399279888
16. Quiz Question: What features in this model have non-zero coefficients?
##### ANS: bathrooms', 'sqft_living', 'waterfront', 'view', 'grade', yr_built

In [61]:
np.round(263.6650898730358,0)

264.0

8