In [22]:
#import all necessary libraries, 
#here log, sqrt is used to create the new features

import numpy as np
import pandas as pd
from math import log, sqrt
import matplotlib.pyplot as plt
import sklearn.linear_model as skllm
%matplotlib inline

In [23]:
#import the original sales, train, test data sets

sales = pd.read_csv("kc_house_data.csv")
train_sales = pd.read_csv("kc_house_train_data.csv")
test_sales = pd.read_csv("kc_house_test_data.csv")

In [24]:
#import the train, valid, test segmented data sets

wk3_train_set = pd.read_csv("wk3_kc_house_train_data.csv")
wk3_valid_set = pd.read_csv("wk3_kc_house_valid_data.csv")
wk3_test_set = pd.read_csv("wk3_kc_house_test_data.csv")

In [25]:
def predict_outcome(features_matrix,weights):
  predictions = np.dot(features_matrix, weights)
  return (predictions)

create the new features

- squaring the number of bedrooms and floor space increases the distance between the too many and too few

- taking sqrt will in contrast decrease the separation

In [26]:
sales['sqft_living_sqrt'] = sales['sqft_living'].apply(sqrt)
sales['sqft_lot_sqrt'] = sales['sqft_lot'].apply(sqrt)
sales['bedrooms_square'] = sales['bedrooms']**2
sales['floors_square'] = sales['floors']**2

In [27]:
#build an initial list of all initial features of interest to select from

features_of_interest = ['bedrooms', 'bedrooms_square',
            'bathrooms',
            'sqft_living', 'sqft_living_sqrt',
            'sqft_lot', 'sqft_lot_sqrt',
            'floors', 'floors_square',
            'waterfront', 'view', 'condition', 'grade',
            'sqft_above',
            'sqft_basement',
            'yr_built', 'yr_renovated']

#using linear_model.Lasso,learn weights 
#on all initial features using the sales
#set parameters, normalize = True to normalize 

model_all = skllm.Lasso(alpha=5e2, normalize=True)
model_all.fit(sales[features_of_interest], sales['price'])


Lasso(alpha=500.0, normalize=True)

In [28]:
#retrieve all weights
model_all.coef_

array([    0.        ,     0.        ,     0.        ,   134.43931396,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        , 24750.00458561,     0.        ,
       61749.10309071,     0.        ,     0.        ,    -0.        ,
           0.        ])

Quiz Question: Which of the following features have been chosen by LASSO, i.e. which features were assigned nonzero weights? (Choose all that apply)


In [29]:
#print out only the weights =/= 0 along with the corresponding feature name and index
for i in range(len(model_all.coef_)):
    if model_all.coef_[i] != 0:
        print('index:', i)
        print('feature selected:', features_of_interest[i])
        print(model_all.coef_[i])

index: 3
feature selected: sqft_living
134.43931395541438
index: 10
feature selected: view
24750.004585609488
index: 12
feature selected: grade
61749.10309070811


In [38]:
#repeat the new features creation 
#with test, train and validation sets

wk3_train_set['sqft_living_sqrt'] = wk3_train_set['sqft_living'].apply(sqrt)
wk3_train_set['sqft_lot_sqrt'] = wk3_train_set['sqft_lot'].apply(sqrt)
wk3_train_set['bedrooms_square'] = wk3_train_set['bedrooms']**2
wk3_train_set['floors_square'] = wk3_train_set['floors']**2

wk3_valid_set['sqft_living_sqrt'] = wk3_valid_set['sqft_living'].apply(sqrt)
wk3_valid_set['sqft_lot_sqrt'] = wk3_valid_set['sqft_lot'].apply(sqrt)
wk3_valid_set['bedrooms_square'] = wk3_valid_set['bedrooms']**2
wk3_valid_set['floors_square'] = wk3_valid_set['floors']**2

wk3_test_set['sqft_living_sqrt'] = wk3_test_set['sqft_living'].apply(sqrt)
wk3_test_set['sqft_lot_sqrt'] = wk3_test_set['sqft_lot'].apply(sqrt)
wk3_test_set['bedrooms_square'] = wk3_test_set['bedrooms']**2
wk3_test_set['floors_square'] = wk3_test_set['floors']**2

In [39]:
#write a for loop that takes in different values of l1_penalty
#and runs a lasso regression model each time

RSS = []
l1_penalty_iter = []
for l1_penalty in np.logspace(1,7, num=13): 
    l1_penalty_iter.append(l1_penalty)
    model_train = skllm.Lasso(alpha = l1_penalty, normalize=True)
    model_train.fit(wk3_train_set[features_of_interest], wk3_train_set['price'])
    weights = model_train.coef_
    #print('model for l1_penalty: ', l1_penalty)
    #print(weights)
    predictions = model_train.predict(wk3_valid_set[features_of_interest])
    residuals = wk3_valid_set['price'] - predictions
    rss = (residuals**2).sum()
    RSS.append(rss)
    #print(rss)
    #print('           ')
    l1_penalty_RSS = dict(zip(l1_penalty_iter, RSS))

print('l1_penalty with lowest RSS:', [f for f,ftype in l1_penalty_RSS.items() if ftype == min(RSS)])
print(min(RSS))

l1_penalty with lowest RSS: [10.0]
398213327300134.94


here we train the model on the train set to extract the learned weights
then estimate predicted values using test data and finally calculate the RSS 

In [32]:
model_train = skllm.Lasso(alpha = 10.0, normalize=True)
model_train.fit(wk3_train_set[features_of_interest], wk3_train_set['price'])
print('number of non-zero coefficients:',np.count_nonzero(model_train.coef_) + np.count_nonzero(model_train.intercept_))

predictions = model_train.predict(wk3_test_set[features_of_interest])
residuals = wk3_test_set['price'] - predictions
rss_test = (residuals**2).sum()
print('RSS:', rss_test)

number of non-zero coefficients: 15
RSS: 98467402552698.81


In [33]:
#for every value of the l1_penalty, 
#how many non-zero coefficients are there?

l1_penalty_num_nonzeros = []
l1_penalty_list = []
for l1_penalty in np.logspace(1,4, num=20):
    l1_penalty_list.append(l1_penalty)
    model_train = skllm.Lasso(alpha = l1_penalty, normalize=True)
    model_train.fit(wk3_train_set[features_of_interest], wk3_train_set['price'])
    #print(np.count_nonzero(model_train.coef_) + np.count_nonzero(model_train.intercept_))
    l1_penalty_num_nonzeros.append(np.count_nonzero(model_train.coef_) + np.count_nonzero(model_train.intercept_))

max_nonzeros = 7
#print(l1_penalty_list)
#print('            ')



after specifying the max no. of non-zero coefficients we will tolerate

we look for the boundaries (upper and lower bounds) of the l2 penalty, 
where the upper bound is the largest penalty value for which the num of non zeroes stays under the tolerance

-any higher than the upper bound and the num of non zeroes will definitely decrease

and where the lower bound is the smallest penalty for which the num of non zeroesstays above the tolerance 

-any lower than the lower bound and the num of non zeroes will  definitely increase 


In [34]:

l1_penalty_min = []
l1_penalty_max = []

for x in l1_penalty_num_nonzeros:
    if (x > max_nonzeros):
        l1_penalty_max.append(l1_penalty_list[x])
    else: 
        l1_penalty_min.append(l1_penalty_list[x]) 
upper_bound = min(l1_penalty_max)
lower_bound = max(l1_penalty_min)
print('Upper Bound is {} and Lower Bound is {} '.format(upper_bound, lower_bound))

Upper Bound is 379.26901907322497 and Lower Bound is 127.42749857031335 


using the range of l1_penalties found we will estimate which l1_penalty value has the lowest RSS on test data

In [35]:
RSS = []
l1_penalty_narrow_range = []

for l1_penalty in np.linspace(upper_bound, lower_bound, num=20):
    l1_penalty_narrow_range.append(l1_penalty)
    model_train = skllm.Lasso(alpha = l1_penalty, normalize=True)
    model_train.fit(wk3_train_set[features_of_interest], wk3_train_set['price'])
    weights = model_train.coef_
    #print('model for l1_penalty: ', l1_penalty)
    #print(weights)
    num_nonzeros = np.count_nonzero(model_train.coef_) + np.count_nonzero(model_train.intercept_)
    if num_nonzeros == 7: 
        predictions = model_train.predict(wk3_valid_set[features_of_interest])
        residuals = wk3_valid_set['price'] - predictions
        rss = (residuals**2).sum()
        RSS.append(rss)
        #print('           ')
    print('model for l1_penalty: ', l1_penalty)
    print('number of non-zero coefficients:', num_nonzeros)
    print('RSS: ', rss)
    print('      ')

model for l1_penalty:  379.26901907322497
number of non-zero coefficients: 6
RSS:  1222506859427163.0
      
model for l1_penalty:  366.01420220465064
number of non-zero coefficients: 6
RSS:  1222506859427163.0
      
model for l1_penalty:  352.7593853360764
number of non-zero coefficients: 6
RSS:  1222506859427163.0
      
model for l1_penalty:  339.50456846750205
number of non-zero coefficients: 6
RSS:  1222506859427163.0
      
model for l1_penalty:  326.2497515989278
number of non-zero coefficients: 6
RSS:  1222506859427163.0
      
model for l1_penalty:  312.99493473035346
number of non-zero coefficients: 6
RSS:  1222506859427163.0
      
model for l1_penalty:  299.7401178617792
number of non-zero coefficients: 6
RSS:  1222506859427163.0
      
model for l1_penalty:  286.4853009932049
number of non-zero coefficients: 6
RSS:  1222506859427163.0
      
model for l1_penalty:  273.2304841246306
number of non-zero coefficients: 6
RSS:  1222506859427163.0
      
model for l1_penalty:  2

fit the model once again using the l1_penalty for which the RSS is minimized and where num of non-zero coefficients = max_nonzeroes

In [37]:
model_train = skllm.Lasso(alpha = 193.7015829131848, normalize=True)
model_train.fit(wk3_train_set[features_of_interest], wk3_train_set['price'])

for i in range(len(model_train.coef_)):
    if(model_train.coef_[i] != 0):
        print(features_of_interest[i])

bathrooms
sqft_living
waterfront
view
grade
yr_built
