# KC housing price prediction

## Lasso Regression

In [1]:
import pandas as pd

dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)


### Preprocessing

In [2]:
from math import log, sqrt
sales['sqft_living_sqrt'] = sales['sqft_living'].apply(sqrt)
sales['sqft_lot_sqrt'] = sales['sqft_lot'].apply(sqrt)
sales['bedrooms_square'] = sales['bedrooms']*sales['bedrooms']
sales['floors_square'] = sales['floors']*sales['floors']

### Training lasso model

In [3]:
from sklearn import linear_model

all_features = ['bedrooms', 'bedrooms_square',
            'bathrooms',
            'sqft_living', 'sqft_living_sqrt',
            'sqft_lot', 'sqft_lot_sqrt',
            'floors', 'floors_square',
            'waterfront', 'view', 'condition', 'grade',
            'sqft_above',
            'sqft_basement',
            'yr_built', 'yr_renovated']

model_all = linear_model.Lasso(alpha=5e2, normalize=True) # set parameters
model_all.fit(sales[all_features], sales['price']) # learn weights

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

#### Features selected by Lasso

In [9]:
import numpy as np
print(np.array(all_features)[model_all.coef_ !=0])

['sqft_living' 'view' 'grade']


### Load train, test, val set

In [14]:
testing = pd.read_csv('wk3_kc_house_test_data.csv', dtype=dtype_dict)
training = pd.read_csv('wk3_kc_house_train_data.csv', dtype=dtype_dict)
validation = pd.read_csv('wk3_kc_house_valid_data.csv', dtype=dtype_dict)

### Preprocess

In [15]:
testing['sqft_living_sqrt'] = testing['sqft_living'].apply(sqrt)
testing['sqft_lot_sqrt'] = testing['sqft_lot'].apply(sqrt)
testing['bedrooms_square'] = testing['bedrooms']*testing['bedrooms']
testing['floors_square'] = testing['floors']*testing['floors']

training['sqft_living_sqrt'] = training['sqft_living'].apply(sqrt)
training['sqft_lot_sqrt'] = training['sqft_lot'].apply(sqrt)
training['bedrooms_square'] = training['bedrooms']*training['bedrooms']
training['floors_square'] = training['floors']*training['floors']

validation['sqft_living_sqrt'] = validation['sqft_living'].apply(sqrt)
validation['sqft_lot_sqrt'] = validation['sqft_lot'].apply(sqrt)
validation['bedrooms_square'] = validation['bedrooms']*validation['bedrooms']
validation['floors_square'] = validation['floors']*validation['floors']

##### best value for the l1_penalty

In [16]:
penalties = np.logspace(1, 7, num=13)
RSS = []

for l1 in penalties:
    model = linear_model.Lasso(alpha=l1, normalize=True)
    model.fit(training[all_features], training['price'])
    rss = np.sum((validation['price'] - model.predict(validation[all_features]))**2)
    RSS.append(rss)

best_l1 = penalties[np.argmin(RSS)]
best_l1

10.0

#### RSS on TEST data 

In [19]:
rss = np.sum((testing['price'] - model.predict(testing[all_features]))**2)
rss

284718925209874.0

### Using the best L1 penalty, how many nonzero weights do you have? Count the number of nonzero coefficients first, and add 1 if the intercept is also nonzero.

In [20]:
model = linear_model.Lasso(alpha=best_l1, normalize=True)
model.fit(training[all_features], training['price'])
np.count_nonzero(np.append(model.intercept_, model.coef_))

15

In [22]:
max_nonzeros = 7

l1range = np.logspace(1, 4, num=20)
results = []

for l1 in l1range:
    model = linear_model.Lasso(alpha=l1, normalize=True)
    model.fit(training[all_features], training['price'])
    results.append(np.count_nonzero(np.append(model.intercept_, model.coef_)))

In [23]:
results

[15, 15, 15, 15, 13, 12, 11, 10, 7, 6, 6, 6, 5, 3, 3, 2, 1, 1, 1, 1]

In [24]:
l1_penalty_min = 127.42749857031335
l1_penalty_max = 263.6650898730358

In [25]:
finalrange = np.linspace(l1_penalty_min,l1_penalty_max,20)


model_RSS=[]

for l1_penalty in finalrange:
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(training[all_features], training['price'])
    results.append(np.count_nonzero(np.append(model.intercept_, model.coef_)))
    rss = np.sum((validation['price'] - model.predict(validation[all_features]))**2)
    nonzeros = np.count_nonzero(np.append(model.intercept_, model.coef_))
    model_RSS.append((rss, nonzeros, model))

In [26]:
minRSS = 100*model_RSS[0][0]
minRSS

4.353746771026806e+16

In [27]:
i=0

idx=0
for rss, nonzeros, model in model_RSS:
    
    if (nonzeros == max_nonzeros) and rss < minRSS:
        minRSS = rss
        idx = i
    i=i+1
print(finalrange[idx])  # 156.10909673930755

156.10909673930755


In [28]:
np.array(all_features)[model_RSS[4][2].coef_ != 0]

array(['bathrooms', 'sqft_living', 'waterfront', 'view', 'grade',
       'yr_built'], dtype='<U16')