In [1]:
import math
import random
import numpy as np
import pandas as pd
from sklearn import  linear_model
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.linear_model import Ridge
from sklearn.cross_validation import KFold
from sklearn.linear_model import RidgeCV

In [2]:
# Load the full house-sales dataset with pandas, pinning each column's dtype
# so that identifiers, zip codes and dates are read as strings.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)

In [3]:
# Derive four extra features from the raw inputs.
from math import log, sqrt

# Square roots shrink the gap between large and small values: an owner may
# not be exactly twice as happy with a house that is twice as big.
for raw_col, new_col in (('sqft_living', 'sqft_living_sqrt'),
                         ('sqft_lot', 'sqft_lot_sqrt')):
    sales[new_col] = sales[raw_col].apply(sqrt)

# Squares widen the gap between few and many (1^2 = 1 but 4^2 = 16), so these
# features mostly affect houses with many bedrooms / floors.
for raw_col, new_col in (('bedrooms', 'bedrooms_square'),
                         ('floors', 'floors_square')):
    sales[new_col] = sales[raw_col] * sales[raw_col]

In [4]:
# Fit a Lasso model on the entire house dataset with an L1 penalty (alpha) of 5e2.
# The order of this list matters: coefficient positions are mapped back to
# feature names through it in later cells.
all_features = [
    'bedrooms',
    'bedrooms_square',
    'bathrooms',
    'sqft_living',
    'sqft_living_sqrt',
    'sqft_lot',
    'sqft_lot_sqrt',
    'floors',
    'floors_square',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
]

# Configure the estimator, then learn the weights; the fitted estimator is the
# cell's displayed value.
model_all = linear_model.Lasso(normalize=True, alpha=5e2)
model_all.fit(X=sales[all_features], y=sales['price'])

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [5]:
#Create a function that prints the polynomial coefficients in a pretty way :)
def print_coefficients(model):
    """Print a fitted linear model's weights laid out as a polynomial.

    Parameters
    ----------
    model : object
        Anything exposing `intercept_` (a scalar) and `coef_` (an iterable of
        weights).

    Notes
    -----
    The "degree" in the header is simply ``len(model.coef_)``, i.e. the number
    of weights. The power printed next to each weight is that weight's
    1-based position in `coef_`; the constant term is the intercept.
    """
    deg = len(model.coef_)

    # Intercept first, then the weights in their original order.
    w = [model.intercept_] + list(model.coef_)

    # np.poly1d expects the highest-order coefficient first, hence the reversal.
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Learned polynomial for degree ' + str(deg) + ':')
    print(np.poly1d(w[::-1]))

In [46]:
# Count the nonzero weights of the full-data model, intercept included.
w = [model_all.intercept_] + list(model_all.coef_)

alpha = 5e2  # the penalty model_all was fit with; not used below
non_zeros_number = int(np.count_nonzero(w))

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('number of nonzeros = %r' % non_zeros_number)
print_coefficients(model_all)
print('\n')


number of nonzeros = 4
Learned polynomial for degree 17:
           13             11         4
6.175e+04 x  + 2.475e+04 x  + 134.4 x - 2.181e+05




In [47]:
# To find a good L1 penalty (alpha), several values are explored against a
# validation set, using a three-way split into train, validation and test sets.
testing, training, validation = (
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in (
        'wk3_kc_house_test_data.csv',
        'wk3_kc_house_train_data.csv',
        'wk3_kc_house_valid_data.csv',
    )
)

In [48]:
# Create the same four engineered features on every split.
def add_engineered_features(frame):
    """Add the square-root and squared feature columns to `frame` in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'sqft_living', 'sqft_lot', 'bedrooms' and
        'floors'.

    Returns
    -------
    pandas.DataFrame
        The same `frame` object, now carrying 'sqft_living_sqrt',
        'sqft_lot_sqrt', 'bedrooms_square' and 'floors_square'.
    """
    frame['sqft_living_sqrt'] = frame['sqft_living'].apply(sqrt)
    frame['sqft_lot_sqrt'] = frame['sqft_lot'].apply(sqrt)
    frame['bedrooms_square'] = frame['bedrooms'] * frame['bedrooms']
    frame['floors_square'] = frame['floors'] * frame['floors']
    return frame


add_engineered_features(testing)
add_engineered_features(training)
add_engineered_features(validation)

In [184]:
def RSS_calculation(features, data, output, model=None):
    """Return the residual sum of squares of a model's predictions.

    Parameters
    ----------
    features : list of str
        Columns of `data` passed to the model's `predict`.
    data : pandas.DataFrame
        Frame holding the feature columns.
    output : array-like
        Observed target values, aligned with the rows of `data`.
    model : object, optional
        Fitted estimator exposing `predict`. When omitted, the module-level
        variable named `model` is used, which is what existing three-argument
        callers rely on.

    Returns
    -------
    The sum of squared differences between predictions and `output`.

    Raises
    ------
    NameError
        If `model` is omitted and no module-level `model` exists.
    """
    if model is None:
        # Backward-compatible fallback to the notebook-global estimator.
        try:
            model = globals()['model']
        except KeyError:
            raise NameError("name 'model' is not defined")

    prediction = model.predict(data[features])
    residuals = prediction - output
    return (residuals ** 2).sum()



In [117]:
# Sweep a coarse, log-spaced grid of L1 penalties: fit on the training split,
# score by RSS on the validation split, and keep the penalty with the lowest RSS.
lowest_RSS = float('inf')  # explicit "nothing seen yet" marker (0 is a legitimate RSS)
best_l1_penalty = None
l1_penalty_values = np.logspace(1, 7, num=13)
for l1_penalty in l1_penalty_values:
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model = model.fit(training[all_features], training['price'])
    # Called without a model argument: RSS_calculation scores the
    # module-level `model` fitted just above.
    RSS = RSS_calculation(all_features, validation, validation['price'])
    if RSS < lowest_RSS:
        lowest_RSS = RSS
        best_l1_penalty = l1_penalty
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("l1_penalty: " + str(l1_penalty) + ", RSS: " + str(RSS))
print("l1_penalty: " + str(best_l1_penalty) + ", lowest RSS: " + str(lowest_RSS))

l1_penalty: 10.0, RSS: 3.982133273e+14
l1_penalty: 31.6227766017, RSS: 3.99041900253e+14
l1_penalty: 100.0, RSS: 4.29791604073e+14
l1_penalty: 316.227766017, RSS: 4.63739831045e+14
l1_penalty: 1000.0, RSS: 6.45898733634e+14
l1_penalty: 3162.27766017, RSS: 1.22250685943e+15
l1_penalty: 10000.0, RSS: 1.22250685943e+15
l1_penalty: 31622.7766017, RSS: 1.22250685943e+15
l1_penalty: 100000.0, RSS: 1.22250685943e+15
l1_penalty: 316227.766017, RSS: 1.22250685943e+15
l1_penalty: 1000000.0, RSS: 1.22250685943e+15
l1_penalty: 3162277.66017, RSS: 1.22250685943e+15
l1_penalty: 10000000.0, RSS: 1.22250685943e+15
l1_penalty: 10.0, lowest RSS: 3.982133273e+14


In [90]:
# Refit on the training split at an L1 penalty of 10.0 and inspect the weights.
l1_penalty = 10.0
# Use the variable rather than repeating the literal, so the two cannot drift apart.
model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
model = model.fit(training[all_features], training['price'])
w = [model.intercept_] + list(model.coef_)

alpha = l1_penalty  # not used below
non_zeros_number = int(np.count_nonzero(w))  # intercept included

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('number of nonzeros = %r' % non_zeros_number)
print_coefficients(model)
print('\n')

number of nonzeros = 15
Learned polynomial for degree 17:
       17        16             13           12             11
10.06 x  - 3294 x  + 1.287e+05 x  + 2.5e+04 x  + 3.804e+04 x 
              10        9     8         7          6             5
 + 6.195e+05 x  + 5014 x - 0 x - 701.2 x + 0.7856 x - 4.441e+04 x
          4             3         2
 + 617.9 x + 5.084e+04 x + 373.2 x - 1.614e+04 x + 6.63e+06




In [190]:
# Scan a log-spaced grid of L1 penalties and track how the number of nonzero
# weights (intercept included) moves relative to the starting threshold of 7.
#
# NOTE on naming: `l1_penalty_max` is the penalty at which the nonzero count
# last reached a new high, and `l1_penalty_min` the penalty at which it last
# reached a new low. They are NOT the largest / smallest penalty values.
l1_penalty_min = 0
l1_penalty_max = 0
max_nonzeros = 7
min_nonzeros = 7
max_nonzeros_features = []
l1_penalty_values = np.logspace(1, 4, num=20)
for l1_penalty in l1_penalty_values:
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model = model.fit(training[all_features], training['price'])
    w = [model.intercept_] + list(model.coef_)
    non_zeros_number = int(np.count_nonzero(w))

    if non_zeros_number > max_nonzeros:
        l1_penalty_max = l1_penalty
        max_nonzeros = non_zeros_number
        # Rebuild the list instead of extending it, so a later, larger model
        # replaces the earlier selection rather than piling duplicates on top.
        max_nonzeros_features = [all_features[idx]
                                 for idx in np.nonzero(model.coef_)[0]]

    if non_zeros_number < min_nonzeros:
        l1_penalty_min = l1_penalty
        min_nonzeros = non_zeros_number

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("l1_penalty_max = %.0f" % l1_penalty_max)
print('\n')
print("l1_penalty_min = %.0f" % l1_penalty_min)
print("max_nonzeros = %.0f" % max_nonzeros)
print("max_nonzeros_features = %s" % max_nonzeros_features)

l1_penalty_max = 10


l1_penalty_min = 3360
max_nonzeros = 15
max_nonzeros_features = ['bedrooms', 'bedrooms_square', 'bathrooms', 'sqft_living', 'sqft_living_sqrt', 'sqft_lot', 'sqft_lot_sqrt', 'floors_square', 'waterfront', 'view', 'condition', 'grade', 'yr_built', 'yr_renovated']


In [186]:
# Settings for the fine-grained penalty search.
max_nonzeros = 15   # largest nonzero count seen in the coarse scan
nonzeros = 7        # target number of nonzero weights, intercept included
lowest_RSS = 0
# Features that kept a nonzero weight in the densest model of the coarse scan.
max_nonzeros_features = [
    'bedrooms', 'bedrooms_square',
    'bathrooms',
    'sqft_living', 'sqft_living_sqrt',
    'sqft_lot', 'sqft_lot_sqrt',
    'floors_square',
    'waterfront', 'view', 'condition', 'grade',
    'yr_built', 'yr_renovated',
]
def find_best_penalty(l1_penalty_min, l1_penalty_max):
    """Search integer L1 penalties for the best model with exactly `nonzeros` weights.

    For every integer penalty between the two bounds (inclusive, after each
    bound is truncated with int()), a Lasso model is fit on
    `training[max_nonzeros_features]`. Whenever its nonzero count (intercept
    included) equals the module-level `nonzeros`, the features that kept a
    nonzero coefficient are refit at the same penalty and scored by RSS on
    `validation`.

    Parameters
    ----------
    l1_penalty_min, l1_penalty_max : number
        Bounds of the search range. They may be passed in either order; the
        smaller one is always used as the start of the range.

    Returns
    -------
    int or None
        The penalty with the lowest validation RSS among the models that hit
        the target sparsity, or None when no penalty in the range does.
    """
    low, high = sorted((int(l1_penalty_min), int(l1_penalty_max)))
    lowest_RSS = None
    best_l1_penalty_new = None

    for l1_penalty in range(low, high + 1):
        model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
        model = model.fit(training[max_nonzeros_features], training['price'])

        non_zeros_number = (np.count_nonzero(model.intercept_)
                            + np.count_nonzero(model.coef_))
        if non_zeros_number != nonzeros:
            continue

        # coef_ positions refer to the columns the model was fit on, so map
        # them through max_nonzeros_features (not all_features). The list is
        # rebuilt for every penalty so selections never accumulate.
        selected_features = [max_nonzeros_features[idx]
                             for idx in np.nonzero(model.coef_)[0]]
        if not selected_features:
            # Only the intercept is nonzero: there is nothing to refit on.
            continue

        refit = linear_model.Lasso(alpha=l1_penalty, normalize=True)
        refit = refit.fit(training[selected_features], training['price'])
        prediction = refit.predict(validation[selected_features])
        RSS = ((prediction - validation['price']) ** 2).sum()

        if lowest_RSS is None or RSS < lowest_RSS:
            lowest_RSS = RSS
            best_l1_penalty_new = l1_penalty

    return best_l1_penalty_new
# Bounds are the values the coarse scan printed as l1_penalty_min (3360) and
# l1_penalty_max (10). Single-argument print(...) works on Python 2 and 3.
print(find_best_penalty(3360, 10))

154


In [188]:
# Refit on the training split, with all features, at the penalty returned by
# the fine-grained search, then inspect the resulting weights.
l1_penalty = 154.0
model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
model = model.fit(training[all_features], training['price'])
w = [model.intercept_] + list(model.coef_)

non_zeros_number = int(np.count_nonzero(w))  # intercept included

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('number of nonzeros = %r' % non_zeros_number)
print_coefficients(model)
print('\n')

number of nonzeros = 7
Learned polynomial for degree 17:
       16             13           11             10     7     6
-2628 x  + 1.165e+05 x  + 4.2e+04 x  + 5.081e+05 x  - 0 x - 0 x
          4             3     2
 + 163.2 x + 1.105e+04 x - 0 x - 0 x + 4.45e+06


