In [1]:
import graphlab
import numpy as np  # aliased so the rest of the notebook can write np.<name>

# Load the house sales SFrame from disk.
sales = graphlab.SFrame('kc_house_data.gl/')
# The 'floors' column is stored with type string in this dataset;
# cast it to int here, before it is used further down.
sales['floors'] = sales['floors'].astype(int)

def get_numpy_data(data_sframe, features, output):
    """Extract a feature matrix (with a leading constant column) and an output array.

    Parameters:
        data_sframe: table to read from (an SFrame in this notebook).
        features: list of column names to use as features.
        output: name of the column holding the target values.

    Returns:
        (feature_matrix, output_array): the result of ``to_numpy()`` on the
        selected columns (``'constant'`` first, then ``features`` in the given
        order) and on the ``output`` column.

    Side effect: a column named 'constant' filled with 1 is written into
    ``data_sframe`` itself, so the caller's table is modified.
    """
    # Adding the intercept column directly to the table lets it be selected
    # together with the real features below.
    data_sframe['constant'] = 1
    selected_columns = ['constant'] + features
    # Column selection followed by conversion to a numpy matrix.
    feature_matrix = data_sframe[selected_columns].to_numpy()
    # The target column is converted to a numpy array the same way.
    output_array = data_sframe[output].to_numpy()
    return (feature_matrix, output_array)

def predict_output(feature_matrix, weights):
    """Return the predictions np.dot(feature_matrix, weights).

    feature_matrix holds one feature per column and weights holds the
    corresponding coefficient for each column.
    """
    return np.dot(feature_matrix, weights)


def normalize_features(feature_matrix):
    """Scale every column of feature_matrix by its 2-norm.

    Returns:
        (normalized_matrix, norms): the column-scaled matrix and the vector of
        column norms that was used, so other data can be scaled consistently.
    """
    # axis=0 -> one norm per column.
    column_norms = np.linalg.norm(feature_matrix, axis=0)
    return (feature_matrix / column_norms, column_norms)



This non-commercial license of GraphLab Create for academic use is assigned to karchit@student.nitw.ac.in and will expire on October 06, 2017.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1478138902.log


In [2]:
def lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty):
    """Compute the updated value of weights[i] for one LASSO coordinate step.

    Parameters:
        i: index of the coordinate (column of feature_matrix) being updated.
        feature_matrix: feature matrix, one feature per column.
        output: target values.
        weights: current weight vector (read only here).
        l1_penalty: L1 regularization strength.

    Returns:
        The new value for weights[i]; the weights array itself is not modified.

    NOTE(review): ro_i is used directly as the new weight, with no division by
    the column's squared norm, so this presumably expects unit-norm feature
    columns (such as those returned by normalize_features) -- confirm.
    """
    feature_i = feature_matrix[:, i]
    # Residual with feature i's own contribution added back in.
    residual_without_i = output - predict_output(feature_matrix, weights) + feature_i * weights[i]
    # ro[i] = SUM[ feature_i * (output - prediction + weight[i] * feature_i) ]
    ro_i = (feature_i * residual_without_i).sum()

    # Index 0 is the intercept: it is never regularized.
    if i == 0:
        return ro_i

    # Soft-thresholding: shrink ro_i toward zero by l1_penalty/2, clamping to 0.
    threshold = l1_penalty / 2.
    if ro_i < -threshold:
        return ro_i + threshold
    if ro_i > threshold:
        return ro_i - threshold
    return 0.

In [3]:
# Sanity check on a tiny 2x2 example; expected output: 0.425558846691
import math
print(lasso_coordinate_descent_step(
    1,
    np.array([[3. / math.sqrt(13), 1. / math.sqrt(10)],
              [2. / math.sqrt(13), 3. / math.sqrt(10)]]),
    np.array([1., 1.]),
    np.array([1., 4.]),
    0.1))

0.425558846691


In [4]:
def lasso_cyclical_coordinate_descent(feature_matrix, output, initial_weights, l1_penalty, tolerance):
    """Fit LASSO weights by cyclical coordinate descent.

    Each sweep updates every coordinate in turn with
    lasso_coordinate_descent_step; sweeps repeat until a full pass in which no
    coordinate changed by more than ``tolerance``.

    Parameters:
        feature_matrix: feature matrix, one feature per column.
        output: target values.
        initial_weights: starting weight vector; it is NOT modified.
        l1_penalty: L1 regularization strength passed to each coordinate step.
        tolerance: a sweep counts as converged when every per-coordinate
            absolute change is <= tolerance.

    Returns:
        A new float numpy array holding the fitted weights.
    """
    # Work on a float copy: a plain assignment would alias the caller's array
    # and overwrite it in place, and an integer-typed array would silently
    # truncate every update written into it.
    weights = np.array(initial_weights, dtype=float)

    keep_sweeping = True
    while keep_sweeping:
        keep_sweeping = False
        for i in range(len(weights)):
            previous_weight_i = weights[i]
            weights[i] = lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty)
            # Any coordinate that moved more than the tolerance forces another sweep.
            if abs(previous_weight_i - weights[i]) > tolerance:
                keep_sweeping = True
    return weights

In [5]:
# Feature columns used by the model (get_numpy_data prepends a constant column).
all_features = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
]
my_output = 'price'

# Split the sales data into train/test with fraction .8 and a fixed seed.
train_data, test_data = sales.random_split(.8, seed=0)

# Solver settings for the first run.
l1_penalty = 1e7
tolerance = 1.0
initial_weights = np.zeros(14)  # 13 features + the constant column

# Build the training matrix and normalize its columns; `norms` is kept so the
# learned weights can be rescaled later.
feature_matrix, output = get_numpy_data(train_data, all_features, my_output)
normalized_feature_matrix, norms = normalize_features(feature_matrix)

# The test-set matrix is built further down in the notebook.

In [6]:
# Fit on the normalized training features with the settings defined above (l1_penalty = 1e7).
weights1e7 = lasso_cyclical_coordinate_descent(
    feature_matrix=normalized_feature_matrix,
    output=output,
    initial_weights=initial_weights,
    l1_penalty=l1_penalty,
    tolerance=tolerance,
)


In [8]:
# Weights learned with l1_penalty = 1e7 (on the normalized features).
print(weights1e7)

[ 24429600.60933314         0.                 0.          48389174.35227978
         0.                 0.           3317511.16271982   7329961.9848964
         0.                 0.                 0.                 0.
         0.                 0.        ]


In [11]:
# Second run: same setup, stronger penalty.
my_output = 'price'
l1_penalty = 1e8
tolerance = 1.0
initial_weights = np.zeros(14)  # fresh zero start: 13 features + the constant column

weights1e8 = lasso_cyclical_coordinate_descent(
    feature_matrix=normalized_feature_matrix,
    output=output,
    initial_weights=initial_weights,
    l1_penalty=l1_penalty,
    tolerance=tolerance,
)


In [12]:
# Weights learned with l1_penalty = 1e8 (on the normalized features).
print(weights1e8)

[ 71114625.75280938         0.                 0.                 0.
         0.                 0.                 0.                 0.
         0.                 0.                 0.                 0.
         0.                 0.        ]


In [15]:
# Third run: much weaker penalty, with a far looser convergence tolerance.
my_output = 'price'
l1_penalty = 1e4
tolerance = 5e5
initial_weights = np.zeros(14)  # fresh zero start: 13 features + the constant column


In [16]:

# Fit using the settings from the previous cell (l1_penalty = 1e4, tolerance = 5e5).
weights1e4 = lasso_cyclical_coordinate_descent(
    feature_matrix=normalized_feature_matrix,
    output=output,
    initial_weights=initial_weights,
    l1_penalty=l1_penalty,
    tolerance=tolerance,
)


In [17]:
# Weights learned with l1_penalty = 1e4 (on the normalized features).
print(weights1e4)

[ 77779073.91265225 -22884012.25023361  15348487.08089996
  92166869.69883074  -2139328.0824278   -8818455.54409492
   6494209.73310655   7065162.05053198   4119079.21006765
  18436483.52618776 -14566678.54514342  -5528348.75179426
 -83591746.20730537   2784276.46012858]


In [19]:
# Divide each learned weight by its column norm so the weights can be applied
# directly to features that have not been normalized.
normalized_weights1e4 = weights1e4 / norms
normalized_weights1e7 = weights1e7 / norms
normalized_weights1e8 = weights1e8 / norms

# Expected value: 161.31745624837794
print(normalized_weights1e7[3])

161.317456248


In [20]:
# Test-set feature matrix and prices; left un-normalized on purpose, since the
# weights were already divided by the training norms.
test_feature_matrix, test_output = get_numpy_data(test_data, all_features, 'price')

In [21]:
# Residual sum of squares on the test set for the l1_penalty = 1e4 model.
pred = predict_output(test_feature_matrix, normalized_weights1e4)
err = np.square(pred - test_output).sum()
print(err)

2.2778100476e+14


In [22]:
# Residual sum of squares on the test set for the l1_penalty = 1e7 model.
pred = predict_output(test_feature_matrix, normalized_weights1e7)
err = np.square(pred - test_output).sum()
print(err)

2.75962079909e+14


In [23]:
# Residual sum of squares on the test set for the l1_penalty = 1e8 model.
pred = predict_output(test_feature_matrix, normalized_weights1e8)
err = np.square(pred - test_output).sum()
print(err)

5.37166150034e+14
