In [1]:
import pandas as pd
import numpy as np

dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

kc_data = pd.read_csv('c://users/intel/Desktop/Coursera/Regression/week5/2_kc_house_data.csv', dtype=dtype_dict)
training = pd.read_csv('c://users/intel/Desktop/Coursera/Regression/week5/2_kc_house_train_data.csv', dtype=dtype_dict)
testing = pd.read_csv('c://users/intel/Desktop/Coursera/Regression/week5/2_kc_house_test_data.csv', dtype=dtype_dict)

In [2]:
def get_numpy_data(data_sframe, features, output):
    data_sframe['constant'] = 1 # add a constant column to an SFrame
    # prepend variable 'constant' to the features list
    features = ['constant'] + features
    # select the columns of data_SFrame given by the ‘features’ list into the SFrame ‘features_sframe’
    features_sframe = data_sframe[features]
    # this will convert the features_sframe into a numpy matrix with GraphLab Create >= 1.7!!
    features_matrix = np.array(features_sframe)
    # assign the column of data_sframe associated with the target to the variable ‘output_sarray’
    output_sarray = data_sframe[output]
    # this will convert the SArray into a numpy array:
    output_sarray = np.array(output_sarray) # GraphLab Create>= 1.7!!
    return(features_matrix, output_sarray)

In [3]:
def predict_output(feature_matrix, weights):
    # assume feature_matrix is a numpy matrix containing the features as columns and weights is a corresponding numpy array
    # create the predictions vector by using np.dot()
    predictions = np.dot(feature_matrix, weights)
    return(predictions)

In [6]:
def normalize_features(features):
    norms = np.linalg.norm(features, axis=0)
    normalized_features = features / norms
    return (normalized_features, norms)

In [9]:
normalize_features(get_numpy_data(kc_data, ['sqft_living', 'bedrooms'], 'price')[:-1])

(array([[[ 1.,  1.,  1.],
         [ 1.,  1.,  1.],
         [ 1.,  1.,  1.],
         ..., 
         [ 1.,  1.,  1.],
         [ 1.,  1.,  1.],
         [ 1.,  1.,  1.]]]),
 array([[  1.00000000e+00,   1.18000000e+03,   3.00000000e+00],
        [  1.00000000e+00,   2.57000000e+03,   3.00000000e+00],
        [  1.00000000e+00,   7.70000000e+02,   2.00000000e+00],
        ..., 
        [  1.00000000e+00,   1.02000000e+03,   2.00000000e+00],
        [  1.00000000e+00,   1.60000000e+03,   3.00000000e+00],
        [  1.00000000e+00,   1.02000000e+03,   2.00000000e+00]]))

In [None]:
initial_weights = np.array([1.0,4.0,1.0])

# Effect of L1 penalty

In [12]:
simple_features = ['sqft_living', 'bedrooms']
my_output = 'price'
(simple_feature_matrix, output) = get_numpy_data(kc_data, simple_features, my_output)

In [13]:
simple_feature_matrix, norms = normalize_features(simple_feature_matrix)

In [14]:
weights = np.array([1., 4., 1.])

In [15]:
prediction = predict_output(simple_feature_matrix, weights)

In [19]:
simple_feature_matrix

array([[ 0.00680209,  0.00353021,  0.00583571],
       [ 0.00680209,  0.00768869,  0.00583571],
       [ 0.00680209,  0.00230361,  0.00389048],
       ..., 
       [ 0.00680209,  0.00305154,  0.00389048],
       [ 0.00680209,  0.00478673,  0.00583571],
       [ 0.00680209,  0.00305154,  0.00389048]])

In [40]:
ro=[]
for i in (0, 1, 2):
    #print(simple_feature_matrix[:,i])
    ro.append( sum( simple_feature_matrix[:,i]*(output - prediction + weights[i]*simple_feature_matrix[:,i]) ) )
print(ro)

[79400300.014523208, 87939470.823251516, 80966698.666239053]


In [41]:
sum( simple_feature_matrix[:,1]*(output - prediction + weights[1]*simple_feature_matrix[:,1]))

87939470.823251516

In [102]:
ro[1], ro[2], ro[1]*2, ro[2]*2

(87939470.823251516,
 80966698.666239053,
 175878941.64650303,
 161933397.33247811)

In [48]:
def lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty):
    # compute prediction
    prediction = predict_output(feature_matrix, weights)
    # compute ro[i] = SUM[ [feature_i]*(output - prediction + weight[i]*[feature_i]) ]
    ro_i = sum( feature_matrix[:,i]*(output - prediction + weights[i]*feature_matrix[:,i]) )

    if i == 0: # intercept -- do not regularize
        new_weight_i = ro_i 
    elif ro_i < -l1_penalty/2.:
        new_weight_i = (ro_i + l1_penalty/2)  
    elif ro_i > l1_penalty/2.:
        new_weight_i = (ro_i - l1_penalty/2)
    else:
        new_weight_i = 0.
    
    return new_weight_i

In [50]:
import math
print (lasso_coordinate_descent_step(1, np.array([[3./math.sqrt(13),1./math.sqrt(10)],[2./math.sqrt(13),3./math.sqrt(10)]]), 
                                   np.array([1., 1.]), np.array([1., 4.]), 0.1) )

0.425558846691


# Cyclical coordinate descent

In [63]:
curr_coord_change = []
coord_change = []
def lasso_cyclical_coordinate_descent(feature_matrix, output, initial_weights, l1_penalty, tolerance):
    enough = False 
    weights = np.array(initial_weights)
    while not enough:
        for i in range(len(weights)):
            old_weights_i = weights[i] # remember old value of weight[i], as it will be overwritten
        # the following line uses new values for weight[0], weight[1], ..., weight[i-1]
        #     and old values for weight[i], ..., weight[d-1]
            weights[i] = lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty)
        # use old_weights_i to compute change in coordinate
            curr_coord_change.append(weights[i] - old_weights_i)
            coord_change.append([i, weights[i] - old_weights_i, weights[i], old_weights_i])
        if max(curr_coord_change) < tolerance:
            enough = True
        else: curr_coord_change[:] = []
    return(weights)

In [87]:
simple_features = ['sqft_living', 'bedrooms']
my_output = 'price'
initial_weights = np.zeros(3)
l1_penalty = 1e7
tolerance = 1.0

In [88]:
(simple_feature_matrix, output) = get_numpy_data(kc_data, simple_features, my_output)
(normalized_simple_feature_matrix, simple_norms) = normalize_features(simple_feature_matrix) # normalize features

In [89]:
weights = lasso_cyclical_coordinate_descent(normalized_simple_feature_matrix, output,
                                            initial_weights, l1_penalty, tolerance)

In [66]:
weights

array([ 21624998.81906021,  63157246.42159428,         0.        ])

In [62]:
print(
    lasso_coordinate_descent_step(0, simple_feature_matrix, output, initial_weights, l1_penalty),
    lasso_coordinate_descent_step(1, simple_feature_matrix, output, initial_weights, l1_penalty),
    lasso_coordinate_descent_step(2, simple_feature_matrix, output, initial_weights, l1_penalty)
    )

11672925008.0 2.9394402547e+13 41618028485.0


curr_coord_change

In [65]:
coord_change

[[0, 79400304.637645125, 79400304.637645125, 0.0],
 [1, 10305258.704948779, 10305258.704948779, 0.0],
 [2, -299724.16960754152, -299724.16960754152, 0.0],
 [0, -9138168.3764282316, 70262136.261216894, 79400304.637645125],
 [1, 8642337.0598186441, 18947595.764767423, 10305258.704948779],
 [2, 299724.16960754152, 0.0, -299724.16960754152],
 [0, -8194809.5183830634, 62067326.74283383, 70262136.261216894],
 [1, 7213612.4987342916, 26161208.263501715, 18947595.764767423],
 [2, 0.0, 0.0, 0.0],
 [0, -6598905.0819197744, 55468421.660914056, 62067326.74283383],
 [1, 6036579.9088638164, 32197788.172365531, 26161208.263501715],
 [2, 0.0, 0.0, 0.0],
 [0, -5522173.2308196351, 49946248.430094421, 55468421.660914056],
 [1, 5051601.677036725, 37249389.849402256, 32197788.172365531],
 [2, 0.0, 0.0, 0.0],
 [0, -4621129.8408785611, 45325118.58921586, 49946248.430094421],
 [1, 4227340.6280883104, 41476730.477490567, 37249389.849402256],
 [2, 0.0, 0.0, 0.0],
 [0, -3867108.1318259537, 41458010.457389906, 45

In [90]:
RSS = sum((predict_output(normalized_simple_feature_matrix, weights) - output)**2)
print(RSS)

1.63049248458e+15


# Evaluating LASSO fit with more features

In [120]:
all_features = ['bedrooms',
                'bathrooms',
                'sqft_living',
                'sqft_lot',
                'floors',
                'waterfront', 
                'view', 
                'condition', 
                'grade',
                'sqft_above',
                'sqft_basement',
                'yr_built', 
                'yr_renovated']
my_output = 'price'
initial_weights = np.zeros(14)
l1_penalty = 1e7
tolerance = 1.0

In [121]:
(all_feature_matrix, output) = get_numpy_data(training, all_features, my_output)
(normalized_all_feature_matrix, all_norms) = normalize_features(all_feature_matrix) # normalize features

In [122]:
weights1e7 = lasso_cyclical_coordinate_descent(normalized_all_feature_matrix, output,
                                            initial_weights, l1_penalty, tolerance)

In [123]:
print('const', '=', weights1e7[0])
for i in range(len(all_features)):
    print(all_features[i], '=', weights1e7[i+1])

const = 24429601.0876
bedrooms = 0.0
bathrooms = 0.0
sqft_living = 48389173.9681
sqft_lot = 0.0
floors = 0.0
waterfront = 3317511.2138
view = 7329961.86436
condition = 0.0
grade = 0.0
sqft_above = 0.0
sqft_basement = 0.0
yr_built = 0.0
yr_renovated = 0.0


In [124]:
l1_penalty=1e8

In [125]:
weights1e8 = lasso_cyclical_coordinate_descent(normalized_all_feature_matrix, output,
                                            initial_weights, l1_penalty, tolerance)

In [126]:
print('const', '=', weights1e8[0])
for i in range(len(all_features)):
    print(all_features[i], '=', weights1e8[i+1])

const = 71114625.7149
bedrooms = 0.0
bathrooms = 0.0
sqft_living = 0.0
sqft_lot = 0.0
floors = 0.0
waterfront = 0.0
view = 0.0
condition = 0.0
grade = 0.0
sqft_above = 0.0
sqft_basement = 0.0
yr_built = 0.0
yr_renovated = 0.0


In [127]:
l1_penalty=1e4
tolerance=5e5

In [128]:
weights1e4 = lasso_cyclical_coordinate_descent(normalized_all_feature_matrix, output,
                                            initial_weights, l1_penalty, tolerance)

In [129]:
print('const', '=', weights1e4[0])
for i in range(len(all_features)):
    print(all_features[i], '=', weights1e4[i+1])

const = 78564738.3416
bedrooms = -22097398.9243
bathrooms = 12791071.8728
sqft_living = 93808088.0928
sqft_lot = -2013172.75705
floors = -4219184.93265
waterfront = 6482842.81754
view = 7127408.53481
condition = 5001664.8547
grade = 14327518.4371
sqft_above = -15770959.1524
sqft_basement = -5159591.22213
yr_built = -84495341.7684
yr_renovated = 2824439.49704


# Rescaling learned weights

In [91]:
weights1e8_norm = weights1e8/all_norms

In [92]:
weights1e7_norm = weights1e7/all_norms

In [93]:
weights1e4_norm = weights1e4/all_norms

In [95]:
print( weights1e7_norm[3])

161.317454968


# Evaluating each of the learned models on the test data

In [96]:
(test_feature_matrix, test_output) = get_numpy_data(testing, all_features, 'price')

In [97]:
def RSS(weights):
    RSS = sum((predict_output(test_feature_matrix, weights) - test_output)**2)
    print(RSS)

In [99]:
RSS(weights1e8_norm),
RSS(weights1e7_norm),
RSS(weights1e4_norm)

5.37166151497e+14
2.75962077477e+14
2.28459958971e+14
