In [1]:
import numpy as np
import pandas as pd

In [2]:
# Column name -> dtype mapping handed to pd.read_csv for the train and test files.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': str,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

In [3]:
# Load the training CSV, forcing the column dtypes declared in dtype_dict.
train_data = pd.read_csv('kc_house_train_data.csv', dtype=dtype_dict)

In [4]:
# Load the test CSV with the same dtype mapping used for the training file.
test_data = pd.read_csv('kc_house_test_data.csv', dtype=dtype_dict)

In [5]:
def get_numpy_data(data, features, output):
    """Build the feature matrix and target vector for a regression model.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding both the feature columns and the target column.
        It is not modified.
    features : sequence of str
        Names of the feature columns, in the order they should appear
        after the leading constant column.
    output : str
        Name of the target column.

    Returns
    -------
    (features_matrix, output_array) : tuple of numpy.ndarray
        ``features_matrix`` is 2-D: a first column of ones (the intercept
        term) followed by one column per entry in ``features``.
        ``output_array`` is the 1-D array of the ``output`` column.

    Raises
    ------
    KeyError
        If a name in ``features`` or ``output`` is not a column of ``data``.
    """
    # Prepend the intercept column so it ends up as column 0 of the matrix.
    feature_columns = ['constant'] + list(features)

    # ``assign`` returns a new frame, so the caller's DataFrame does not
    # silently gain a 'constant' column.
    with_constant = data.assign(constant=1)

    # ``as_matrix`` was deprecated in pandas 0.23 and removed in 1.0;
    # ``.values`` is the replacement that works on old and new versions.
    features_matrix = with_constant[feature_columns].values
    output_array = data[output].values
    return (features_matrix, output_array)

In [19]:
def predict_outcome(feature_matrix, weights):
    """Return the linear-model predictions: the dot product of
    ``feature_matrix`` with ``weights``."""
    return np.dot(feature_matrix, weights)

In [7]:
def feature_derivative(errors, feature):
    """Return twice the dot product of ``feature`` and ``errors``.

    With ``errors = predictions - output`` this is the partial derivative
    of the residual sum of squares with respect to the weight of ``feature``.
    """
    inner_product = np.dot(feature, errors)
    return 2 * inner_product

In [17]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance,
                                max_iterations=None):
    """Fit linear-regression weights by gradient descent on the residual sum of squares.

    Parameters
    ----------
    feature_matrix : numpy.ndarray
        2-D array indexed as ``feature_matrix[:, i]``; column ``i`` is the
        feature paired with ``weights[i]``.
    output : numpy.ndarray
        Target values, subtracted from the predictions to form the errors.
    initial_weights : array-like
        Starting weights, one per column of ``feature_matrix``. Copied, so
        the caller's object is never modified.
    step_size : float
        Multiplier applied to each derivative when updating a weight.
    tolerance : float
        Iteration stops once the gradient magnitude drops below this value.
    max_iterations : int or None, optional
        Upper bound on the number of passes. ``None`` (the default) keeps
        the original behaviour of iterating until convergence.

    Returns
    -------
    numpy.ndarray
        The weights after the final pass, as a float array.

    Raises
    ------
    FloatingPointError
        If any weight becomes inf or nan. From that point the gradient
        magnitude can never fall below ``tolerance``, so the loop would
        otherwise run forever.
    """
    # Force a float copy: with integer initial weights the in-place update
    # below would silently truncate every step towards zero.
    weights = np.array(initial_weights, dtype=float)

    iteration = 0
    while max_iterations is None or iteration < max_iterations:
        # Errors are computed once per pass, so every weight in this pass is
        # updated from the same predictions.
        predictions = predict_outcome(feature_matrix, weights)
        errors = predictions - output

        gradient_sum_squares = 0  # running sum of squared partial derivatives
        for i in range(len(weights)):
            # feature_matrix[:, i] is the feature column associated with weights[i].
            derivative = feature_derivative(errors, feature_matrix[:, i])
            gradient_sum_squares = gradient_sum_squares + (derivative ** 2)
            weights[i] = weights[i] - step_size * derivative

        gradient_magnitude = np.sqrt(gradient_sum_squares)
        if gradient_magnitude < tolerance:
            break

        # Checked after the convergence test so a pass that converged still
        # returns exactly what it did before this guard existed.
        if not np.all(np.isfinite(weights)):
            raise FloatingPointError(
                "gradient descent diverged (non-finite weights); "
                "try a smaller step_size"
            )
        iteration += 1
    return weights

In [9]:
# Simple model: predict 'price' from 'sqft_living' plus the constant column.
simple_features = ['sqft_living']
my_output = 'price'
simple_feature_matrix, output = get_numpy_data(train_data, simple_features, my_output)

# Gradient-descent settings for the simple model.
initial_weights = np.array([-47000.0, 1.0])
step_size = 7e-12
tolerance = 2.5e7

In [16]:
# NOTE(review): this scratch cell raises ValueError as written, because shapes
# (17384, 2) and (17384,) are not aligned for np.dot. Transposing the matrix
# (simple_feature_matrix.T) would align them -- confirm the intent or drop the cell.
np.dot(simple_feature_matrix, output)

ValueError: shapes (17384,2) and (17384,) not aligned: 2 (dim 1) != 17384 (dim 0)

In [20]:
# Fit the simple (one-feature) model on the training data.
simple_weights = regression_gradient_descent(
    simple_feature_matrix,
    output,
    initial_weights,
    step_size,
    tolerance,
)

foo
foo
foo
foo
foo
foo
foo
foo
foo
foo
foo
foo
foo
foo
foo
foo
foo
foo
foo
foo
foo
foo
foo
foo


In [11]:
# Inspect the training feature matrix: column 0 is the constant, column 1 is sqft_living.
simple_feature_matrix

array([[  1.00000000e+00,   1.18000000e+03],
       [  1.00000000e+00,   2.57000000e+03],
       [  1.00000000e+00,   7.70000000e+02],
       ..., 
       [  1.00000000e+00,   1.53000000e+03],
       [  1.00000000e+00,   1.60000000e+03],
       [  1.00000000e+00,   1.02000000e+03]])

In [276]:
# Fitted weights of the simple model, one per column of simple_feature_matrix.
simple_weights

array([-46999.88716555,    281.91211918])

In [296]:
# Build the simple-model feature matrix and target array from the test data.
test_simple_feature_matrix, test_output = get_numpy_data(test_data, simple_features, my_output)


In [297]:
# Predict test-set prices with the weights fitted for the simple model.
test_predictions = predict_outcome(test_simple_feature_matrix, simple_weights)

In [301]:
# Simple-model prediction for the first row of the test data.
test_predictions[0]

356134.44325500238

In [302]:
# Residual sum of squares of the simple model on the test data.
rss_test = np.square(test_predictions - test_output).sum()

In [303]:
# Show the simple model's test RSS.
rss_test

275400044902128.31

In [304]:
# Two-feature model: predict 'price' from 'sqft_living' and 'sqft_living15'.
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
(feature_matrix, output2) = get_numpy_data(train_data, model_features, my_output)

# Gradient-descent settings for the two-feature model (one weight per matrix column).
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9

# Train against output2, the target extracted together with feature_matrix in
# this cell, rather than `output` left over from the earlier simple-model cell.
my_weights = regression_gradient_descent(feature_matrix, output2, initial_weights, step_size, tolerance)

In [305]:
# Fitted weights of the two-feature model: constant, sqft_living, sqft_living15.
my_weights

array([ -9.99999688e+04,   2.45072603e+02,   6.52795267e+01])

In [312]:
# Build the two-feature test matrix and predict with my_weights.
# NOTE: this rebinds output2, which held the training prices, to the test prices.
test_feature_matrix, output2 = get_numpy_data(test_data, model_features, my_output)
second_test_predictions = predict_outcome(test_feature_matrix, my_weights)

In [313]:
# Two-feature-model prediction for the first row of the test data.
second_test_predictions[0]

366651.41162949387

In [314]:
# Actual price of the first test row, for comparison with the two predictions.
test_data['price'][0]

310000.0

In [315]:
# Residual sum of squares of the two-feature model on the test data.
rss_test2 = np.square(second_test_predictions - output2).sum()

In [316]:
# Show the two-feature model's test RSS.
rss_test2

270263443629803.56

In [317]:
# True means the two-feature model has the lower RSS on the test data.
rss_test > rss_test2

True