In [2]:
import graphlab
import numpy as np

In [3]:
# Load the house sales data as an SFrame from the local 'kc_house_data.gl/' path.
sales = graphlab.SFrame('kc_house_data.gl/')

[INFO] This non-commercial license of GraphLab Create is assigned to <redacted-email> and will expire on January 04, 2017. For commercial licensing options, visit https://dato.com/buy/.

[INFO] Start server at: ipc:///tmp/graphlab_server-3052 - Server binary: /anaconda/envs/py27/lib/python2.7/site-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1453309531.log
[INFO] GraphLab Server Version: 1.8


## Transform SFrame data to Numpy Array data

In [4]:
def get_numpy_data(data_sframe, features, output):
    """Convert selected columns of a table into NumPy arrays for regression.

    Side effect: a 'constant' column holding 1 is written onto
    ``data_sframe`` itself (the caller's object is modified).

    Args:
        data_sframe: table supporting column assignment, selection by a list
            of column names, and ``to_numpy()`` on the selections (an SFrame
            in this notebook).
        features: list of feature column names.
        output: name of the target column.

    Returns:
        Tuple ``(feature_matrix, output_array)``; the first column of
        ``feature_matrix`` is the constant column, followed by ``features``
        in the given order.
    """
    # The intercept is modelled as an extra feature that is always 1.
    data_sframe['constant'] = 1
    selected_columns = ['constant'] + features
    feature_matrix = data_sframe[selected_columns].to_numpy()
    output_array = data_sframe[output].to_numpy()
    return feature_matrix, output_array

### TEST

In [5]:
# The feature names are passed as a list, hence the [] around 'sqft_living'.
example_features, example_output = get_numpy_data(sales, ['sqft_living'], 'price')
# First row of the feature matrix (':' selects all columns) ...
print(example_features[0, :])
# ... and the output value for that same row.
print(example_output[0])

[  1.00000000e+00   1.18000000e+03]
221900.0


## Predicting Output Given Regression Weights

In [6]:
# Example weights: one per column of example_features.
my_weights = np.array([1.0, 1.0])
# Feature row of the first data point.
my_features = example_features[0, :]
# A prediction is the dot product of a feature row with the weights.
predicted_value = my_features.dot(my_weights)
print(predicted_value)

1181.0


In [7]:
def predict_output(feature_matrix, weights):
    """Return the predictions ``feature_matrix . weights``.

    Args:
        feature_matrix: array whose rows are data points and whose columns
            are features.
        weights: array with one weight per feature column.

    Returns:
        The dot product of ``feature_matrix`` and ``weights``.
    """
    predictions = np.dot(feature_matrix, weights)
    return predictions

### TEST

In [8]:
test_predictions = predict_output(example_features, my_weights)
# Expected values for the first two rows: 1181.0 and 2571.0.
print(test_predictions[0])
print(test_predictions[1])

1181.0
2571.0


## Computing the Derivative

In [9]:
def feature_derivative(errors, feature):
    """Return twice the dot product of ``errors`` and ``feature``.

    Args:
        errors: array of prediction errors (predictions minus outputs).
        feature: array holding one feature's value for every data point.

    Returns:
        ``2 * dot(errors, feature)``, the derivative of the sum of squared
        errors with respect to that feature's weight.
    """
    error_feature_product = np.dot(errors, feature)
    return 2 * error_feature_product

### TEST

In [10]:
example_features, example_output = get_numpy_data(sales, ['sqft_living'], 'price')
# All-zero weights make every prediction 0.
my_weights = np.array([0.0, 0.0])
test_predictions = predict_output(example_features, my_weights)
# NumPy arrays subtract element-wise with '-'; with zero predictions the
# errors are simply -example_output.
errors = test_predictions - example_output
# Column 0 is the 'constant' feature (':' selects all rows).
feature = example_features[:, 0]
derivative = feature_derivative(errors, feature)
print(derivative)
# Should print the same value as the derivative above.
print(-np.sum(example_output) * 2)

-23345850022.0
-23345850022.0


## Gradient Descent

In [11]:
import math

In [49]:
def regression_gradient_descent(feature_matrix, output,
                                initial_weights, step_size,
                                tolerance, max_iterations=None):
    """Fit linear-regression weights by gradient descent on squared error.

    Each iteration computes the prediction errors for the current weights,
    moves every weight by ``step_size`` times its derivative, and then checks
    the gradient magnitude. Because the update happens before the check, the
    returned weights include the step taken in the final iteration.

    Args:
        feature_matrix: 2-D array, one row per data point and one column per
            feature.
        output: 1-D array of target values, one per row of ``feature_matrix``.
        initial_weights: starting weights, one per feature column. Copied as
            floats, so the caller's array is never modified and integer
            starting weights are not truncated on update.
        step_size: multiplier applied to each derivative in the update.
        tolerance: stop once the gradient magnitude (square root of the sum
            of squared per-feature derivatives) is below this value.
        max_iterations: optional cap on the number of update iterations;
            ``None`` (the default) means no cap. With a cap of 0 the initial
            weights are returned unchanged.

    Returns:
        NumPy float array of fitted weights.

    Raises:
        ValueError: if the gradient magnitude becomes infinite or NaN, which
            means the descent diverged and would otherwise never terminate.
    """
    # dtype=float: an integer array would silently truncate every update.
    weights = np.array(initial_weights, dtype=float)
    iterations = 0
    while max_iterations is None or iterations < max_iterations:
        predictions = predict_output(feature_matrix, weights)
        errors = predictions - output

        gradient_sum_squares = 0.0
        # errors is fixed for the whole pass, so all weights are updated
        # against the same predictions.
        for i in range(len(weights)):
            derivative_i = feature_derivative(errors, feature_matrix[:, i])
            gradient_sum_squares += derivative_i ** 2
            weights[i] -= step_size * derivative_i
        gradient_magnitude = math.sqrt(gradient_sum_squares)
        iterations += 1

        # A non-finite gradient can never drop below the tolerance.
        if not np.isfinite(gradient_magnitude):
            raise ValueError(
                "gradient descent diverged (non-finite gradient); "
                "try a smaller step_size")
        if gradient_magnitude < tolerance:
            break
    return weights

## Running the Gradient Descent as Simple Regression

In [50]:
# Randomly split `sales` into two SFrames using fraction 0.8 and a fixed seed;
# the first part is used as training data and the second as test data.
train_data, test_data = sales.random_split(.8, seed=0)

In [51]:
# Simple regression: predict 'price' from 'sqft_living' alone.
simple_features = ['sqft_living']
my_output = 'price'
simple_feature_matrix, output = get_numpy_data(
    train_data, simple_features, my_output)

# Gradient-descent settings for the simple model.
initial_weights = np.array([-47000.0, 1.0])
step_size = 7e-12
tolerance = 2.5e7

In [52]:
# Fit the simple model on the training data.
simple_weights = regression_gradient_descent(
    simple_feature_matrix, output, initial_weights, step_size, tolerance)

## TEST SIMPLE MODEL

In [26]:
# Build the test-set feature matrix and predict with the simple model's weights.
test_simple_feature_matrix, test_output = get_numpy_data(
    test_data, simple_features, my_output)
predictions = predict_output(test_simple_feature_matrix, simple_weights)

## Quiz Question 1
281.9

In [25]:
# Fitted weights of the simple model, in feature-matrix column order.
print(simple_weights)

[-46999.88716555    281.91211912]


## Quiz Question 2
$356134

In [33]:
# Simple-model predictions on the test data.
print(predictions)

[ 356134.44317093  784640.86422788  435069.83652353 ...,  663418.65300782
  604217.10799338  240550.4743332 ]


In [55]:
# Second model: predict 'price' from 'sqft_living' and 'sqft_living15'.
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
feature_matrix, output = get_numpy_data(train_data, model_features, my_output)

# Gradient-descent settings for the second model.
initial_weights = np.array([-100000.0, 1.0, 1.0])
step_size = 4e-12
tolerance = 1e9

In [56]:
# Fit the second model on the training data.
second_weights = regression_gradient_descent(
    feature_matrix, output, initial_weights, step_size, tolerance)

## TEST SECOND MODEL

In [58]:
# Build the test-set feature matrix and predict with the second model's weights.
test_model_features_matrix, test_output = get_numpy_data(
    test_data, model_features, my_output)
predictions_second = predict_output(test_model_features_matrix, second_weights)

## Quiz Question 3
$366651

In [77]:
# Second model's prediction for the first house in the test data.
print(predictions_second[0])

366651.412037


In [78]:
# Actual price of the first house in the test data.
print(test_data[0]['price'])

310000.0


## Quiz Question 4 
Model 1 performed better: its prediction for the first test house is closer to the actual price.

In [61]:
# Prints True when the second model's prediction for the first test house is
# farther from the actual price than the simple model's prediction.
print(abs(predictions_second[0] - test_data[0]['price']) >
      abs(predictions[0] - test_data[0]['price']))

True


In [72]:
def compute_RSS(predictions, true_values):
    """Return the residual sum of squares between predictions and true values.

    Args:
        predictions: predicted values (NumPy array, list, or any object
            exposing ``to_numpy()``).
        true_values: observed values in the same order and of the same
            length as ``predictions``.

    Returns:
        The sum of ``(prediction - true_value) ** 2`` over all elements, as a
        float (0.0 for empty inputs).

    Raises:
        ValueError: if the two inputs do not have the same shape. A mismatch
            is rejected rather than silently scoring only part of the data.
    """
    def _as_float_array(values):
        # Column types such as SArray expose to_numpy(); everything else is
        # handed to NumPy directly.
        if hasattr(values, 'to_numpy'):
            values = values.to_numpy()
        return np.asarray(values, dtype=float)

    predicted = _as_float_array(predictions)
    actual = _as_float_array(true_values)
    if predicted.shape != actual.shape:
        raise ValueError(
            "predictions and true_values must have the same shape, "
            "got %s and %s" % (predicted.shape, actual.shape))

    residuals = predicted - actual
    return np.sum(residuals ** 2)

In [75]:
# RSS of the simple model's predictions against the test-set prices.
RSS_model_1 = compute_RSS(predictions, test_data['price'])
print(RSS_model_1)

2.75400047593e+14


In [76]:
# RSS of the second model's predictions against the test-set prices.
RSS_model_2 = compute_RSS(predictions_second, test_data['price'])
print(RSS_model_2)

2.70263446465e+14


## Quiz Question 5
Model 2 has the smaller RSS on test_data