In [1]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

In [2]:
import pandas as pd
import numpy as np
import math
%matplotlib inline

In [3]:
sales_train = pd.read_csv('kc_house_train_data.csv',dtype=dtype_dict)
sales_test = pd.read_csv('kc_house_test_data.csv',dtype=dtype_dict)

In [4]:
def get_numpy_data(data, features, output):
    ones = np.ones(data.shape[0])
    data['constant'] = ones
    all_features = ['constant']+features
    features_matrix = data.as_matrix(all_features)
    output_array = data.as_matrix([output])[:,0]
    return(features_matrix, output_array)

In [5]:
def predict_outcome(feature_matrix, weights):
    predictions = np.dot(feature_matrix, weights)
    return(predictions)

In [6]:
def feature_derivative(errors, feature):
    derivative = 2.0* np.dot(feature, errors)
    return(derivative)

In [7]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance):
    converged = False
    weights = np.array(initial_weights)
    while not converged:
        # compute the predictions based on feature_matrix and weights:
        predictions = predict_outcome(feature_matrix, weights)
        # compute the errors as predictions - output:
        errors = predictions - output
        
        gradient_sum_squares = 0 # initialize the gradient
        # while not converged, update each weight individually:
        for i in range(len(weights)):
            # Recall that feature_matrix[:, i] is the feature column associated with weights[i]
            # compute the derivative for weight[i]:
            derivative = feature_derivative(errors, feature_matrix[:, i])

            # add the squared derivative to the gradient magnitude
            gradient_sum_squares += derivative**2.0

            # update the weight based on step size and derivative:
            weights[i] -= step_size*derivative 
            
        gradient_magnitude = math.sqrt(gradient_sum_squares)
        if gradient_magnitude < tolerance:
            converged = True
    return(weights)

In [8]:
simple_features = ['sqft_living']
my_output = 'price'
(simple_feature_matrix, output) = get_numpy_data(sales_train, simple_features, my_output)
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

In [9]:
simple_weights = regression_gradient_descent(simple_feature_matrix, output, initial_weights, step_size, tolerance)

In [10]:
simple_weights

array([-46999.88716555,    281.91211918])

In [11]:
simple_features = ['sqft_living']
my_output = 'price'
(test_simple_feature_matrix, test_output) = get_numpy_data(sales_test, simple_features, my_output)
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

In [12]:
test_simple_weights = regression_gradient_descent(test_simple_feature_matrix, test_output, initial_weights, step_size, tolerance)

In [13]:
test_simple_weights

array([-46999.87880043,    282.3594539 ])

In [14]:
test_predicted_price_house = test_simple_weights[0] + test_simple_weights[1]*sales_test['sqft_living']

In [15]:
price_simple_model = test_predicted_price_house[0]
price_simple_model

356774.14027533506

## 12

In [16]:
test_rss = ((test_predicted_price_house-sales_test['price'])**2).sum()

In [17]:
test_rss

275395691278133.28

## 13

In [18]:
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
(feature_matrix, output) = get_numpy_data(sales_train, model_features, my_output)
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9

In [19]:
train_advanced_weights = regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance)

In [20]:
train_advanced_weights

array([ -9.99999688e+04,   2.45072603e+02,   6.52795267e+01])

## 14

In [21]:
test_advanced_predicted_price_house = train_advanced_weights[0] + train_advanced_weights[1]*sales_test['sqft_living'] + train_advanced_weights[2]*sales_test['sqft_living15']

## 15

In [30]:
price_advanced_model = test_advanced_predicted_price_house[0]
price_advanced_model

366651.41162949387

## 16

In [31]:
actual_price = sales_test['price'][0]
actual_price

310000.0

In [32]:
price_advanced_model - actual_price

56651.411629493872

In [33]:
price_simple_model - actual_price

46774.140275335056

## 17

The simple model (1) has the lowest error

## 18

In [35]:
test_advanced_rss = ((test_advanced_predicted_price_house-sales_test['price'])**2).sum()

In [36]:
test_advanced_rss

270263443629803.34

## 19

In [37]:
test_advanced_rss < test_rss

True