In [1]:
import pandas as pd
import numpy as np

In [2]:
# Directory holding the kc_house CSV files. Edit this single line to point the
# notebook at a different copy of the data.
KC_DATA_DIR = 'c://users/intel/Desktop/Coursera/Regression/week2/'

# Column dtypes shared by the train and test files, declared once so both
# frames are guaranteed to be parsed identically.
# NOTE(review): 'floors' is read as str here -- confirm that is intended.
KC_DTYPES = {
    'bathrooms': float, 'waterfront': int, 'sqft_above': int,
    'sqft_living15': float, 'grade': int, 'yr_renovated': int,
    'price': float, 'bedrooms': float, 'zipcode': str, 'long': float,
    'sqft_lot15': float, 'sqft_living': float, 'floors': str,
    'condition': int, 'lat': float, 'date': str, 'sqft_basement': int,
    'yr_built': int, 'id': str, 'sqft_lot': int, 'view': int,
}

kc_test = pd.read_csv(KC_DATA_DIR + 'kc_house_test_data.csv', dtype=KC_DTYPES)
kc_train = pd.read_csv(KC_DATA_DIR + 'kc_house_train_data.csv', dtype=KC_DTYPES)

In [3]:
def get_numpy_data(data_sframe, features, output):
    """Convert selected columns of a data table into numpy arrays for regression.

    A column named 'constant' filled with 1 is placed in front of the requested
    feature columns so that the first weight of a linear model acts as the
    intercept.

    Parameters
    ----------
    data_sframe : table supporting ``.copy()`` and column selection by name
        (a pandas DataFrame in this notebook). The caller's object is NOT
        modified; the constant column is added to a private copy.
    features : sequence of str
        Names of the feature columns, in the order they should appear after
        the constant column.
    output : str
        Name of the target column.

    Returns
    -------
    (features_matrix, output_array) : tuple of numpy.ndarray
        ``features_matrix`` has one row per data row and ``1 + len(features)``
        columns; ``output_array`` holds the target values.
    """
    # Work on a copy: the original assigned the 'constant' column straight onto
    # the caller's frame, silently changing it (and triggering pandas
    # SettingWithCopyWarning when the frame is a slice of another one).
    frame = data_sframe.copy()
    frame['constant'] = 1
    # list(...) lets callers pass a tuple or other sequence, not only a list
    feature_columns = ['constant'] + list(features)
    features_matrix = np.array(frame[feature_columns])
    output_array = np.array(frame[output])
    return (features_matrix, output_array)

In [4]:
# Feature matrix (constant + sqft_living) and price vector from the training data
example_features, example_output = get_numpy_data(
    kc_train,
    ['sqft_living'],
    'price',
)

In [5]:
# First data row of the feature matrix (the ':' selects every column) ...
print(example_features[0, :])
# ... and the target value belonging to that row
print(example_output[0])
# Both objects should report as numpy arrays, one type per line
print(type(example_features), type(example_output), sep='\n')

[  1.00000000e+00   1.18000000e+03]
221900.0
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


# Predicting output given regression weights

In [6]:
# Worked example: weights of one on both the constant and sqft_living
my_weights = np.array([1., 1.])
# Feature row of the first data point
my_features = example_features[0]
# The prediction is the dot product of the feature row with the weights
predicted_value = np.dot(my_features, my_weights)
print(predicted_value)

1181.0


In [7]:
def predict_output(feature_matrix, weights):
    """Return the predictions of a linear model.

    The result is ``np.dot(feature_matrix, weights)``: with features stored as
    the columns of ``feature_matrix`` and one weight per column, that yields
    one prediction per row.
    """
    return np.dot(feature_matrix, weights)

In [8]:
# Predictions for every row under the example weights
test_predictions = predict_output(example_features, my_weights)
# Expected: 1181.0 for the first row and 2571.0 for the second
print(test_predictions[0])
print(test_predictions[1])

1181.0
2571.0


# Computing the Derivative

In [9]:
def feature_derivative(errors, feature):
    """Return twice the dot product of ``errors`` and ``feature``.

    Both arguments are expected to be numpy arrays of equal length (one entry
    per data point).
    """
    errors_dot_feature = np.dot(errors, feature)
    return 2 * errors_dot_feature

In [13]:
example_features, example_output = get_numpy_data(kc_train, ['sqft_living'], 'price')
# All-zero weights make every prediction 0 ...
my_weights = np.zeros(2)
test_predictions = predict_output(example_features, my_weights)
# ... so the elementwise prediction errors reduce to -example_output
errors = test_predictions - example_output
# Column 0 is the 'constant' feature; ':' selects all rows
feature = example_features[:, 0]
derivative = feature_derivative(errors, feature)
print(derivative)
# Sanity check: this should print the same number as the derivative above
print(-2 * np.sum(example_output))

-18752698920.0
-18752698920.0


In [14]:
# Same check for the second weight: column 1 holds 'sqft_living' (all rows)
feature_1 = example_features[:, 1]
# Reuses the errors computed with the all-zero weights
derivative = feature_derivative(errors, feature_1)
print(derivative)

-4.73325137065e+13


# Gradient Descent

In [11]:
from math import sqrt # recall that the magnitude/length of a vector [g[0], g[1], g[2]] is sqrt(g[0]^2 + g[1]^2 + g[2]^2)

In [30]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance, max_iterations=None):
    """Fit linear-regression weights by batch gradient descent.

    Each pass computes the prediction errors once, then for every weight takes
    the derivative from ``feature_derivative`` and moves the weight by
    ``step_size * derivative`` in the negative direction. Iteration stops once
    the magnitude of the gradient falls below ``tolerance``.

    Parameters
    ----------
    feature_matrix : numpy.ndarray
        Features as columns, one row per data point.
    output : numpy.ndarray
        Target values, one per row of ``feature_matrix``.
    initial_weights : array-like
        Starting weights, one per column. The caller's object is not modified.
    step_size : float
        Multiplier applied to each derivative when updating a weight.
    tolerance : float
        Convergence threshold on the gradient magnitude.
    max_iterations : int or None, optional
        Upper bound on the number of passes. ``None`` (the default) keeps the
        original behaviour of iterating until convergence.

    Returns
    -------
    numpy.ndarray
        The weights after the last pass.

    Raises
    ------
    FloatingPointError
        If the gradient magnitude becomes inf/NaN (the run has diverged and
        could never satisfy the tolerance test).
    """
    weights = np.array(initial_weights)  # copy, so the caller's weights stay intact
    # Integer weights would silently truncate every `weights[i] -= ...` update,
    # so promote anything that is not already floating point.
    if not np.issubdtype(weights.dtype, np.floating):
        weights = weights.astype(float)

    iterations = 0
    while max_iterations is None or iterations < max_iterations:
        # compute the predictions based on feature_matrix and weights
        predictions = predict_output(feature_matrix, weights)
        # compute the errors as predictions - output
        error = predictions - output
        gradient_sum_squares = 0  # running sum of squared derivatives
        for i in range(len(weights)):  # loop over each weight
            # feature_matrix[:, i] is the feature column associated with weights[i]
            derivative = feature_derivative(error, feature_matrix[:, i])
            # accumulate the squared derivative (used to assess convergence)
            gradient_sum_squares += derivative**2
            # step this weight against its derivative
            weights[i] -= step_size*derivative
        # square root of the sum of squares gives the gradient magnitude
        gradient_magnitude = sqrt(gradient_sum_squares)
        if not np.isfinite(gradient_magnitude):
            # inf/NaN never compares below the tolerance; fail loudly instead of looping forever
            raise FloatingPointError(
                "gradient descent diverged (non-finite gradient); try a smaller step_size")
        if gradient_magnitude < tolerance:
            break
        iterations += 1
    return(weights)

# Running the Gradient Descent as Simple Regression

In [57]:
# let's test out the gradient descent
# Inputs for the one-feature model: price explained by sqft_living
simple_features = ['sqft_living']
my_output = 'price'
simple_feature_matrix, output = get_numpy_data(kc_train, simple_features, my_output)

# Gradient-descent settings: starting weights [intercept, slope], step size, stopping threshold
initial_weights = np.array([-47000., 1.])
step_size, tolerance = 7e-12, 2.5e7

In [59]:
# Display the fitted [intercept, sqft_living] weights of the simple model
regression_gradient_descent(
    simple_feature_matrix,
    output,
    initial_weights,
    step_size,
    tolerance,
)

array([-46999.88716555,    281.91211918])

In [40]:
# Keep the fitted simple-model weights for evaluation on the test set
model1_weights = regression_gradient_descent(
    simple_feature_matrix,
    output,
    initial_weights,
    step_size,
    tolerance,
)

In [39]:
# Same feature/target extraction as for training, applied to the test data
test_simple_feature_matrix, test_output = get_numpy_data(
    kc_test,
    simple_features,
    my_output,
)

In [47]:
# Test-set predictions of the simple model
test_predictions = predict_output(
    test_simple_feature_matrix,
    model1_weights,
)

In [46]:
# Simple-model prediction for the first test row, rounded to 0 decimal places
round(
    predict_output(test_simple_feature_matrix, model1_weights)[0],
    0,
)

356132.0

In [49]:
# Residual sum of squares of the simple model on the test set.
# np.sum reduces the array in compiled code; the Python builtin sum() walks the
# ndarray one element at a time. The two may differ in the last few floating-point
# digits because numpy's reduction does not add strictly left to right.
model_1_RSS = np.sum((test_predictions - test_output)**2)
print(model_1_RSS)

2.75400198475e+14


# Running a multiple regression

In [60]:
# Two-feature model; sqft_living15 is the average squarefeet for the nearest 15 neighbors
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
feature_matrix, output = get_numpy_data(kc_train, model_features, my_output)

# Gradient-descent settings: starting weights (intercept first), step size, stopping threshold
initial_weights = np.array([-100000., 1., 1.])
step_size, tolerance = 4e-12, 1e9

In [61]:
# Display the fitted [intercept, sqft_living, sqft_living15] weights
regression_gradient_descent(
    feature_matrix,
    output,
    initial_weights,
    step_size,
    tolerance,
)

array([ -9.99999688e+04,   2.45072603e+02,   6.52795267e+01])

In [62]:
# Keep the fitted two-feature weights for evaluation on the test set
model_2_weights = regression_gradient_descent(
    feature_matrix,
    output,
    initial_weights,
    step_size,
    tolerance,
)

In [63]:
# Test-set feature matrix and prices for the two-feature model
feature_matrix_2, output_2 = get_numpy_data(
    kc_test,
    model_features,
    my_output,
)

In [64]:
# Test-set predictions of the two-feature model
predictions_2 = predict_output(
    feature_matrix_2,
    model_2_weights,
)

In [66]:
# (predicted price, actual price) for the first test row, each rounded to 0 decimals
tuple(round(first_value, 0) for first_value in (predictions_2[0], output_2[0]))

(366651.0, 310000.0)

In [67]:
# Residual sum of squares of the two-feature model on the test set.
# np.sum reduces the array in compiled code; the Python builtin sum() walks the
# ndarray one element at a time. The two may differ in the last few floating-point
# digits because numpy's reduction does not add strictly left to right.
model_2_RSS = np.sum((predictions_2 - output_2)**2)
print(model_2_RSS)

2.7026344363e+14
