# Gradient Descent

## Multiple Regression


In [1]:
import turicreate as tc

In [2]:
# Load the dataset stored at 'home_data.SFrame' into a turicreate SFrame.
sales = tc.SFrame('home_data.SFrame')

In [3]:
# Bare expression: the notebook renders a preview of the loaded SFrame.
sales

id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront
7129300520,2014-10-13 00:00:00+00:00,221900.0,3.0,1.0,1180.0,5650.0,1.0,0
6414100192,2014-12-09 00:00:00+00:00,538000.0,3.0,2.25,2570.0,7242.0,2.0,0
5631500400,2015-02-25 00:00:00+00:00,180000.0,2.0,1.0,770.0,10000.0,1.0,0
2487200875,2014-12-09 00:00:00+00:00,604000.0,4.0,3.0,1960.0,5000.0,1.0,0
1954400510,2015-02-18 00:00:00+00:00,510000.0,3.0,2.0,1680.0,8080.0,1.0,0
7237550310,2014-05-12 00:00:00+00:00,1225000.0,4.0,4.5,5420.0,101930.0,1.0,0
1321400060,2014-06-27 00:00:00+00:00,257500.0,3.0,2.25,1715.0,6819.0,2.0,0
2008000270,2015-01-15 00:00:00+00:00,291850.0,3.0,1.5,1060.0,9711.0,1.0,0
2414600126,2015-04-15 00:00:00+00:00,229500.0,3.0,1.0,1780.0,7470.0,1.0,0
3793500160,2015-03-12 00:00:00+00:00,323000.0,3.0,2.5,1890.0,6560.0,2.0,0

view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat
0,3,7.0,1180.0,0.0,1955.0,0.0,98178,47.51123398
0,3,7.0,2170.0,400.0,1951.0,1991.0,98125,47.72102274
0,3,6.0,770.0,0.0,1933.0,0.0,98028,47.73792661
0,5,7.0,1050.0,910.0,1965.0,0.0,98136,47.52082
0,3,8.0,1680.0,0.0,1987.0,0.0,98074,47.61681228
0,3,11.0,3890.0,1530.0,2001.0,0.0,98053,47.65611835
0,3,7.0,1715.0,0.0,1995.0,0.0,98003,47.30972002
0,3,7.0,1060.0,0.0,1963.0,0.0,98198,47.40949984
0,3,7.0,1050.0,730.0,1960.0,0.0,98146,47.51229381
0,3,7.0,1890.0,0.0,2003.0,0.0,98038,47.36840673

long,sqft_living15,sqft_lot15
-122.25677536,1340.0,5650.0
-122.3188624,1690.0,7639.0
-122.23319601,2720.0,8062.0
-122.39318505,1360.0,5000.0
-122.04490059,1800.0,7503.0
-122.00528655,4760.0,101930.0
-122.32704857,2238.0,6819.0
-122.31457273,1650.0,9711.0
-122.33659507,1780.0,8113.0
-122.0308176,2390.0,7570.0


In [4]:
import numpy as np

In [5]:
def get_numpy_data(data_sframe, features, output):
    """Extract a design matrix (with intercept column) and a target vector.

    Parameters
    ----------
    data_sframe : table-like
        Any table that supports ``.copy()``, column assignment, selection by a
        list of column names and ``.to_numpy()``. In this notebook it is a
        turicreate SFrame.
    features : sequence of str
        Names of the feature columns, in the order they should appear in the
        matrix (after the leading constant column).
    output : str
        Name of the target column.

    Returns
    -------
    (feature_matrix, output_array)
        ``feature_matrix`` has one row per observation; column 0 is the
        constant 1 and the remaining columns follow ``features``.
        ``output_array`` holds the values of the ``output`` column.
    """
    # Work on a copy so the caller's table does not silently gain a
    # 'constant' column every time this helper is called.
    working_frame = data_sframe.copy()

    # The constant column of ones carries the intercept term.
    working_frame['constant'] = 1

    # list(...) lets callers pass a tuple (or any sequence) of names, and
    # guarantees the caller's own sequence is never modified.
    feature_columns = ['constant'] + list(features)

    # Select the requested columns (constant first) and convert to numpy.
    feature_matrix = working_frame[feature_columns].to_numpy()

    # Convert the target column straight to a numpy array.
    output_array = working_frame[output].to_numpy()

    return (feature_matrix, output_array)

In [6]:
# Smoke test: build the design matrix for a single feature.
# (The square brackets make 'sqft_living' a one-element list.)
example_features, example_output = get_numpy_data(sales, ['sqft_living'], 'price')

# First row, every column (':' means "all columns").
print(example_features[0, :])

# The target value that belongs to that first row.
print(example_output[0])

[1.00e+00 1.18e+03]
221900.0


In [7]:
def predict_output(feature_matrix, weights):
    """Return the predictions of a linear model.

    ``feature_matrix`` holds the features as columns and ``weights`` is the
    matching numpy array of coefficients; the result is their ``np.dot``
    product.
    """
    return np.dot(feature_matrix, weights)

In [9]:
# Smoke test for predict_output: with both weights equal to 1 every
# prediction is simply constant + sqft_living.
my_weights = np.ones(2)
test_predictions = predict_output(example_features, my_weights)

# Expected output: 1181.0 and then 2571.0.
for row in (0, 1):
    print(test_predictions[row])

1181.0
2571.0


In [10]:
def feature_derivative(errors, feature):
    """Return twice the dot product of ``errors`` and ``feature``.

    Both arguments are expected to be numpy arrays of the same length
    (one entry per data point).
    """
    inner_product = np.dot(errors, feature)
    return 2 * inner_product

In [11]:
# Check feature_derivative on a case whose answer is known in advance.
example_features, example_output = get_numpy_data(sales, ['sqft_living'], 'price')

# All-zero weights make every prediction 0, so the errors
# (predictions - output, subtracted element-wise) are just -example_output.
my_weights = np.zeros(2)
test_predictions = predict_output(example_features, my_weights)
errors = test_predictions - example_output

# Differentiate with respect to 'constant': column 0, all rows (':').
feature = example_features[:, 0]
derivative = feature_derivative(errors, feature)

# The two printed numbers should be identical
# (-23345850022.0 in the recorded run).
print(derivative)
print(-np.sum(example_output) * 2)

-23345850022.0
-23345850022.0


In [12]:
# Show the targets, the errors and the constant column from the check above.
print(example_output, errors, feature)

[221900. 538000. 180000. ... 402101. 400000. 325000.] [-221900. -538000. -180000. ... -402101. -400000. -325000.] [1. 1. 1. ... 1. 1. 1.]


## Gradient Descent 

In [13]:
from math import sqrt

In [15]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance,
                                max_iterations=None):
    """Fit linear-regression weights by batch gradient descent on the RSS.

    Parameters
    ----------
    feature_matrix : 2-D numpy array
        One row per observation, one column per feature; column ``i`` is the
        feature associated with ``weights[i]``.
    output : 1-D numpy array
        Observed target values, one per row of ``feature_matrix``.
    initial_weights : array-like
        Starting weights. They are copied (and converted to float), so the
        caller's object is never modified.
    step_size : float
        Multiplier applied to each derivative when updating a weight.
    tolerance : float
        The loop stops once the gradient magnitude drops below this value.
    max_iterations : int or None, optional
        Upper bound on the number of update steps. ``None`` (the default)
        means "no bound", i.e. run until the tolerance is met.

    Returns
    -------
    numpy array of float
        The weights after the last update step.

    Raises
    ------
    FloatingPointError
        If the gradient magnitude becomes NaN or infinite. Such a run can
        never satisfy ``gradient_magnitude < tolerance`` and would otherwise
        loop forever; it usually means ``step_size`` is too large.
    """
    # dtype=float matters: with integer initial weights the in-place updates
    # below would be truncated to whole numbers on every step.
    weights = np.array(initial_weights, dtype=float)
    iterations_run = 0

    while max_iterations is None or iterations_run < max_iterations:
        # Predictions and errors for the current weights. This is the same
        # np.dot expression that predict_output() evaluates.
        predictions = np.dot(feature_matrix, weights)
        errors = predictions - output

        # Accumulates the squared derivatives to assess convergence.
        gradient_sum_squares = 0

        for i in range(len(weights)):
            # Derivative of the RSS with respect to weights[i]: twice the dot
            # product of the errors with feature column i (the same
            # expression feature_derivative() evaluates). `errors` was
            # computed before this loop, so every weight is updated from the
            # same predictions.
            derivative = 2 * np.dot(errors, feature_matrix[:, i])
            gradient_sum_squares += derivative ** 2

            # Step against the gradient.
            weights[i] -= step_size * derivative

        iterations_run += 1

        # The gradient magnitude is the square root of the sum of squares.
        gradient_magnitude = sqrt(gradient_sum_squares)

        if gradient_magnitude < tolerance:
            break

        if not np.isfinite(gradient_magnitude):
            raise FloatingPointError(
                "gradient descent diverged (non-finite gradient magnitude); "
                "try a smaller step_size"
            )

    return weights

## Running the Gradient Descent as Simple Regression

In [16]:
# Split `sales` with random_split(0.8, seed=0) so the split is repeatable.
train_data, test_data = sales.random_split(0.8, seed=0)

In [17]:
# Inputs for trying gradient descent on a one-feature (simple) regression.
simple_features = ['sqft_living']
my_output = 'price'
simple_feature_matrix, output = get_numpy_data(train_data, simple_features, my_output)

# Starting weights and the two gradient-descent settings.
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

In [18]:
# Fit the simple model with the settings defined in the previous cell.
test_weight = regression_gradient_descent(
    feature_matrix=simple_feature_matrix,
    output=output,
    initial_weights=initial_weights,
    step_size=step_size,
    tolerance=tolerance,
)
print(test_weight)

[-46999.88716555    281.91211912]


In [19]:
# Quiz question: what is the value of the weight for sqft_living, i.e. the
# second element of test_weight (rounded to 1 decimal place)?

# Build the TEST design matrix and predict with the weights fitted above.
test_simple_feature_matrix, test_output = get_numpy_data(test_data, simple_features, my_output)
test_predictions = predict_output(test_simple_feature_matrix, test_weight)
print(test_predictions)

[356134.44317093 784640.86422788 435069.83652353 ... 663418.65300782
 604217.10799338 240550.4743332 ]


In [20]:
# Quiz question: what is the predicted price for the 1st house in the TEST
# data set for model 1 (rounded to the nearest dollar)?
print(test_predictions[0])

356134.4431709297


In [21]:
# RSS is the sum of the squared errors (difference between prediction and output).
test_residuals = test_output - test_predictions
test_RSS = (test_residuals ** 2).sum()
print(test_RSS)

# Reference value: 2.75400047593e+14

275400047593155.94


## Running a multiple regression

In [22]:
# Two-feature model. sqft_living15 is the average square footage of the
# nearest 15 neighbors.
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
feature_matrix, output = get_numpy_data(train_data, model_features, my_output)

# Starting weights (intercept first) and gradient-descent settings.
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9

In [23]:
# Fit the two-feature model with the settings defined in the previous cell.
weight_2 = regression_gradient_descent(
    feature_matrix=feature_matrix,
    output=output,
    initial_weights=initial_weights,
    step_size=step_size,
    tolerance=tolerance,
)
print(weight_2)

[-9.99999688e+04  2.45072603e+02  6.52795277e+01]


In [24]:
# Build the TEST design matrix for the two-feature model and predict with weight_2.
test_feature_matrix, test_output = get_numpy_data(test_data, model_features, my_output)
test_predictions_2 = predict_output(test_feature_matrix, weight_2)
print(test_predictions_2)

[366651.41203656 762662.39786164 386312.09499712 ... 682087.39928241
 585579.27865729 216559.20396617]


In [25]:
# Quiz question: what is the predicted price for the 1st house in the TEST
# data set for model 2 (rounded to the nearest dollar)?
print(test_predictions_2[0])

366651.4120365591


In [26]:
# Actual price of the 1st house in the test data set, for comparison.
print(test_data['price'][0])

310000.0


In [27]:
# Quiz question: which estimate was closer to the true price for the 1st
# house in the TEST data set, model 1 or model 2?

# RSS of model 2 on the TEST data, from its predictions and the true output.
test_residuals_2 = test_output - test_predictions_2
test_RSS_2 = (test_residuals_2 * test_residuals_2).sum()
print(test_RSS_2)

# Reference value: 2.70263446465e+14

270263446465244.06


# Interpretation

## Multiple Regression

In [29]:
# Fit a turicreate linear regression on three features of the training data.
# Note: this rebinds `example_features`, which earlier held a numpy matrix.
example_features = ['sqft_living', 'bedrooms', 'bathrooms']
example_model = tc.linear_regression.create(
    train_data,
    target='price',
    features=example_features,
    validation_set=None,
)

In [30]:
# Bare expression: the notebook renders the fitted model's coefficient table.
example_model.coefficients

name,index,value,stderr
(intercept),,87910.07249240018,7873.338143401675
sqft_living,,315.40344055209926,3.4557003258547327
bedrooms,,-65080.215552828566,2717.456854420713
bathrooms,,6944.020192639225,3923.1149314414847


## Making Predictions

In [31]:
# Predict on the training data and show the first prediction.
example_predictions = example_model.predict(train_data)

# Expected: 271789.505878
print(example_predictions[0])

271789.5058780308


## Compute RSS

In [32]:
def get_residual_sum_of_squares(model, data, outcome):
    """Return the residual sum of squares of ``model`` on ``data``.

    ``model.predict(data)`` supplies the predictions; the residuals are
    ``outcome - predictions`` and the result is the sum of their squares.
    """
    # Residuals: observed outcome minus what the model predicts for `data`.
    residuals = outcome - model.predict(data)

    # Square each residual and add them all up.
    return (residuals * residuals).sum()

In [33]:
# RSS of the example model on the TEST data. The variable is named after the
# data set it is computed on (it was previously called `rss_example_train`
# even though test_data is what gets passed in).
rss_example_test = get_residual_sum_of_squares(example_model, test_data, test_data['price'])
print(rss_example_test)

# should be 2.7376153833e+14

273761538330193.2


In [34]:
from math import log

In [35]:
# Add the same four engineered columns to both the TRAIN and TEST data.
for frame in (train_data, test_data):
    frame['bedrooms_squared'] = frame['bedrooms'].apply(lambda bedrooms: bedrooms ** 2)
    frame['bed_bath_rooms'] = frame['bedrooms'] * frame['bathrooms']
    frame['log_sqft_living'] = frame['sqft_living'].apply(lambda sqft: log(sqft))
    frame['lat_plus_long'] = frame['lat'] + frame['long']

In [36]:
# Preview the source columns next to the four columns derived from them.
engineered_preview_columns = [
    'bedrooms', 'bathrooms', 'lat', 'long',
    'bedrooms_squared', 'bed_bath_rooms', 'log_sqft_living', 'lat_plus_long',
]
train_data[engineered_preview_columns].head()

bedrooms,bathrooms,lat,long,bedrooms_squared,bed_bath_rooms,log_sqft_living
3.0,1.0,47.51123398,-122.25677536,9.0,3.0,7.07326971745971
3.0,2.25,47.72102274,-122.3188624,9.0,6.75,7.851661177889265
2.0,1.0,47.73792661,-122.23319601,4.0,2.0,6.646390514847729
4.0,3.0,47.52082,-122.39318505,16.0,12.0,7.580699752224563
3.0,2.0,47.61681228,-122.04490059,9.0,6.0,7.426549072397305
4.0,4.5,47.65611835,-122.00528655,16.0,18.0,8.597851094433691
3.0,2.25,47.30972002,-122.32704857,9.0,6.75,7.44716835960004
3.0,1.5,47.40949984,-122.31457273,9.0,4.5,6.966024187106113
3.0,1.0,47.51229381,-122.33659507,9.0,3.0,7.484368643286131
3.0,2.5,47.36840673,-122.0308176,9.0,7.5,7.544332108053688

lat_plus_long
-74.74554138
-74.59783966
-74.4952694
-74.87236505
-74.42808830999999
-74.3491682
-75.01732855
-74.90507288999999
-74.82430126
-74.66241087


In [38]:
# Quiz question: what is the mean (arithmetic average) value of the 4 new
# features on TEST data? (round to 2 digits)
for column_name in ('bedrooms_squared', 'bed_bath_rooms', 'log_sqft_living', 'lat_plus_long'):
    print(test_data[column_name].mean())

12.44667770158429
7.503901631591395
7.55027467964594
-74.65333497217306


## Learning Multiple Models

In [39]:
# Nested feature sets: each model uses everything the previous one did, plus more.
model_1_features = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
model_2_features = [*model_1_features, 'bed_bath_rooms']
model_3_features = [*model_2_features, 'bedrooms_squared', 'log_sqft_living', 'lat_plus_long']

In [40]:
# Learn the three models, in order, on the training data
# (validation_set=None must be passed for every one of them).
model_1, model_2, model_3 = (
    tc.linear_regression.create(
        train_data,
        target='price',
        features=feature_set,
        validation_set=None,
    )
    for feature_set in (model_1_features, model_2_features, model_3_features)
)

In [41]:
# Print the coefficient tables of all three models in a single call.
print(
    model_1.coefficients,
    model_2.coefficients,
    model_3.coefficients,
)

+-------------+-------+---------------------+--------------------+
|     name    | index |        value        |       stderr       |
+-------------+-------+---------------------+--------------------+
| (intercept) |  None |  -56140675.74114401 | 1649985.4204360154 |
| sqft_living |  None |  310.2633257769215  | 3.1888296040774544 |
|   bedrooms  |  None | -59577.116067596675 | 2487.279773224323  |
|  bathrooms  |  None |  13811.840541653153 | 3593.5421329700102 |
|     lat     |  None |  629865.7894714825  | 13120.710032334237 |
|     long    |  None | -214790.28516470865 | 13284.285161661053 |
+-------------+-------+---------------------+--------------------+
[6 rows x 4 columns]
 +----------------+-------+---------------------+--------------------+
|      name      | index |        value        |       stderr       |
+----------------+-------+---------------------+--------------------+
|  (intercept)   |  None |  -54410676.11184223 | 1650405.1655748067 |
|  sqft_living   |  None |  

## Comparing multiple models

In [42]:
# Compute the RSS on TRAINING data for each of the three models and record the values.
rss_model_1_train, rss_model_2_train, rss_model_3_train = (
    get_residual_sum_of_squares(fitted_model, train_data, train_data['price'])
    for fitted_model in (model_1, model_2, model_3)
)

print(rss_model_1_train)
print(rss_model_2_train)
print(rss_model_3_train)

971328233545433.6
961592067857508.5
905276314551641.0


In [43]:
#2.26568089093e+14
#2.24368799994e+14
#2.51829318952e+14