In [1]:
import numpy as np
import pandas as pd
import matplotlib
import sklearn
%matplotlib inline

In [2]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

In [3]:
df_data = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)

In [4]:
print df_data.shape

(21613, 21)


In [5]:
print df_data.columns

Index([u'id', u'date', u'price', u'bedrooms', u'bathrooms', u'sqft_living',
       u'sqft_lot', u'floors', u'waterfront', u'view', u'condition', u'grade',
       u'sqft_above', u'sqft_basement', u'yr_built', u'yr_renovated',
       u'zipcode', u'lat', u'long', u'sqft_living15', u'sqft_lot15'],
      dtype='object')


In [6]:
df_train = pd.read_csv('kc_house_train_data.csv', dtype=dtype_dict)
df_test = pd.read_csv('kc_house_test_data.csv', dtype=dtype_dict)

In [7]:
print df_train.shape, df_test.shape

(17384, 21) (4229, 21)


In [8]:
df_train.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900,3,1.0,1180,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000,3,2.25,2570,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000,2,1.0,770,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000,4,3.0,1960,5000,1,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000,3,2.0,1680,8080,1,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [9]:
def predict_output(feature_matrix, weights):
    # assume feature_matrix is a numpy matrix containing the features as columns and weights is a corresponding numpy array
    # create the predictions vector by using np.dot()
    predictions = np.dot(feature_matrix, weights)
    return(predictions)

In [10]:
sqft = df_train['sqft_living'].as_matrix()

In [11]:
df_train['constant'] = 1

In [12]:
features = df_train[['constant','sqft_living']].values

In [13]:
features

array([[  1.00000000e+00,   1.18000000e+03],
       [  1.00000000e+00,   2.57000000e+03],
       [  1.00000000e+00,   7.70000000e+02],
       ..., 
       [  1.00000000e+00,   1.53000000e+03],
       [  1.00000000e+00,   1.60000000e+03],
       [  1.00000000e+00,   1.02000000e+03]])

In [14]:
features[0,:]

array([  1.00000000e+00,   1.18000000e+03])

In [15]:
#sales = df_train.as_matrix()

In [16]:
my_features = features[0,:]
my_weights = np.array([1., 1.])
predicted_value = np.dot(my_features, my_weights)
print predicted_value

1181.0


In [17]:
test_predictions = predict_output(features, my_weights)
print test_predictions[0] # should be 1181.0
print test_predictions[1] # should be 2571.0

1181.0
2571.0


Since the derivative of a sum is the sum of the derivatives we can compute the derivative for a single data point and then sum over data points. We can write the squared difference between the observed output and predicted output for a single point as follows:

(w[0]*[CONSTANT] + w[1]*[feature_1] + ... + w[i] *[feature_i] + ... + w[k]*[feature_k] - output)^2

Where we have k features and a constant. So the derivative with respect to weight w[i] by the chain rule is:

2*(w[0]*[CONSTANT] + w[1]*[feature_1] + ... + w[i] *[feature_i] + ... + w[k]*[feature_k] - output)* [feature_i]

The term inside the paranethesis is just the error (difference between prediction and output). So we can re-write this as:

2*error*[feature_i]

That is, the derivative for the weight for feature i is the sum (over data points) of 2 times the product of the error and the feature itself. In the case of the constant then this is just twice the sum of the errors!

Recall that twice the sum of the product of two vectors is just twice the dot product of the two vectors. Therefore the derivative for the weight for feature_i is just two times the dot product between the values of feature_i and the current errors.

With this in mind complete the following derivative function which computes the derivative of the weight given the value of the feature (over all data points) and the errors (over all data points).

In [18]:
def feature_derivative(errors, feature):
    # Assume that errors and feature are both numpy arrays of the same length (number of data points)
    # compute twice the dot product of these vectors as 'derivative' and return the value
    derivative = 2*np.dot(errors,feature)
    return(derivative)

In [19]:
output = df_train['price'].values

In [20]:
features.shape, output.shape

((17384, 2), (17384,))

In [21]:
output

array([ 221900.,  538000.,  180000., ...,  360000.,  400000.,  325000.])

In [22]:
features[:,0]

array([ 1.,  1.,  1., ...,  1.,  1.,  1.])

In [23]:
my_weights = np.array([0., 0.]) # this makes all the predictions 0
test_predictions = predict_output(features, my_weights) 

In [24]:
errors = test_predictions - output

In [25]:
feature = features[:,0] # let's compute the derivative with respect to 'constant', the ":" indicates "all rows"

In [26]:
errors.shape, feature.shape

((17384,), (17384,))

In [27]:
derivative = feature_derivative(errors, feature)
print derivative
print -np.sum(output)*2 # should be the same as derivative

-18752698920.0
-18752698920.0


In [28]:
derivative**2

3.5166371678416916e+20

In [29]:
from math import sqrt # recall that the magnitude/length of a vector [g[0], g[1], g[2]] is sqrt(g[0]^2 + g[1]^2 + g[2]^2)

In [30]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance):
    converged = False 
    weights = np.array(initial_weights) # make sure it's a numpy array
    while not converged:
        # compute the predictions based on feature_matrix and weights using your predict_output() function
        predictions = predict_output(feature_matrix, weights)
        # compute the errors as predictions - output
        errors = predictions - output
        gradient_sum_squares = 0 # initialize the gradient sum of squares
        # while we haven't reached the tolerance yet, update each feature's weight
        for i in range(len(weights)): # loop over each weight
            # Recall that feature_matrix[:, i] is the feature column associated with weights[i]
            # compute the derivative for weight[i]:
            derivative = feature_derivative(errors, feature_matrix[:,i])
            # add the squared value of the derivative to the gradient sum of squares (for assessing convergence)
            gradient_sum_squares += derivative**2
            # subtract the step size times the derivative from the current weight
            weights[i] = weights[i] - step_size*derivative
        # compute the square-root of the gradient sum of squares to get the gradient matnigude:
        gradient_magnitude = sqrt(gradient_sum_squares)
        if gradient_magnitude < tolerance:
            converged = True
    return(weights)

In [31]:
# let's test out the gradient descent
simple_features = features #sqft_living
my_output = output
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

In [32]:
simple_features.shape, output.shape

((17384, 2), (17384,))

In [33]:
result = regression_gradient_descent(simple_features, output, initial_weights, step_size, tolerance)

In [34]:
result

array([-46999.88716555,    281.91211918])

In [35]:
#Compare result with Sklearn linear regresssion
from sklearn import linear_model

In [36]:
# Create linear regression object
regr = linear_model.LinearRegression()

X_train = df_train['sqft_living'].values
y_train = df_train['price'].values

In [37]:
X_train = X_train.reshape(len(X_train), 1)
y_train = y_train.reshape(len(y_train), 1)

In [38]:
# Train the model using the training sets
regr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [40]:
# The coefficients
print('Coefficients: \n', regr.coef_)
print ('Intercept: ', regr.intercept_)
# The mean square error
#print("Residual sum of squares: %.2f"
#      % np.mean((regr.predict(diabetes_X_test) - diabetes_y_test) ** 2))
# Explained variance score: 1 is perfect prediction
#print('Variance score: %.2f' % regr.score(diabetes_X_test, diabetes_y_test))

('Coefficients: \n', array([[ 281.95883963]]))
('Intercept: ', array([-47116.07907289]))


In [44]:
# let's test out the gradient descent
features = df_train[['constant','sqft_living','sqft_living15']].values

multiple_features = features #sqft_living + sqft_living15
my_output = output
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9

In [45]:
result = regression_gradient_descent(multiple_features, output, initial_weights, step_size, tolerance)
print result

[ -9.99999688e+04   2.45072603e+02   6.52795267e+01]


In [47]:
# Compare Multiple regression with sklearn
X_train = df_train[['sqft_living','sqft_living15']].values
y_train = df_train['price'].values
#X_train = X_train.reshape(len(X_train), 1)
#y_train = y_train.reshape(len(y_train), 1)
regr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [48]:
# The coefficients
print('Coefficients: \n', regr.coef_)
print ('Intercept: ', regr.intercept_)

('Coefficients: \n', array([ 245.18871442,   65.27158522]))
('Intercept: ', -100262.17515853385)


In [49]:
result[0] - regr.intercept_

262.20630965766031

In [50]:
df_test.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,114101516,20140528T000000,310000,3,1.0,1430,19901,1.5,0,0,...,7,1430,0,1927,0,98028,47.7558,-122.229,1780,12697
1,9297300055,20150124T000000,650000,4,3.0,2950,5000,2.0,0,3,...,9,1980,970,1979,0,98126,47.5714,-122.375,2140,4000
2,1202000200,20141103T000000,233000,3,2.0,1710,4697,1.5,0,0,...,6,1710,0,1941,0,98002,47.3048,-122.218,1030,4705
3,8562750320,20141110T000000,580500,3,2.5,2320,3980,2.0,0,0,...,8,2320,0,2003,0,98027,47.5391,-122.07,2580,3980
4,7589200193,20141110T000000,535000,3,1.0,1090,3000,1.5,0,0,...,8,1090,0,1929,0,98117,47.6889,-122.375,1570,5080


In [59]:
# prediction
df_test['constant'] = 1
test_data = df_test.ix[0,['constant','sqft_living','sqft_living15']].values

In [60]:
test_data

array([1, 1430.0, 1780.0], dtype=object)

In [61]:
print predict_output(test_data, result) 

366651.411629


In [69]:
test_all_data = df_test[['sqft_living','sqft_living15']].values

In [70]:
pred  = regr.predict(test_all_data)

In [71]:
print pred[0]

366541.108167
