### Load data
* Load the training and test data sets

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Read the pre-split training and test CSVs; both paths are relative to the
# notebook's working directory.
train_data = pd.read_csv('kc_house_train_data.csv')
test_data = pd.read_csv('kc_house_test_data.csv')

In [5]:
train_data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900,3,1.0,1180,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000,3,2.25,2570,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000,2,1.0,770,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000,4,3.0,1960,5000,1,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000,3,2.0,1680,8080,1,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [6]:
test_data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,114101516,20140528T000000,310000,3,1.0,1430,19901,1.5,0,0,...,7,1430,0,1927,0,98028,47.7558,-122.229,1780,12697
1,9297300055,20150124T000000,650000,4,3.0,2950,5000,2.0,0,3,...,9,1980,970,1979,0,98126,47.5714,-122.375,2140,4000
2,1202000200,20141103T000000,233000,3,2.0,1710,4697,1.5,0,0,...,6,1710,0,1941,0,98002,47.3048,-122.218,1030,4705
3,8562750320,20141110T000000,580500,3,2.5,2320,3980,2.0,0,0,...,8,2320,0,2003,0,98027,47.5391,-122.07,2580,3980
4,7589200193,20141110T000000,535000,3,1.0,1090,3000,1.5,0,0,...,8,1090,0,1929,0,98117,47.6889,-122.375,1570,5080


In [9]:
def get_numpy_data(data_frame, features, output):
    """Build the feature matrix and target vector for a regression model.

    A column of ones named 'constant' is placed first in the matrix, so the
    first weight of a linear model acts as the intercept.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Source data. It is NOT modified: the constant column is added to a
        copy rather than written back into the caller's frame.
    features : sequence of str
        Names of the columns to use as features (list, tuple, Index, ...).
    output : str
        Name of the target column.

    Returns
    -------
    (features_matrix, output_array) : tuple of numpy.ndarray
        `features_matrix` has one row per row of `data_frame` and
        `1 + len(features)` columns, ordered as 'constant' followed by
        `features`. `output_array` holds the values of the `output` column.
    """
    feature_columns = ['constant'] + list(features)

    # assign() returns a new frame, so repeated calls never leave a stray
    # 'constant' column behind in the caller's DataFrame.
    features_matrix = data_frame.assign(constant=1)[feature_columns].values
    output_array = data_frame[output].values

    return features_matrix, output_array

In [11]:
# Feature matrix (constant, sqft_living, bedrooms) and price vector built from the training set.
features_matrix, output_array = get_numpy_data(train_data, ['sqft_living', 'bedrooms'], 'price')

In [15]:
# Sanity-check the types and shapes returned by get_numpy_data (Python 2 print statement).
print type(features_matrix), type(output_array), features_matrix.shape, output_array.shape

<type 'numpy.ndarray'> <type 'numpy.ndarray'> (17384, 3) (17384,)


In [23]:
def predict_outcome(features_matrix, weights):
    """Return the predictions of a linear model.

    Computes the product of `features_matrix` with the (transposed) `weights`;
    for a 1-D weight vector the transpose is a no-op and the result has one
    prediction per row of `features_matrix`.
    """
    transposed_weights = np.transpose(weights)
    predictions = features_matrix.dot(transposed_weights)
    return predictions

In [31]:
def feature_derivative(errors, feature):
    """Return twice the dot product of `feature` with `errors`.

    When `errors` is `predictions - output`, this is the partial derivative of
    the residual sum of squares with respect to the weight of `feature`.
    """
    inner_product = feature.dot(errors)
    return 2 * inner_product

In [36]:
def regression_gradient_descent(features_matrix, output, initial_weights, step_size, tolerance,
                                max_iterations=None):
    """Fit linear-regression weights by gradient descent on the residual sum of squares.

    Each iteration computes the errors (`predictions - output`) with the
    current weights, takes the gradient `2 * X^T . errors`, moves every weight
    by `-step_size * gradient`, and stops once the magnitude of the gradient
    that was just used falls below `tolerance`.

    Parameters
    ----------
    features_matrix : numpy.ndarray, shape (n_rows, n_weights)
        Feature matrix, one column per weight.
    output : numpy.ndarray, shape (n_rows,)
        Target values.
    initial_weights : array-like, length n_weights
        Starting point. It is copied, never modified in place.
    step_size : float
        Multiplier applied to the gradient at every update.
    tolerance : float
        Convergence threshold on the Euclidean norm of the gradient.
    max_iterations : int or None, optional
        Upper bound on the number of updates. `None` (the default) keeps the
        original behaviour of iterating until the tolerance is met.

    Returns
    -------
    numpy.ndarray
        The weights after the last update, as floats.

    Raises
    ------
    ValueError
        If the gradient magnitude becomes infinite or NaN, which means the
        iteration has diverged and would otherwise never terminate.
    """
    # Force a float copy: with integer initial weights an integer array would
    # silently truncate every update and the loop could spin forever.
    weights = np.array(initial_weights, dtype=float)

    iteration = 0
    while max_iterations is None or iteration < max_iterations:
        iteration += 1

        errors = features_matrix.dot(weights) - output

        # All partial derivatives at once; every component is evaluated with
        # the same errors, i.e. this is a simultaneous update of the weights.
        gradient = 2 * features_matrix.T.dot(errors)
        gradient_magnitude = np.sqrt(np.sum(gradient ** 2))

        if not np.isfinite(gradient_magnitude):
            # NaN never compares below the tolerance, so fail loudly instead
            # of looping forever.
            raise ValueError(
                'gradient descent diverged (non-finite gradient); try a smaller step_size')

        weights -= step_size * gradient

        # The check comes after the update, so the returned weights include
        # the step taken with the gradient that satisfied the tolerance.
        if gradient_magnitude < tolerance:
            break

    return weights

In [65]:
# Simple model: price as a function of sqft_living only (get_numpy_data
# prepends the constant column, so there are two weights).
simple_features = ['sqft_living']
my_output = 'price'
simple_feature_matrix, output = get_numpy_data(train_data, simple_features, my_output)
# Starting weights, ordered like the matrix columns: [constant, sqft_living].
initial_weights = np.array([-47000., 1.])
# Gradient-descent hyper-parameters passed to regression_gradient_descent:
# the per-update step and the gradient-magnitude threshold for convergence.
step_size = 7e-12
tolerance = 2.5e7

In [66]:
# Fit the simple model on the training data.
simple_weights = regression_gradient_descent(simple_feature_matrix, output, initial_weights, step_size, tolerance)

In [67]:
simple_weights

array([-46999.88716555,    281.91211918])

In [68]:
# Build the matching feature matrix and price vector from the test set.
test_simple_features = ['sqft_living']
test_simple_feature_matrix, test_output = get_numpy_data(test_data, test_simple_features, my_output)

In [61]:
# NOTE(review): this fits the simple model on the TEST set. The resulting
# test_weights are only displayed; the predictions in the visible cells use
# simple_weights (trained on the training set). Confirm this cell is intended.
test_weights = regression_gradient_descent(test_simple_feature_matrix, test_output, initial_weights, step_size, tolerance)

In [62]:
test_weights

array([-46999.87880043,    282.3594539 ])

In [69]:
# Predict test-set prices with the weights learned on the training set.
test_outcome = predict_outcome(test_simple_feature_matrix, simple_weights)

In [70]:
test_outcome[0]

356134.44325500238

In [49]:
def get_rss(predictions, target):
    """Return the residual sum of squares between `predictions` and `target`."""
    residuals = predictions - target
    return np.sum(np.square(residuals))
    

In [50]:
# Residual sum of squares of the simple model's predictions against the test prices.
get_rss(test_outcome, test_output)

275395691278132.81

In [51]:
# Second model: price from sqft_living and sqft_living15 (plus the constant
# column prepended by get_numpy_data, hence three weights).
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
# NOTE: this reassigns `output`, which the simple-model cell also defined.
feature_matrix, output = get_numpy_data(train_data, model_features,my_output)
# Starting weights, ordered like the matrix columns: [constant, sqft_living, sqft_living15].
initial_weights = np.array([-100000., 1., 1.])
# Gradient-descent hyper-parameters for this model (step and convergence threshold).
step_size = 4e-12
tolerance = 1e9

In [52]:
# Fit the two-feature model on the training data.
model_weights = regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance)

In [53]:
# Test-set feature matrix for the two-feature model (also reassigns test_output).
test_feature_matrix, test_output = get_numpy_data(test_data, model_features, my_output)

In [54]:
# Predict test-set prices with the two-feature model.
model2_predictions = predict_outcome(test_feature_matrix, model_weights)

In [55]:
# Compare the first prediction with the actual price of the first test house (Python 2 print statement).
print model2_predictions[0], test_output[0]

366651.411629 310000.0


In [56]:
# Residual sum of squares of the two-feature model's predictions against the test prices.
get_rss(model2_predictions, test_output)

270263443629803.56