## Week two - Multiple Linear Regression
### Assignment Two

<p>First, import the necessary libraries. Then load the training and test data into pandas DataFrames.</p>

In [34]:
import pandas as pd
import numpy as np
import math

In [35]:
# Explicit column dtypes handed to read_csv for both CSV files,
# grouped here by the type each column is parsed as.
dtype_dict = {
    # parsed as float
    'bathrooms': float, 'bedrooms': float, 'price': float,
    'sqft_living': float, 'sqft_living15': float, 'sqft_lot15': float,
    'lat': float, 'long': float,
    # parsed as int
    'waterfront': int, 'view': int, 'condition': int, 'grade': int,
    'sqft_above': int, 'sqft_basement': int, 'sqft_lot': int,
    'yr_built': int, 'yr_renovated': int,
    # parsed as str
    'id': str, 'date': str, 'zipcode': str, 'floors': str,
}

# Training and test sets are read with the same dtype mapping.
training_data = pd.read_csv('kc_house_train_data.csv', dtype=dtype_dict)
test_data = pd.read_csv('kc_house_test_data.csv', dtype=dtype_dict)

### Function to return the numpy feature matrix and output array

In [36]:
def get_numpy_data(data_frame, features, output):
    """Extract a feature matrix and an output array from a DataFrame.

    A column named 'constant' holding the value 1 is written into
    ``data_frame`` (in place) and placed first in the feature matrix, so the
    first weight of a linear model built on the matrix acts as the intercept.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Source data. Gains (or has overwritten) a 'constant' column.
    features : sequence of str
        Names of the feature columns to extract. Not modified.
    output : str
        Name of the target column.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``features_matrix`` with one row per row of ``data_frame`` and columns
        ``['constant'] + features``, and ``output_array`` holding the values
        of the ``output`` column.
    """
    # add a constant column to the dataframe
    data_frame['constant'] = 1
    # prepend 'constant' to a copy of the features list; list() also lets
    # callers pass a tuple or other sequence of column names
    feature_columns = ['constant'] + list(features)
    # .values replaces DataFrame.as_matrix(), which was deprecated in
    # pandas 0.23 and removed in pandas 1.0
    features_matrix = data_frame[feature_columns].values
    # same replacement for the target Series
    output_array = data_frame[output].values
    return (features_matrix, output_array)

### Function returns predictions given feature values and weights

In [56]:
def predict_outcome(feature_matrix, weights):
    """Return the linear-model predictions for the given features.

    The result is the dot product of ``feature_matrix`` with ``weights``:
    one value per row when a 2-D matrix is passed, or a single value when a
    single 1-D feature row is passed.
    """
    predictions = np.dot(feature_matrix, weights)
    return predictions

In [57]:
def feature_derivative(errors, feature):
    """Return twice the dot product of ``errors`` and ``feature``.

    When ``feature`` is a single column the result is a scalar; when it is a
    whole feature matrix the result has one entry per column.
    """
    errors_dot_feature = np.dot(errors, feature)
    return 2 * errors_dot_feature

### Function to implement the gradient descent algorithm

In [58]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance,
                                max_iterations=None):
    """Fit linear-regression weights by batch gradient descent.

    Each iteration computes the errors (predictions minus ``output``), takes
    the gradient from ``feature_derivative`` over the whole feature matrix,
    and moves the weights by ``step_size`` against it. The loop stops after
    the first update whose gradient magnitude was below ``tolerance``.

    Parameters
    ----------
    feature_matrix : numpy.ndarray
        Feature matrix, one row per observation.
    output : numpy.ndarray
        Observed target values, one per row of ``feature_matrix``.
    initial_weights : array-like
        Starting weights; copied, never modified.
    step_size : float
        Multiplier applied to the gradient on each update.
    tolerance : float
        Convergence threshold on the gradient's Euclidean norm.
    max_iterations : int or None, optional
        Upper bound on the number of updates. ``None`` (the default) keeps
        the original behaviour of iterating until convergence.

    Returns
    -------
    numpy.ndarray
        The weights after the last update. If ``max_iterations`` is reached
        first, these are the weights at that point (not converged).

    Raises
    ------
    FloatingPointError
        If the gradient magnitude becomes inf or NaN. Without this check the
        convergence test can never succeed and the loop would run forever.
    """
    weights = np.array(initial_weights)
    iterations = 0
    while max_iterations is None or iterations < max_iterations:
        # errors are predictions - output
        errors = predict_outcome(feature_matrix, weights) - output
        derivative = feature_derivative(errors, feature_matrix)
        # Euclidean norm of the gradient, computed in numpy rather than with
        # the Python-level builtin sum
        gradient_magnitude = np.sqrt(np.sum(derivative ** 2))
        if not np.isfinite(gradient_magnitude):
            raise FloatingPointError(
                "gradient magnitude is not finite after %d iterations; "
                "step_size is probably too large" % iterations)
        # update the weights based on step size and derivative; as before,
        # the update is applied even on the iteration that meets tolerance
        weights = weights - step_size * derivative
        iterations += 1
        if gradient_magnitude < tolerance:
            break
    return weights

### Train model using the following values

In [59]:
# Model 1: a single feature ('sqft_living') used to predict 'price'.
my_output = 'price'
simple_features = ['sqft_living']
simple_feature_matrix, output = get_numpy_data(training_data, simple_features, my_output)

# Gradient-descent settings for model 1: one starting weight for the
# constant column and one for the single feature.
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

In [60]:
# Fit model 1 on the training data by gradient descent.
simple_weights = regression_gradient_descent(
    simple_feature_matrix,
    output,
    initial_weights,
    step_size,
    tolerance)

In [61]:
simple_weights

array([-46999.88716555,    281.91211918])

### Test model 1 on test data

In [62]:
# Build the model 1 feature matrix and output array from the test set.
test_simple_feature_matrix, test_output = get_numpy_data(
    test_data, simple_features, my_output)

In [63]:
# Model 1 prediction for the first row of the test feature matrix.
first_test_row = test_simple_feature_matrix[0]
predicted_val = predict_outcome(first_test_row, simple_weights)

In [64]:
predicted_val

356134.44325500238

### Calculate the RSS on our test values

In [65]:
# Model 1 predictions for every test row. The previous expression,
# weights[0] + (matrix * weights)[:, 1], was only valid for exactly one
# feature with an all-ones first column; the shared helper computes the same
# intercept + slope * x via a dot product and matches how model 2 is scored.
predictions = predict_outcome(test_simple_feature_matrix, simple_weights)

In [66]:
# Residual sum of squares of model 1 on the test set. np.sum reduces the
# array in numpy instead of iterating element by element with builtin sum.
rss = np.sum((predictions - test_output) ** 2)

In [67]:
rss

275400044902128.78

### Use new features to create a second predictive model

In [68]:
# Model 2: two features ('sqft_living' and 'sqft_living15') used to predict 'price'.
my_output = 'price'
model_features = ['sqft_living', 'sqft_living15']
feature_matrix, output = get_numpy_data(training_data, model_features, my_output)

# Gradient-descent settings for model 2: one starting weight for the
# constant column and one per feature.
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9

In [69]:
# Fit model 2 on the training data by gradient descent.
complex_weights = regression_gradient_descent(
    feature_matrix,
    output,
    initial_weights,
    step_size,
    tolerance)


### Test model 2 on test data

In [70]:
# Build the model 2 feature matrix and output array from the test set.
test_feature_matrix, test_output = get_numpy_data(
    test_data, model_features, my_output)

# Model 2 prediction for the first row of the test feature matrix.
first_test_row = test_feature_matrix[0]
predicted_val = predict_outcome(first_test_row, complex_weights)

In [71]:
predicted_val

366651.41162949387

### Calculate the RSS on our test values

In [72]:
# Model 2 predictions for every test row, via the shared prediction helper
# (the same dot product as before).
predictions = predict_outcome(test_feature_matrix, complex_weights)
# Residual sum of squares of model 2 on the test set. np.sum reduces the
# array in numpy instead of iterating element by element with builtin sum.
rss = np.sum((predictions - test_output) ** 2)

In [73]:
rss

270263443629803.31