In [1]:
from copy import deepcopy
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# A

In [2]:
def get_RSS(y, y_hat):
    """Return the residual sum of squares between observations and predictions.

    ``y`` and ``y_hat`` must support element-wise subtraction (for example
    NumPy arrays or pandas Series); the squared differences are accumulated
    with the built-in ``sum``.
    """
    residuals = y - y_hat
    return sum(residual * residual for residual in residuals)

In [3]:
# Read the full, test and training CSV files (in that order) into DataFrames.
data_full, data_test, data_train = [
    pd.read_csv(csv_name)
    for csv_name in ("kc_house_data.csv", "kc_house_test_data.csv", "kc_house_train_data.csv")
]

In [4]:
# Add the same four derived columns to each of the three data sets.
for frame in (data_full, data_test, data_train):
    # bedrooms multiplied by itself
    frame["bedrooms_squared"] = frame["bedrooms"] * frame["bedrooms"]
    # bedrooms x bathrooms interaction term
    frame["bed_bath_rooms"] = frame["bedrooms"] * frame["bathrooms"]
    # natural logarithm of the living area
    frame["log_sqft_living"] = np.log(frame["sqft_living"])
    # latitude plus longitude
    frame["lat_plus_long"] = frame["lat"] + frame["long"]

In [5]:
# Formulas for the three nested models; each one extends the previous feature set.
formula_model1 = "price ~ sqft_living + bedrooms + bathrooms + lat + long"
formula_model2 = "price ~ sqft_living + bedrooms + bathrooms + lat + long + bed_bath_rooms"
formula_model3 = ("price ~ sqft_living + bedrooms + bathrooms + lat + long + bed_bath_rooms"
                  "+ bedrooms_squared + log_sqft_living + lat_plus_long")

# Fit every model on the training data and print its summary right after fitting.
model1 = smf.ols(formula_model1, data=data_train)
model1_results = model1.fit()
print(model1_results.summary())

model2 = smf.ols(formula_model2, data=data_train)
model2_results = model2.fit()
print(model2_results.summary())

model3 = smf.ols(formula_model3, data=data_train)
model3_results = model3.fit()
print(model3_results.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.593
Model:                            OLS   Adj. R-squared:                  0.592
Method:                 Least Squares   F-statistic:                     5056.
Date:                Sun, 21 May 2017   Prob (F-statistic):               0.00
Time:                        13:08:20   Log-Likelihood:            -2.3973e+05
No. Observations:               17384   AIC:                         4.795e+05
Df Residuals:                   17378   BIC:                         4.795e+05
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------
Intercept   -6.908e+07   1.65e+06    -41.940      

# Question a1: What is the mean value (arithmetic average) of the 'bedrooms_squared' feature on TEST data? (round to 2 decimal places) 

In [6]:
# Mean of 'bedrooms_squared' on the test data, formatted to two decimal places.
format(np.average(data_test["bedrooms_squared"]), ".2f")

'12.45'

# Question a2: What is the mean value (arithmetic average) of the 'bed_bath_rooms' feature on TEST data? (round to 2 decimal places) 

In [7]:
# Mean of 'bed_bath_rooms' on the test data, formatted to two decimal places.
format(np.average(data_test["bed_bath_rooms"]), ".2f")

'7.50'

# Question a3: What is the mean value (arithmetic average) of the 'log_sqft_living' feature on TEST data? (round to 2 decimal places) 

In [8]:
# Mean of 'log_sqft_living' on the test data, formatted to two decimal places.
format(np.average(data_test["log_sqft_living"]), ".2f")

'7.55'

# Question a4: What is the mean value (arithmetic average) of the 'lat_plus_long' feature on TEST data? (round to 2 decimal places)

In [9]:
# Mean of 'lat_plus_long' on the test data, formatted to two decimal places.
format(np.average(data_test["lat_plus_long"]), ".2f")

'-74.65'

# Question a5: What is the sign (positive or negative) for the coefficient/weight for 'bathrooms' in model 1?

In [10]:
# Print the sign of the 'bathrooms' coefficient in model 1 ("0" when it is neither > 0 nor < 0).
bathrooms_coef_m1 = model1_results.params["bathrooms"]
print("+" if bathrooms_coef_m1 > 0 else "-" if bathrooms_coef_m1 < 0 else "0")

+


# Question a6: What is the sign (positive or negative) for the coefficient/weight for 'bathrooms' in model 2?

In [11]:
# Print the sign of the 'bathrooms' coefficient in model 2 ("0" when it is neither > 0 nor < 0).
bathrooms_coef_m2 = model2_results.params["bathrooms"]
print("+" if bathrooms_coef_m2 > 0 else "-" if bathrooms_coef_m2 < 0 else "0")

-


# Question a7: Which model (1, 2 or 3) has the lowest RSS on TRAINING data?

In [12]:
# Training RSS of each fitted model, computed from the residuals stored on the fit results.
RSS_m1_train = sum(model1_results.resid ** 2)
RSS_m2_train = sum(model2_results.resid ** 2)
RSS_m3_train = sum(model3_results.resid ** 2)

print("Model 1: RSS_train", RSS_m1_train)
print("Model 2: RSS_train", RSS_m2_train)
print("Model 3: RSS_train", RSS_m3_train)

if RSS_m1_train < RSS_m2_train:
    # Model 2 is already beaten; Model 1 wins unless Model 3 is at least as good as it.
    if RSS_m2_train < RSS_m3_train or RSS_m1_train < RSS_m3_train:
        print("Minimum RSS for training data is Model 1")
    else:
        print("Minimum RSS for training data is Model 3")
elif RSS_m1_train >= RSS_m2_train and RSS_m2_train < RSS_m3_train:
    print("Minimum RSS for training data is Model 2")
else:
    print("Minimum RSS for training data is Model 3")

Model 1: RSS_train 9.6787996305e+14
Model 2: RSS_train 9.58419635074e+14
Model 3: RSS_train 9.0343645505e+14
Minimum RSS for training data is Model 3


# Question a8: Which model (1, 2 or 3) has the lowest RSS on TESTING data?

In [13]:
# RSS of each fitted model on the held-out test data.
RSS_m1_test = get_RSS(data_test["price"], model1_results.predict(data_test))
RSS_m2_test = get_RSS(data_test["price"], model2_results.predict(data_test))
RSS_m3_test = get_RSS(data_test["price"], model3_results.predict(data_test))

print("Model 1: RSS_test", RSS_m1_test)
print("Model 2: RSS_test", RSS_m2_test)
print("Model 3: RSS_test", RSS_m3_test)

rss_test_by_model = {1: RSS_m1_test, 2: RSS_m2_test, 3: RSS_m3_test}
# Scan from the highest model number down so that an exact tie still resolves
# to the later model, as the previous if/elif ladder did.
best_test_model = min((3, 2, 1), key=rss_test_by_model.get)
# The message now names the testing data; it previously said "training data".
print("Minimum RSS for testing data is Model {}".format(best_test_model))

Model 1: RSS_test 2.25500469795e+14
Model 2: RSS_test 2.23377462976e+14
Model 3: RSS_test 2.59236319207e+14
Minimum RSS for training data is Model 2


# B

In [43]:
def get_numpy_data(data, features, output):
    """Build the design matrix and the target column vector for a regression.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table. It is only read; no column is added to it.
    features : iterable of str
        Names of the columns of ``data`` to use as features, in order.
    output : str
        Name of the target column of ``data``.

    Returns
    -------
    (features_matrix, output_array) : tuple of numpy.matrix
        ``features_matrix`` has one row per row of ``data``: a leading column
        of ones (the intercept term) followed by the requested features.
        ``output_array`` is the target as an n x 1 column vector.
    """
    # Start with the intercept column, then append each feature column.
    columns = [np.ones(len(data), dtype=np.int64)]
    columns.extend(data[name].values for name in features)

    # column_stack always yields a 2-D (n, k) array, even when `features` is
    # empty, so the matrix is never accidentally laid out as a single row.
    features_matrix = np.matrix(np.column_stack(columns))

    output_array = np.transpose(np.matrix(data[output]))  # Transpose is used to create a column vector
    return (features_matrix, output_array)

In [44]:
def predict_outcome(feature_matrix, weights):
    """Return the linear-model predictions: the dot product of ``feature_matrix`` and ``weights``."""
    return np.dot(feature_matrix, weights)

In [55]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance):
    """Fit linear-regression weights by gradient descent on the residual sum of squares.

    Each iteration computes the gradient ``-2 * X^T (output - X w)``, moves
    every weight by ``-step_size`` times its gradient component, and stops once
    the gradient magnitude (computed before the update) is below ``tolerance``.

    Parameters
    ----------
    feature_matrix : array-like, one row per observation
    output : array-like, the observed targets
    initial_weights : starting weights; a deep copy is updated, the argument is left untouched
    step_size : multiplier applied to the gradient at each update
    tolerance : threshold on the gradient magnitude

    Returns
    -------
    The weights after the last update, in the same container type as
    ``initial_weights``.

    Raises
    ------
    FloatingPointError
        If the gradient magnitude becomes inf or NaN (the iteration has
        diverged, typically because ``step_size`` is too large). Without this
        check the loop would never terminate, since NaN is never below the
        tolerance.
    """
    weights = deepcopy(initial_weights)
    # The transpose does not change between iterations, so compute it once.
    features_transposed = np.transpose(feature_matrix)

    while True:
        output_estimate = predict_outcome(feature_matrix, weights)
        derivative = -2 * np.dot(features_transposed, output - output_estimate)
        # Flatten so that each component is a plain scalar.
        gradient = np.asarray(derivative).ravel()

        gradient_sum_squares = 0.0
        for i in range(len(weights)):
            gradient_sum_squares = gradient_sum_squares + gradient[i] * gradient[i]
            weights[i] = weights[i] - step_size * gradient[i]

        gradient_magnitude = np.sqrt(gradient_sum_squares)
        if not np.isfinite(gradient_magnitude):
            raise FloatingPointError(
                "gradient descent diverged (non-finite gradient); try a smaller step_size")
        if gradient_magnitude < tolerance:
            return weights

In [56]:
def regression_gradient_descent2(feature_matrix, output, initial_weights, step_size, tolerance):
    """Gradient descent for linear regression that stops when the weights stop moving.

    Each iteration takes one step ``w - step_size * gradient`` with
    ``gradient = -2 * X^T (output - X w)`` and stops once the sum of squared
    weight changes for that step is below ``tolerance``.

    Parameters
    ----------
    feature_matrix : array-like, one row per observation
    output : array-like, the observed targets
    initial_weights : starting weights; a deep copy is used, the argument is left untouched
    step_size : multiplier applied to the gradient at each update
    tolerance : threshold on the squared length of the weight update

    Returns
    -------
    The weights after the last update.

    Raises
    ------
    FloatingPointError
        If the weight update becomes inf or NaN (the iteration has diverged);
        otherwise the loop would never terminate.
    """
    weights = deepcopy(initial_weights)
    # The transpose does not change between iterations, so compute it once.
    features_transposed = np.transpose(feature_matrix)

    while True:
        output_estimate = predict_outcome(feature_matrix, weights)
        derivative = -2 * np.dot(features_transposed, output - output_estimate)
        updated_weights = weights - step_size * derivative

        # ravel (not squeeze) keeps a 1-D array even when there is a single
        # weight, so the summation below also works for one-parameter models.
        weight_update_change = np.ravel(np.asarray(weights - updated_weights))
        weights = updated_weights

        weight_update_change_sum_squared = np.sum(weight_update_change ** 2)
        if not np.isfinite(weight_update_change_sum_squared):
            raise FloatingPointError(
                "gradient descent diverged (non-finite weight update); try a smaller step_size")
        if weight_update_change_sum_squared < tolerance:
            return weights

In [57]:
# Work on a copy so that building the feature matrix cannot modify data_train.
data = deepcopy(data_train)
simple_features = ["sqft_living"]
my_output = "price"
# Pass the copy made above (the training frame itself was passed before,
# which left the copy unused).
(simple_feature_matrix, output) = get_numpy_data(data, simple_features, my_output)
initial_weights = np.array([[-47000.], [1.]])
step_size = 7e-12
tolerance = 2.5e7
simple_weights = regression_gradient_descent(simple_feature_matrix,
                                             output, initial_weights, step_size, tolerance)
simple_weights

array([[-46999.88716555],
       [   281.91211918]])

In [58]:
# Work on a copy so that building the feature matrix cannot modify data_train
# (the same pattern the prediction cells use for the test data).
data = deepcopy(data_train)
model_features = ["sqft_living", "sqft_living15"]
my_output = "price"
(feature_matrix, output) = get_numpy_data(data, model_features, my_output)
initial_weights = np.array([[-100000.], [1.], [1.]])
step_size = 4e-12
tolerance = 1e9
model_weights = regression_gradient_descent(feature_matrix,
                                            output, initial_weights, step_size, tolerance)
model_weights

array([[ -9.99999688e+04],
       [  2.45072603e+02],
       [  6.52795267e+01]])

# Question B1: What is the value of the weight for sqft_living from your gradient descent predicting house prices (model 1)? Round your answer to 1 decimal place.

In [59]:
# Weight at position 1 of the simple model (after the intercept), to one decimal place.
format(np.squeeze(np.asarray(simple_weights))[1], ".1f")

'281.9'

# Question B2: What is the predicted price for the 1st house in the TEST data set for model 1 (round to nearest dollar)?

In [60]:
# Predict test-set prices with the simple model and show the first prediction.
data = data_test.copy(deep=True)
features = simple_features
my_output = "price"
feature_matrix, output = get_numpy_data(data, features, my_output)
weights = simple_weights
# Flatten the prediction column into a 1-D array.
simple_predictions = np.asarray(predict_outcome(feature_matrix, weights)).squeeze()
format(simple_predictions[0], ".0f")

'356134'

# Question B3: What is the predicted price for the 1st house in the TEST data set for model 2 (round to nearest dollar)?

In [61]:
# Predict test-set prices with the two-feature model and show the first prediction.
data = data_test.copy(deep=True)
features = model_features
my_output = "price"
feature_matrix, output = get_numpy_data(data, features, my_output)
weights = model_weights
# Flatten the prediction column into a 1-D array.
model_predictions = np.asarray(predict_outcome(feature_matrix, weights)).squeeze()
format(model_predictions[0], ".0f")

'366651'

# Question B4: Which estimate was closer to the true price for the 1st house on the TEST data set, model 1 or model 2? 

In [62]:
# Absolute error of each model's prediction for the first test house.
first_true_price = data_test["price"][0]
simple_abs_diff = abs(first_true_price - simple_predictions[0])
model_abs_diff = abs(first_true_price - model_predictions[0])
print("simple_abs_diff: ", simple_abs_diff)
print("model_abs_diff: ", model_abs_diff)
# A tie goes to Model 1.
print("Model 1" if simple_abs_diff <= model_abs_diff else "Model 2")

simple_abs_diff:  46134.443255
model_abs_diff:  56651.4116295
Model 1


# Question B5: Which model (1 or 2) has the lowest RSS on all of the TEST data?

In [63]:
# Residuals (prediction minus observed price) of both models on the test data.
simple_resid = simple_predictions - data_test["price"]
model_resid = model_predictions - data_test["price"]

# Residual sum of squares for each model.
simple_RSS = sum(simple_resid ** 2)
model_RSS = sum(model_resid ** 2)

print("simple_RSS: ", simple_RSS)
print("model_RSS: ", model_RSS)
# A tie goes to Model 1.
print("Model 1" if simple_RSS <= model_RSS else "Model 2")

simple_RSS:  2.75400044902e+14
model_RSS:  2.7026344363e+14
Model 2
