In [1]:
import pandas as pd

In [5]:
# Explicit column dtypes handed to read_csv for both CSV files, grouped by
# type so the schema can be read at a glance.
dtype_dict = {
    # read as text
    'id': str,
    'date': str,
    'zipcode': str,
    'floors': str,
    # read as floating point
    'price': float,
    'bedrooms': float,
    'bathrooms': float,
    'sqft_living': float,
    'sqft_living15': float,
    'sqft_lot15': float,
    'lat': float,
    'long': float,
    # read as integers
    'sqft_lot': int,
    'sqft_above': int,
    'sqft_basement': int,
    'waterfront': int,
    'view': int,
    'condition': int,
    'grade': int,
    'yr_built': int,
    'yr_renovated': int,
}

# Training and test splits are parsed with the same dtype mapping.
train_data = pd.read_csv('kc_house_train_data.csv', dtype=dtype_dict)
test_data = pd.read_csv('kc_house_test_data.csv', dtype=dtype_dict)

# Simple linear regression function

In [11]:
def simple_linear_regression(input_feature, output):
    """Fit ``output ~ intercept + slope * input_feature`` by closed-form least squares.

    The slope is
        (sum(x*y) - sum(x)*sum(y)/n) / (sum(x*x) - sum(x)**2/n)
    and the intercept is ``mean(y) - slope * mean(x)``.

    Parameters
    ----------
    input_feature, output
        Equal-length numeric containers that support ``len()``, element-wise
        ``*`` and ``.sum()`` (this notebook passes pandas Series).

    Returns
    -------
    tuple
        ``(intercept, slope)`` as Python floats.

    Raises
    ------
    ValueError
        If the data is empty, the two inputs differ in length, or the slope
        denominator is exactly zero (e.g. every ``input_feature`` value is
        identical), in which case the slope is undefined.
    """
    n = len(output)
    if n == 0:
        raise ValueError("cannot fit a regression on empty data")
    if len(input_feature) != n:
        raise ValueError("input_feature and output must have the same length")

    # Coerce every aggregate to float: under Python 2 the '/' operator floors
    # when both operands are integers, which would silently truncate the
    # slope and intercept for integer-valued data.
    n = float(n)
    sum_input = float(input_feature.sum())
    sum_output = float(output.sum())
    sum_prod_input_output = float((input_feature * output).sum())
    sum_sq_input = float((input_feature * input_feature).sum())

    slope_nume = sum_prod_input_output - (sum_input * sum_output) / n
    slope_deno = sum_sq_input - (sum_input * sum_input) / n
    if slope_deno == 0:
        raise ValueError("input_feature has no variation; the slope is undefined")
    slope = slope_nume / slope_deno

    intercept = (sum_output / n) - slope * (sum_input / n)

    return (intercept, slope)

In [12]:
# Toy data lying exactly on the line y = 1 + 1*x, so the fit should recover
# an intercept of 1 and a slope of 1.
test_feature = pd.Series(range(5))
test_output = pd.Series(1 + 1*test_feature)
(test_intercept, test_slope) = simple_linear_regression(test_feature, test_output)
# Single-argument print(...) prints the same text under Python 2 and is also
# valid Python 3 syntax.
print("Intercept: " + str(test_intercept))
print("Slope: " + str(test_slope))

Intercept: 1
Slope: 1


In [13]:
# Fit price as a linear function of living-area square footage on the
# training data.
sqft_intercept, sqft_slope = simple_linear_regression(train_data['sqft_living'], train_data['price'])

# Single-argument print(...) works on both Python 2 and Python 3.
print("Intercept: " + str(sqft_intercept))
print("Slope: " + str(sqft_slope))

Intercept: -47116.0790729
Slope: 281.95883963


# Predicting Values

In [16]:
def get_regression_predictions(input_feature, intercept, slope):
    """Evaluate the fitted line ``intercept + slope * input_feature``.

    ``input_feature`` may be anything that supports ``*`` and ``+`` with the
    coefficients; this notebook calls it with a single number and with
    pandas Series columns. The result has the same form as ``input_feature``.
    """
    return intercept + slope * input_feature

In [18]:
# Predict the price of a single house from its living area using the
# square-footage model fitted on the training data.
my_house_sqft = 2650
estimated_price = get_regression_predictions(my_house_sqft, sqft_intercept, sqft_slope)
# Single-argument print(...) works on both Python 2 and Python 3.
print("The estimated price for a house with %d squarefeet is $%.2f" % (my_house_sqft, estimated_price))

The estimated price for a house with 2650 squarefeet is $700074.85


# Residual Sum of Squares

In [19]:
def get_residual_sum_of_squares(input_feature, output, intercept, slope):
    """Return the residual sum of squares (RSS) of a fitted line.

    Predictions are obtained from ``get_regression_predictions`` using the
    given ``intercept`` and ``slope``; the residuals ``output - prediction``
    are squared and summed. The sign of each residual is irrelevant because
    it is squared.
    """
    fitted = get_regression_predictions(input_feature, intercept, slope)
    errors = output - fitted
    squared_errors = errors * errors
    return squared_errors.sum()

In [20]:
# Sanity check: the toy data was fitted exactly, so the RSS shown should be 0.
get_residual_sum_of_squares(
    input_feature=test_feature,
    output=test_output,
    intercept=test_intercept,
    slope=test_slope,
)

0

In [21]:
# RSS of the square-footage model evaluated on the TRAINING data.
rss_prices_on_sqft = get_residual_sum_of_squares(train_data['sqft_living'], train_data['price'], sqft_intercept, sqft_slope)
# Single-argument print(...) works on both Python 2 and Python 3.
print('The RSS of predicting Prices based on Square Feet is : ' + str(rss_prices_on_sqft))

The RSS of predicting Prices based on Square Feet is : 1.20191835418e+15


# Predict the square footage given the price

In [22]:
def inverse_regression_predictions(output, intercept, slope):
    """Invert the fitted line: solve ``output = intercept + slope * x`` for ``x``.

    Parameters
    ----------
    output
        Target value(s) of the regression output (a number, or anything that
        supports ``-`` and ``/`` with the coefficients).
    intercept, slope
        Scalar coefficients of the fitted line.

    Returns
    -------
    The estimated input feature, ``(output - intercept) / slope``.

    Raises
    ------
    ValueError
        If ``slope`` is zero, because a flat line cannot be inverted.
    """
    if slope == 0:
        raise ValueError("slope is zero; the regression line cannot be inverted")
    # float() forces true division: under Python 2, '/' would floor the
    # result when output, intercept and slope are all integers.
    estimated_feature = (output - intercept) / float(slope)
    return estimated_feature

In [23]:
# Invert the square-footage model: estimate the living area of a house
# from its price.
my_house_price = 800000
estimated_squarefeet = inverse_regression_predictions(my_house_price, sqft_intercept, sqft_slope)
# Single-argument print(...) works on both Python 2 and Python 3.
print("The estimated squarefeet for a house worth $%.2f is %d" % (my_house_price, estimated_squarefeet))

The estimated squarefeet for a house worth $800000.00 is 3004


# New Model: estimate prices from bedrooms

In [24]:
# Second model: fit price as a linear function of the number of bedrooms on
# the training data.
bedroom_intercept, bedroom_slope = simple_linear_regression(train_data['bedrooms'], train_data['price'])

# Single-argument print(...) works on both Python 2 and Python 3.
print("Intercept: " + str(bedroom_intercept))
print("Slope: " + str(bedroom_slope))

Intercept: 109473.177623
Slope: 127588.952934


In [26]:
# Compare both models on the TEST data.
# NOTE: this rebinds rss_prices_on_sqft, which an earlier cell set to the
# training-set RSS; from here on it holds the test-set value.
rss_prices_on_sqft = get_residual_sum_of_squares(test_data['sqft_living'], test_data['price'], sqft_intercept, sqft_slope)
# Single-argument print(...) works on both Python 2 and Python 3.
print('The RSS of predicting Prices based on Square Feet is : ' + str(rss_prices_on_sqft))
rss_prices_on_bedrooms = get_residual_sum_of_squares(test_data['bedrooms'], test_data['price'], bedroom_intercept, bedroom_slope)
print('The RSS of predicting Prices based on # Bedrooms is : ' + str(rss_prices_on_bedrooms))

The RSS of predicting Prices based on Square Feet is : 2.75402933618e+14
The RSS of predicting Prices based on # Bedrooms is : 4.9336458596e+14
