In [3]:
import pandas as pd
import numpy as np

In [6]:
house_data = pd.read_csv("housePrice/kc_house_data.csv",
                         dtype = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 
                                  'sqft_living15':float, 'grade':int, 'yr_renovated':int, 
                                  'price':float, 'bedrooms':float, 'zipcode':str, 
                                  'long':float, 'sqft_lot15':float, 'sqft_living':float, 
                                  'floors':str, 'condition':int, 'lat':float, 'date':str,
                                  'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 
                                  'view':int})
train_data = pd.read_csv("housePrice/kc_house_train_data.csv",
                         dtype = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 
                                  'sqft_living15':float, 'grade':int, 'yr_renovated':int, 
                                  'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 
                                  'sqft_lot15':float, 'sqft_living':float, 'floors':str, 
                                  'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 
                                  'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int})
test_data = pd.read_csv("housePrice/kc_house_test_data.csv",
                        dtype = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 
                                 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 
                                 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 
                                 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 
                                 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str,
                                 'sqft_lot':int, 'view':int})

In [13]:
price = house_data['price']

In [24]:
price.mean()

540088.1417665294

# Build a generic simple linear regression function 

In [26]:
def simple_linear_regression(input_feature,output):
    numerator = (input_feature * output).mean(axis=0) - (output.mean(axis=0))*(input_feature.mean(axis=0))
    denominator = (input_feature**2).mean(axis=0) - input_feature.mean(axis=0) * input_feature.mean(axis=0)
    slope = numerator/denominator
    intercept = output.mean(axis=0) - slope * (input_feature.mean(axis=0))
    return (intercept, slope)

Run some tests on the simple_linear_regression model 

In [32]:
test_feature = pd.Series(range(5))
test_output = pd.Series(3+5*test_feature)
(test_intercept, test_slope) = simple_linear_regression(test_feature,test_output)
print ('Intercept:' + str(test_intercept))
print ('Slope:' + str(test_slope))

Intercept:3.0
Slope:5.0


Now we know the simple regression worked, let train our training data.

In [33]:
sqft_intercept, sqft_slope = simple_linear_regression(train_data['sqft_living'], train_data['price'])
print ('Intercept:' + str(sqft_intercept))
print ('Slope:' + str(sqft_slope))

Intercept:-47116.07907289488
Slope:281.95883963034294


## Predicting Values
convert panda dataframe to numpy array 

In [34]:
def get_regression_predictions(input_feature, intercept, slope):
    predicted_values = intercept + slope * input_feature
    return predicted_values

In [35]:
my_house_sqft = 2650
estimated_price = get_regression_predictions(my_house_sqft, sqft_intercept, sqft_slope)
print ("The estimated price for a house with %d squarefeet is $%.2f" % (my_house_sqft, estimated_price))

The estimated price for a house with 2650 squarefeet is $700074.85


# Residual Sum of Squares

rewrite function using lambda

In [39]:
def get_residual_sum_of_squares(input_feature, output, intercept, slope):
    # First get the predictions
    predictions = input_feature.apply(lambda x:get_regression_predictions(x, intercept, slope))
    diff = np.subtract(predictions,output)
    # square the residuals and add them up
    RSS = np.vdot(diff,diff)
    return(RSS)

In [42]:
get_residual_sum_of_squares(test_feature,test_output,test_intercept,test_slope) #test pass when return 0 

0.0

In [55]:
#compute the RSS on training data
rss_on_sqft = get_residual_sum_of_squares(train_data['sqft_living'], train_data['price'], sqft_intercept, sqft_slope)
print ('The RSS of predicting Prices based on Square Feet is : ' + str(rss_on_sqft))

The RSS of predicting Prices based on Square Feet is : 1.20191835418e+15


# Predict the squarefeet given price

In [47]:
#inverse prediction, predict squarefoot given the price? 
def inverse_regression_predictions(output, intercept, slope):
    estimated_input = (output - intercept)/slope
    return(estimated_input)

In [48]:
my_house_price = 800000
print (inverse_regression_predictions(my_house_price,sqft_intercept,sqft_slope))

3004.3962451522752


# Which feature has the lowest RSS on TEST data 

In [54]:
#Estimate the slope and intercept for feature 'bedroom'
sqft_bed_intercept, sqft_bed_slope = simple_linear_regression(train_data['bedrooms'], train_data['price'])
print ("bedroom Intercept: "+ str(sqft_bed_intercept))
print ("bedroom Slope: " + str(sqft_bed_slope))

sqft_living Intercept: 109473.17762295791
sqft_living Slope: 127588.95293398833


In [50]:
# def get_residual_sum_of_squares(input_feature, output, intercept,slope):
#     RSS = (((intercept + input_feature*slope) - output)**2).sum(axis=0)
#     return(RSS)

In [57]:
#which model has better RSS on test data 
RSS_sqf = get_residual_sum_of_squares(test_data['sqft_living'],test_data['price'],sqft_intercept,sqft_slope)
RSS_bed = get_residual_sum_of_squares(test_data['bedrooms'],test_data['price'],sqft_bed_intercept,sqft_bed_intercept)
print ('The RSS of predicting Prices based on Bedrooms is : ' + str(RSS_bed))
print ('The RSS of predicting Prices based on Square Feet is : ' + str(RSS_sqf))
print (RSS_sqf - RSS_bed)

The RSS of predicting Prices based on Bedrooms is : 5.08067591494e+14
The RSS of predicting Prices based on Square Feet is : 2.75402933618e+14
-2.32664657876e+14
