In [1]:
import pandas as pd
import numpy as np

In [43]:
house_data = pd.read_csv("housePrice/kc_house_data.csv",
                         dtype = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 
                                  'sqft_living15':float, 'grade':int, 'yr_renovated':int, 
                                  'price':float, 'bedrooms':float, 'zipcode':str, 
                                  'long':float, 'sqft_lot15':float, 'sqft_living':float, 
                                  'floors':str, 'condition':int, 'lat':float, 'date':str,
                                  'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 
                                  'view':int})
train_data = pd.read_csv("housePrice/kc_house_train_data.csv",
                         dtype = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 
                                  'sqft_living15':float, 'grade':int, 'yr_renovated':int, 
                                  'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 
                                  'sqft_lot15':float, 'sqft_living':float, 'floors':str, 
                                  'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 
                                  'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int})
test_data = pd.read_csv("housePrice/kc_house_test_data.csv",
                        dtype = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 
                                 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 
                                 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 
                                 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 
                                 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str,
                                 'sqft_lot':int, 'view':int})

# Build a generic simple linear regression function 

In [9]:
def simple_linear_regression(input_feature,output):
    numerator = (input_feature * output).mean(axis=0) - (output.mean(axis=0))*(input_feature.mean(axis=0))
    denominator = (input_feature**2).mean(axis=0) - input_feature.mean(axis=0) * input_feature.mean(axis=0)
    slope = numerator/denominator
    intercept = output.mean(axis=0) - slope * (input_feature.mean(axis=0))
    return (intercept, slope)

convert panda dataframe to numpy array 

In [10]:
sqft_living = train_data['sqft_living']

In [11]:
sqft_living_list = [i for i in train_data['sqft_living']]

In [14]:
sqft_living_array = np.array(sqft_living_list)

In [16]:
price_list = [m for m in train_data['price']]
price_list_array = np.array(price_list)

In [17]:
intercept_train,slope_train = simple_linear_regression(sqft_living_array, price_list_array)

In [20]:
print ("Intercept: "+ str(intercept_train))
print ("Slope: " + str(slope_train))

Intercept: -47116.0790729
Slope: 281.95883963


# Predicting Values

In [21]:
def get_regression_predictions(input_feature, intercept, slope):
    predicted_output = intercept + input_feature * slope
    return(predicted_output)


In [23]:
input_feature = 2650
print (get_regression_predictions(2650, intercept_train, slope_train))

700074.845948


# Residual Sum of Squares

In [32]:
#def get_residual_sum_of_squares(input_feature, output, intercept,slope):
#    RSS = (((intercept + input_feature*slope) - output)**2).sum(axis=0)
#    return(RSS)
#print (get_residual_sum_of_squares(sqft_living_list,price_list_array,intercept_train,slope_train))

rewrite function using lambda

In [29]:
def get_residual_sum_of_squares(input_feature, output, intercept, slope):
    # First get the predictions
    predictions = input_feature.apply(lambda x:get_regression_predictions(x, intercept, slope))
    # then compute the residuals (since we are squaring it doesn't matter which order you subtract)
    diff = np.subtract(predictions,output)
    # square the residuals and add them up
    RSS = np.vdot(diff,diff)
    return(RSS)

In [31]:
print (get_residual_sum_of_squares(sqft_living,price_list_array,intercept_train,slope_train))

1.20191835418e+15


# Predict the squarefeet given price

In [39]:
def inverse_regression_predictions(output, intercept, slope):
    estimated_input = (output - intercept)/slope
    return(estimated_input)

In [42]:
my_house_price = 800000
print (inverse_regression_predictions(my_house_price,intercept_train,slope_train))
#print （"The estimated squarefeet for a house worth $%.2f is %d" % (my_house_price, estimated_squarefeet)）

3004.39624515


# Which feature has the lowest RSS on TEST data 

In [44]:
sqft_living_array_test = np.array([a for a in test_data['sqft_living']])
bedrooms_array_test = np.array([b for b in test_data['bedrooms']])
price_array_test = np.array([c for c in test_data['price']])

In [45]:
intercept_sqf,slope_sqf = simple_linear_regression(sqft_living_array_test,price_array_test)

In [46]:
print ("sqft_living Intercept: "+ str(intercept_train))
print ("sqft_living Slope: " + str(slope_train))

sqft_living Intercept: -47116.0790729
sqft_living Slope: 281.95883963


In [47]:
intercept_sqf,slope_sqf = simple_linear_regression(bedrooms_array_test,price_array_test)
print ("bedrooms Intercept: "+ str(intercept_train))
print ("bedrooms Slope: " + str(slope_train))

bedrooms Intercept: -47116.0790729
bedrooms Slope: 281.95883963


In [50]:
def get_residual_sum_of_squares(input_feature, output, intercept,slope):
    RSS = (((intercept + input_feature*slope) - output)**2).sum(axis=0)
    return(RSS)

In [51]:
intercept_br, slope_br = simple_linear_regression(bedrooms_array_test,price_array_test)
RSS_sqf = get_residual_sum_of_squares(sqft_living_array_test,price_array_test,intercept_sqf,slope_sqf)
RSS_br = get_residual_sum_of_squares(bedrooms_array_test,price_array_test,intercept_br,slope_br)
print (RSS_sqf - RSS_br)

2.28515324205e+20
