In [11]:
from pandas import DataFrame, read_csv
import pandas

In [14]:
# Column -> dtype mapping handed to read_csv, grouped by target dtype.
dtype_dict = {
    # floating-point columns
    'bathrooms': float,
    'bedrooms': float,
    'lat': float,
    'long': float,
    'price': float,
    'sqft_living': float,
    'sqft_living15': float,
    'sqft_lot15': float,
    # integer columns
    'condition': int,
    'grade': int,
    'sqft_above': int,
    'sqft_basement': int,
    'sqft_lot': int,
    'view': int,
    'waterfront': int,
    'yr_built': int,
    'yr_renovated': int,
    # columns kept as text
    'date': str,
    'floors': str,
    'id': str,
    'zipcode': str,
}

In [19]:
# Load the train / test splits, applying the explicit column dtypes.
train_data = read_csv("kc_house_train_data.csv", dtype=dtype_dict)
test_data = read_csv("kc_house_test_data.csv", dtype=dtype_dict)

In [20]:
# Preview the first five training rows. The frame also carries an
# "Unnamed: 0" column -- presumably a row index saved into the CSV; confirm.
train_data.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690.0,7639.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720.0,8062.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360.0,5000.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800.0,7503.0


In [52]:
def simple_linear_regression(input_feature, output):
    """Fit ``output ~ intercept + slope * input_feature`` by least squares.

    Uses the closed-form single-feature solution:

        slope     = (sum(x*y) - sum(x)*sum(y)/n) / (sum(x*x) - sum(x)*sum(x)/n)
        intercept = mean(y) - mean(x) * slope

    Parameters
    ----------
    input_feature, output
        Equal-length, non-empty sequences that support ``len()``, ``.sum()``
        and element-wise ``*`` (e.g. pandas Series or numpy arrays).

    Returns
    -------
    tuple
        ``(intercept, slope)``.

    Raises
    ------
    ValueError
        If the inputs are empty, have different lengths, or the feature has
        zero variance (the slope is then undefined).
    """
    n_obs = len(input_feature)
    if n_obs == 0:
        raise ValueError("simple_linear_regression requires at least one observation")
    if n_obs != len(output):
        raise ValueError("input_feature and output must have the same length")

    # Use a float count so every division below is true division, even under
    # Python 2 with integer-typed columns (int / int would floor there).
    n = float(n_obs)

    sum_x = input_feature.sum()
    sum_y = output.sum()
    sum_xy = (input_feature * output).sum()
    sum_xx = (input_feature * input_feature).sum()

    numerator = sum_xy - sum_x * sum_y / n
    denominator = sum_xx - sum_x * sum_x / n
    if denominator == 0:
        raise ValueError("input_feature has zero variance; slope is undefined")

    slope = numerator / denominator
    intercept = sum_y / n - (sum_x / n) * slope
    return intercept, slope

In [55]:
# Target (price) and single explanatory feature (sqft_living) from the training split.
output = train_data["price"]
input_feature = train_data["sqft_living"]

In [59]:
# Fit the sqft_living -> price model on the training data and report it.
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print("sqft_living -> price model's (intercept, slope) is " + str(simple_linear_regression(input_feature, output)))

sqft_living -> price model's (intercept, slope) is (-47116.079072894179, 281.9588396303426)


In [63]:
def get_regression_predictions(input_feature, intercept, slope):
    """Evaluate the fitted line ``intercept + input_feature * slope``.

    ``input_feature`` may be a scalar or anything that supports element-wise
    arithmetic with scalars; the result has the corresponding type.
    """
    return intercept + input_feature * slope

In [64]:
# The intercept/slope literals are the fitted sqft_living -> price
# coefficients printed by the model-fitting cell above.
print("price for a house with 2650 sqft is " + str(get_regression_predictions(2650, -47116.079072894179, 281.9588396303426)))

price for a house with 2650 sqft is 700074.845948


In [73]:
def get_residual_sum_of_squares(input_feature, output, intercept, slope):
    """Return the residual sum of squares of the line on the given data.

    Predictions come from ``get_regression_predictions``; the residuals
    ``output - predictions`` are squared element-wise and summed.
    """
    residuals = output - get_regression_predictions(input_feature, intercept, slope)
    return (residuals * residuals).sum()

In [74]:
# RSS of the sqft_living -> price model on the training data; the literals
# are the fitted coefficients printed by the model-fitting cell above.
print("the RSS for the simple linear regression using squarefeet to predict prices on TRAINING data is " + str(get_residual_sum_of_squares(input_feature, output, -47116.079072894179, 281.9588396303426)))

the RSS for the simple linear regression using squarefeet to predict prices on TRAINING data is 1.20191835418e+15


In [75]:
def inverse_regression_predictions(output, intercept, slope):
    """Invert the fitted line: return the input whose prediction is ``output``.

    Solves ``output = intercept + input * slope`` for ``input``, i.e.
    ``(output - intercept) / slope``. A zero ``slope`` is not handled
    specially; the underlying division decides what happens.
    """
    # Multiplying by 1.0 forces true division when all arguments are integers
    # under Python 2 (where int / int floors) and is exact for float inputs.
    estimated_input = (output - intercept) * 1.0 / slope
    return estimated_input

In [76]:
# The intercept/slope literals are the fitted sqft_living -> price
# coefficients printed by the model-fitting cell above.
print("the estimated square-feet for a house costing $800,000 is " + str(inverse_regression_predictions(800000, -47116.079072894179, 281.9588396303426)))

the estimated square-feet for a house costing $800,000 is 3004.39624515


In [80]:
# --- Fit one single-feature model per candidate feature (training data) ---
input_feature_bedroom = train_data["bedrooms"]
intercept_bedroom, slope_bedroom = simple_linear_regression(input_feature_bedroom, output)
intercept_sqft, slope_sqft = simple_linear_regression(input_feature, output)

# --- Training RSS for each model ---
RSS_train_bedroom = get_residual_sum_of_squares(
    input_feature_bedroom, output, intercept_bedroom, slope_bedroom)
RSS_train_sqft = get_residual_sum_of_squares(
    input_feature, output, intercept_sqft, slope_sqft)
print("RSS_train_bedroom is " + str(RSS_train_bedroom))
print("RSS_train_sqft is " + str(RSS_train_sqft))

# --- Test RSS for the same fitted models ---
test_output = test_data["price"]
test_feature_bedroom = test_data["bedrooms"]
test_feature_sqft = test_data["sqft_living"]
RSS_test_bedroom = get_residual_sum_of_squares(
    test_feature_bedroom, test_output, intercept_bedroom, slope_bedroom)
RSS_test_sqft = get_residual_sum_of_squares(
    test_feature_sqft, test_output, intercept_sqft, slope_sqft)
print("RSS_test_bedroom is " + str(RSS_test_bedroom))
print("RSS_test_sqft is " + str(RSS_test_sqft))

RSS_train_bedroom is 2.14324449816e+15
RSS_train_sqft is 1.20191835418e+15
RSS_test_bedroom is 4.9336458596e+14
RSS_test_sqft is 2.75402933618e+14
