#### Read Data

In [2]:
import sframe
sales = sframe.SFrame('Data1/')

[INFO] sframe.cython.cy_server: SFrame v2.1 started. Logging /tmp/sframe_server_1503743127.log


#### Train Test Split

In [3]:
train_data, test_data = sales.random_split(.8,seed=0)

#### Preprocessing

In [5]:
prices = sales['price']
sum_prices = prices.sum()
num_houses = prices.size()
avg_price_1 = sum_prices / float(num_houses)
avg_price_2 = prices.mean()
print "average price via method 1: " + str(avg_price_1)
print "average price via method 2: " + str(avg_price_2)

average price via method 1: 540088.141905
average price via method 2: 540088.141905


In [6]:
half_prices = 0.5*prices
prices_squared = prices*prices
sum_prices_squared = prices_squared.sum()
print "the sum of price squared is: " + str(sum_prices_squared)

the sum of price squared is: 9.21732513355e+15


#### Build a Simple Linear Regression Function

In [14]:
def simple_linear_regression(input_feature, output):
    sum_input_feature = input_feature.sum()
    sum_output = output.sum()
    size = float(output.size())
    
    product_sum = (input_feature*output).sum()
    
    squared_sum_input_feature = (input_feature*input_feature).sum()
    squared_sum_output = (output*output).sum()
    
    slope = (product_sum - (sum_input_feature*sum_output) / size) / (squared_sum_input_feature - (sum_input_feature * sum_input_feature)/ size)
    intercept = sum_output / size - slope * sum_input_feature / size
    
    return (intercept, slope)

In [15]:
test_feature = sframe.SArray(range(5))
test_output = sframe.SArray(1 + 1*test_feature)
(test_intercept, test_slope) =  simple_linear_regression(test_feature, test_output)
print "Intercept: " + str(test_intercept)
print "Slope: " + str(test_slope)

Intercept: 1.0
Slope: 1.0


In [16]:
sqft_intercept, sqft_slope = simple_linear_regression(train_data['sqft_living'], train_data['price'])

print "Intercept: " + str(sqft_intercept)
print "Slope: " + str(sqft_slope)

Intercept: -47116.0765749
Slope: 281.958838568


#### Predicting Values

In [17]:
def get_regression_predictions(input_feature, intercept, slope):
    predicted_values = input_feature*slope + intercept
    return predicted_values

In [18]:
my_house_sqft = 2650
estimated_price = get_regression_predictions(my_house_sqft, sqft_intercept, sqft_slope)
print "The estimated price for a house with %d squarefeet is $%.2f" % (my_house_sqft, estimated_price)

The estimated price for a house with 2650 squarefeet is $700074.85


#### Residual Sum of Square

In [23]:
def get_residual_sum_of_squares(input_feature, output, intercept, slope):
    predictions = get_regression_predictions(input_feature,intercept,slope)
    residuals = output - predictions
    RSS = (residuals*residuals).sum()
    return RSS

In [24]:
print get_residual_sum_of_squares(test_feature, test_output, test_intercept, test_slope)

0.0


In [25]:
rss_prices_on_sqft = get_residual_sum_of_squares(train_data['sqft_living'], train_data['price'], sqft_intercept, sqft_slope)
print 'The RSS of predicting Prices based on Square Feet is : ' + str(rss_prices_on_sqft)

The RSS of predicting Prices based on Square Feet is : 1.20191835632e+15


#### Predict Squared Feet Given Price

In [26]:
def inverse_regression_predictions(output, intercept, slope):
    estimated_feature = (output - intercept) / float(slope)
    return estimated_feature

In [27]:
my_house_price = 800000
estimated_squarefeet = inverse_regression_predictions(my_house_price, sqft_intercept, sqft_slope)
print "The estimated squarefeet for a house worth $%.2f is %d" % (my_house_price, estimated_squarefeet)

The estimated squarefeet for a house worth $800000.00 is 3004


#### New model (Using Bedrooms to estimate price)

In [28]:
sqft_intercept_2, sqft_slope_2 = simple_linear_regression(train_data['sqft_living'], train_data['bedrooms'])

print "Intercept: " + str(sqft_intercept_2)
print "Slope: " + str(sqft_slope_2)

Intercept: 2.1601681049
Slope: 0.000581335275125


In [31]:
rss_prices_on_sqft = get_residual_sum_of_squares(test_data['sqft_living'], test_data['price'], sqft_intercept, sqft_slope)
print 'The RSS of predicting Prices based on Square Feet is : ' + str(rss_prices_on_sqft)

The RSS of predicting Prices based on Square Feet is : 2.75402936247e+14


In [30]:
rss_prices_on_sqft = get_residual_sum_of_squares(test_data['sqft_living'], test_data['bedrooms'], sqft_intercept_2, sqft_slope_2)
print 'The RSS of predicting Prices based on Square Feet is : ' + str(rss_prices_on_sqft)

The RSS of predicting Prices based on Square Feet is : 3184.55673602
