# Multiple Regression (Interpretation)

In [1]:
import pandas as pd 
import numpy as np 
from math import sqrt 
from sklearn.model_selection import train_test_split
from sklearn import linear_model

## 1. Load Data 

In [4]:
housedata = pd.read_csv('kc_house_data.csv')

In [16]:
housedata.head(5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [5]:
#split the dataset into training and testing 
X_train, X_test = train_test_split(housedata, test_size=0.2, train_size = 0.8, random_state=0)

In [6]:
X_train.shape

(17290, 21)

In [7]:
X_test.shape

(4323, 21)

## Learning a multiple regression model 

In [8]:
example_features = ['sqft_living', 'bedrooms', 'bathrooms']
#create a Linear Regression model 
regr = linear_model.LinearRegression()

In [9]:
#prepare the training data 
x_training = housedata.drop('price', axis = 1)
x_training = housedata[['sqft_living','bedrooms','bathrooms']]

In [10]:
#fit the training feature 
regr.fit(x_training, housedata.price)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [11]:
regr.intercept_

74847.140801283764

In [12]:
# regression weights on ['sqft_living', 'bedrooms', 'bathrooms']
regr.coef_

array([   309.39239013, -57860.8943206 ,   7932.71222266])

In [13]:
#predict on index(0) in testset 
regr.predict(X_test[['sqft_living','bedrooms','bathrooms']])[0]

413455.53837957664

In [33]:
regr.residues_



1436301587370045.0

In [35]:
regr.score(X_test[['sqft_living','bedrooms','bathrooms']], X_test['price'])

0.49709155015245687

## Create some new features 

In [14]:
from math import log 

In [24]:
X_train.loc[:,'bedrooms_squared'] = X_train['bedrooms'].apply(lambda x: x**2)
X_test.loc[:,'bedrooms_squared'] = X_test['bedrooms'].apply(lambda x: x**2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [26]:
# X_test.drop(['bedrooms_squared'], axis=1)
X_test.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,bedrooms_squared
17384,1453602313,20141029T000000,297000.0,2,1.5,1430,1650,3.0,0,0,...,1430,0,1999,0,98125,47.7222,-122.29,1430,1650,4
722,2225059214,20140808T000000,1578000.0,4,3.25,4670,51836,2.0,0,0,...,4670,0,1988,0,98005,47.635,-122.164,4230,41075,16
2680,2768000270,20140625T000000,562100.0,2,0.75,1440,3700,1.0,0,0,...,1200,240,1914,0,98107,47.6707,-122.364,1440,4300,4
18754,6819100040,20140624T000000,631500.0,2,1.0,1130,2640,1.0,0,0,...,1130,0,1927,0,98109,47.6438,-122.357,1680,3200,4
14554,4027700666,20150426T000000,780000.0,4,2.5,3180,9603,2.0,0,2,...,3180,0,2002,0,98155,47.7717,-122.277,2440,15261,16


In [27]:
X_train['bed_bath_rooms'] = X_train['bedrooms'] * X_train['bathrooms']
X_test['bed_bath_rooms'] = X_test['bedrooms'] * X_test['bathrooms']
X_train['log_sqft_living'] = X_train['sqft_living'].apply(lambda x: log(x))
X_test['log_sqft_living'] = X_test['sqft_living'].apply(lambda x: log(x))
X_train['lat_plus_long'] = X_train['lat'] + X_train['long']
X_test['lat_plus_long'] = X_test['lat'] + X_test['long']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: ht

## Learn and Compare multiple Models 

In [28]:
model_1_features = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
model_2_features = model_1_features + ['bed_bath_rooms']
model_3_features = model_2_features + ['bedrooms_squared', 'log_sqft_living', 'lat_plus_long']

In [40]:
x_training1 = X_train[model_1_features]
x_training2 = X_train[model_2_features]
x_training3 = X_train[model_3_features]

In [43]:
regr.fit(x_training1, X_train.price)
regr.coef_
# pd.DataFrame(zip(x_training1.columns, regr.coef_), columns =['features', 'Coefficients'])

array([  3.12942010e+02,  -5.30962691e+04,   1.47770428e+04,
         6.53983343e+05,  -3.25707336e+05])

In [44]:
regr.residues_



979843597588329.38

In [45]:
regr.fit(x_training2, X_train.price)
regr.residues_



970799199729577.5

In [46]:
regr.fit(x_training3, X_train.price)
regr.residues_



array([], dtype=float64)

# Multiple Regression (gradient descent)

Need to convert Panda Dataframe to numpy array which enable matrix operations 

Predicted value = feature (vector/matrix) * weight (vector)

In [75]:
def get_numpy_data(dataFrame, features, output):
    dataFrame['constant'] = 1 
    # add the column 'constant' to the front of the features list so that we can extract it along with the others:
    features = ['constant'] + features 
    # select the columns of data_SFrame given by the features list into the SFrame features_sframe (now including constant):
    features_frame = dataFrame[features]
    # the following line will convert the features_SFrame into a numpy array:
    feature_matrix = features_frame.values
    # assign the column of data_sframe associated with the output to the SArray output_sarray
    output_sarray = dataFrame[output]
    # the following will convert the SArray into a numpy array by first converting it to a list
    output_array = output_sarray.values
    return(feature_matrix, output_array)

In [76]:
(example_features, example_output) = get_numpy_data(housedata, ['sqft_living'], 'price') # the [] around 'sqft_living' makes it a list
print (example_features[0,:])# this accesses the first row of the data the ':' indicates 'all columns'
print (example_output[0]) 

[   1 1180]
221900.0


## Predicting output given regression weights

In [77]:
my_weights = np.array([1., 1.]) 
my_features = example_features[0,] # we'll use the first data point
predicted_value = np.dot(my_features, my_weights)
print (predicted_value)

1181.0


In [78]:
def predict_output(feature_matrix, weights):
    # assume feature_matrix is a numpy matrix containing the features as columns and weights is a corresponding numpy array
    # create the predictions vector by using np.dot()
    predictions = np.dot(feature_matrix, weights)
    return(predictions)

In [81]:
test_predictions = predict_output(example_features, my_weights)
test_predictions
print (test_predictions[0]) # should be 1181.0
print (test_predictions[1]) # should be 2571.0

1181.0
2571.0


## Computing the Derivative

In [82]:
def feature_derivative(errors, feature):
    # Assume that errors and feature are both numpy arrays of the same length (number of data points)
    # compute twice the dot product of these vectors as 'derivative' and return the value
    derivative = 2*np.dot(errors, feature)
    return(derivative)

In [84]:
(example_features, example_output) = get_numpy_data(housedata, ['sqft_living'], 'price') 
my_weights = np.array([0., 0.]) # this makes all the predictions 0
test_predictions = predict_output(example_features, my_weights) 
# just like SFrames 2 numpy arrays can be elementwise subtracted with '-': 
errors = test_predictions - example_output # prediction errors in this case is just the -example_output
feature = example_features[:,0] # let's compute the derivative with respect to 'constant', the ":" indicates "all rows"
derivative = feature_derivative(errors, feature)
print (derivative)
print (-np.sum(example_output)*2) # should be the same as derivative

-23345850016.0
-23345850016.0


## Gradient Descent

In [85]:
from math import sqrt

In [86]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance):
    converged = False 
    weights = np.array(initial_weights) # make sure it's a numpy array
    gradient_magnitude = 0
    while not converged:
        # compute the predictions based on feature_matrix and weights using your predict_output() function
        predictions = predict_output(feature_matrix, weights)
        # compute the errors as predictions - output
        errors = predictions - output
        gradient_sum_squares = 0 # initialize the gradient sum of squares
        # while we haven't reached the tolerance yet, update each feature's weight
        for i in range(len(weights)): # loop over each weight
            # Recall that feature_matrix[:, i] is the feature column associated with weights[i]
            # compute the derivative for weight[i]:
            drivative = feature_derivative(errors, feature_matrix[:, i])
            # add the squared value of the derivative to the gradient magnitude (for assessing convergence)
            gradient_sum_squares += drivative * drivative
            # subtract the step size times the derivative from the current weight
            weights[i] -= step_size * drivative
        # compute the square-root of the gradient sum of squares to get the gradient matnigude:
        gradient_magnitude = sqrt(gradient_sum_squares)
        if gradient_magnitude < tolerance:
            converged = True
    return(weights)

## Running the Gradient Descent as Simple Regression

In [87]:
# let's test out the gradient descent
simple_features = ['sqft_living']
my_output = 'price'
(simple_feature_matrix, output) = get_numpy_data(X_train, simple_features, my_output)
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [89]:
simple_weights = regression_gradient_descent(simple_feature_matrix, output,initial_weights, step_size,tolerance)
print (simple_weights)

[-46999.88720259    283.46383063]


In [90]:
(test_simple_feature_matrix, test_output) = get_numpy_data(X_test, simple_features, my_output)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [91]:
test_predictions = predict_output(test_simple_feature_matrix, simple_weights)

In [96]:
test_predictions

array([  358353.39059137,  1276776.20181685,   361188.02889762, ...,
         338510.92244761,   222290.7518913 ,   417880.79502265])

In [117]:
X_test['price'].tolist()[1]

1578000.0

In [118]:
rss = 0
for i in range(0, len(test_predictions)):
    error = test_predictions[i] - X_test['price'].tolist()[i]
    rss += error * error
print (rss)

2.67729995271e+14


## Running a multiple regression

In [121]:
model_features = ['sqft_living', 'sqft_living15'] # sqft_living15 is the average squarefeet for the nearest 15 neighbors. 
my_output = 'price'
(feature_matrix, output) = get_numpy_data(X_train, model_features, my_output)
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [123]:
simple_weights = regression_gradient_descent(feature_matrix, output,initial_weights, step_size, tolerance)
print (simple_weights)

[ -9.99999757e+04   2.47055837e+02   6.47974873e+01]


In [124]:
(test_simple_feature_matrix, test_output) = get_numpy_data(X_test, model_features, my_output)
test_predictions = predict_output(test_simple_feature_matrix, simple_weights)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [125]:
print (test_predictions[0])

345950.278091


In [134]:
rss = 0
for i in range(0, len(test_predictions)):
    error = test_predictions[i] - X_test['price'].tolist()[i]
    rss += error * error
print (rss)

2.62684098597e+14
