In [2]:
### multivariate linear regression

import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
## import model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

## load the raw dataset object (its .data, .feature_names and .target are used below)
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 --
# confirm the pinned scikit-learn version before re-running this notebook
boston_dataset = load_boston()

## build a DataFrame: one column per feature, with the target appended as 'MEDV'
boston = pd.DataFrame(
    boston_dataset.data,
    columns=boston_dataset.feature_names,
).assign(MEDV=boston_dataset.target)

## data preparation
## feature selection: two predictors (multiple variables) and one target
X = boston.loc[:, ['RM', 'LSTAT']]
y = boston.loc[:, 'MEDV']

## train test split: hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## instantiate
model = LinearRegression()

## fit on the training rows only
## (kept as the cell's last expression so the fitted estimator is displayed)
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [3]:
# show the fitted intercept first, then the coefficients (one per line)
# coef_ is an array whose order matches the order of the features,
# so ~5 is the RM coefficient and ~-0.67 is the LSTAT coefficient
# the model is  Y = M1(X1) + M2(X2) + B
# therefore MEDV = (5)(RM) + (-.67)(LSTAT) + (-.23)
print(model.intercept_, model.coef_, sep='\n')

-0.22886344819943716
[ 4.99734079 -0.67359875]


In [5]:
## predict on the held-out test rows
y_test_predictions = model.predict(X_test)

## evaluating model performance
# compute the MSE once and reuse it for the RMSE
mse = mean_squared_error(y_test, y_test_predictions)
residuals = y_test - y_test_predictions

print(mse)  # MSE (mean squared error)
print(mse ** .5)  # RMSE (root mean squared error) = square root of the MSE
print(residuals.abs().mean())  # MAE (mean absolute error)
print(model.score(X_test, y_test))  # R^2 (coefficient of determination)
# lower MSE, RMSE, MAE & higher R^2 than the previous univariate regression (RM vs MEDV) = better model!
# Usually more variables = better model, but don't go overboard
print()

29.816277731842458
5.46042834691954
4.0880457454485155
0.5998518447715597



In [6]:
## plugging in an imaginary house
new_house = [[8,35]] # house with 8 rooms in neighborhood with 35% of population lower status

# The model was fitted on a DataFrame with named columns, so the query is wrapped in a
# DataFrame carrying the same column names (in the same order) as the training features.
# Passing the bare nested list makes scikit-learn >= 1.0 warn that
# "X does not have valid feature names" and gives it no way to check column order.
new_house_features = pd.DataFrame(new_house, columns=X_train.columns)
print(model.predict(new_house_features)) # model says it should cost ~$16,173

# plug the same inputs into the fitted equation by hand: same answer as predict()
rooms, lower_status_pct = new_house[0]
print((model.coef_[0]*rooms)+(model.coef_[1]*lower_status_pct) + model.intercept_)

# plug in the numbers manually: the typed coefficients are rounded to 8 decimals,
# so this matches only to about 6 decimal places, not exactly
print((4.99734079 * 8) + (-0.67359875 * 35) + (-0.22886344819943716))

# code and comments by github.com/alandavidgrunberg


[16.17390652]
16.173906523799022
16.173906621800562
