In [13]:
from sklearn.datasets import load_boston
import pandas as pd
from sklearn import linear_model

In [14]:
# Load the Boston housing dataset that ships with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs an older scikit-learn release to run -- confirm the pinned version.
boston = load_boston()

In [15]:
# List the fields available on the loaded dataset object.
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR'])

In [16]:
# Feature labels for the dataset; used below as the column names of the predictor DataFrame.
boston.feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [39]:
# Predictors: one column per feature, labelled with the dataset's own feature names
df = pd.DataFrame(boston.data, columns=boston.feature_names)

# Target (housing value -- MEDV) kept in its own single-column DataFrame
target = pd.DataFrame({"MEDV": boston.target})

X, y = df, target["MEDV"]

# Fit ordinary least squares on every feature; fit() hands back the estimator itself,
# so `model` and `lm` refer to the same fitted object.
lm = linear_model.LinearRegression()
model = lm.fit(X, y)
predictions = model.predict(X)

print(predictions[:5])
# R² is the proportion of the variance in y that the fitted model explains.
# It is computed here on the same data the model was fitted on (in-sample).
print("R Squared = {}".format(lm.score(X, y)))
print("y-intercept = {}".format(lm.intercept_))

[30.00821269 25.0298606  30.5702317  28.60814055 27.94288232]
R Squared = 0.7406077428649427
y-intercept = 36.49110328036191


lm.fit() -> fits a linear model

lm.predict() -> Predict Y using the linear model with estimated coefficients

lm.score() -> Returns the coefficient of determination (R^2): a measure of how well the observed outcomes are replicated by the model, expressed as the proportion of the total variation in the outcomes that the model explains.

.coef_ gives the estimated coefficients and .intercept_ gives the estimated intercept.

In [41]:
## Without a constant

import statsmodels.api as sm

# Single predictor (RM) regressed against MEDV with no intercept column,
# so the fitted line is forced through the origin.
y = target["MEDV"]
X = df["RM"]

# statsmodels takes the response first and the predictors second --
# the reverse of sklearn's fit(X, y).
model = sm.OLS(endog=y, exog=X).fit()
predictions = model.predict(X)  # in-sample fitted values

# Leave the summary as the cell's last expression so the notebook renders it.
# With no constant in the model, statsmodels reports the uncentered R-squared.
model.summary()

  from pandas.core import datetools


0,1,2,3
Dep. Variable:,MEDV,R-squared:,0.901
Model:,OLS,Adj. R-squared:,0.901
Method:,Least Squares,F-statistic:,4615.0
Date:,"Sun, 13 May 2018",Prob (F-statistic):,3.7399999999999996e-256
Time:,16:03:50,Log-Likelihood:,-1747.1
No. Observations:,506,AIC:,3496.0
Df Residuals:,505,BIC:,3500.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
RM,3.6534,0.054,67.930,0.000,3.548,3.759

0,1,2,3
Omnibus:,83.295,Durbin-Watson:,0.493
Prob(Omnibus):,0.0,Jarque-Bera (JB):,152.507
Skew:,0.955,Prob(JB):,7.649999999999999e-34
Kurtosis:,4.894,Cond. No.,1.0


In [42]:
## y is the output / dependent variable
y = target["MEDV"]
## X holds the input (independent) variables; add_constant puts a "const" column
## in front of RM so that OLS also estimates an intercept (beta_0)
X = sm.add_constant(df["RM"])

# statsmodels takes the response first and the design matrix second: sm.OLS(output, input)
model = sm.OLS(endog=y, exog=X).fit()
predictions = model.predict(X)

# Leave the summary as the cell's last expression so the notebook renders it
model.summary()

0,1,2,3
Dep. Variable:,MEDV,R-squared:,0.484
Model:,OLS,Adj. R-squared:,0.483
Method:,Least Squares,F-statistic:,471.8
Date:,"Sun, 13 May 2018",Prob (F-statistic):,2.49e-74
Time:,16:04:39,Log-Likelihood:,-1673.1
No. Observations:,506,AIC:,3350.0
Df Residuals:,504,BIC:,3359.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-34.6706,2.650,-13.084,0.000,-39.877,-29.465
RM,9.1021,0.419,21.722,0.000,8.279,9.925

0,1,2,3
Omnibus:,102.585,Durbin-Watson:,0.684
Prob(Omnibus):,0.0,Jarque-Bera (JB):,612.449
Skew:,0.726,Prob(JB):,1.02e-133
Kurtosis:,8.19,Cond. No.,58.4


In [44]:
# Two-predictor model: MEDV regressed on RM and LSTAT.
y = target["MEDV"]
X = df.loc[:, ["RM", "LSTAT"]]

# NOTE(review): no intercept column is added here (no sm.add_constant), so the fit is
# forced through the origin and statsmodels reports the uncentered R-squared. That figure
# is not comparable with the R-squared of a model that includes a constant -- confirm
# that omitting the intercept is intended.
model = sm.OLS(endog=y, exog=X).fit()
predictions = model.predict(X)

# Leave the summary as the cell's last expression so the notebook renders it
model.summary()

0,1,2,3
Dep. Variable:,MEDV,R-squared:,0.948
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,4637.0
Date:,"Sun, 13 May 2018",Prob (F-statistic):,0.0
Time:,16:05:34,Log-Likelihood:,-1582.9
No. Observations:,506,AIC:,3170.0
Df Residuals:,504,BIC:,3178.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
RM,4.9069,0.070,69.906,0.000,4.769,5.045
LSTAT,-0.6557,0.031,-21.458,0.000,-0.716,-0.596

0,1,2,3
Omnibus:,145.153,Durbin-Watson:,0.834
Prob(Omnibus):,0.0,Jarque-Bera (JB):,442.157
Skew:,1.351,Prob(JB):,9.7e-97
Kurtosis:,6.698,Cond. No.,4.72


In [45]:
# Full model: MEDV regressed on every column of the predictor DataFrame.
y = target["MEDV"]
X = df

# NOTE(review): no intercept column is added here (no sm.add_constant), so statsmodels
# reports the uncentered R-squared. That figure is not comparable with the score of a
# model fitted with an intercept -- confirm that omitting the intercept is intended.
model = sm.OLS(endog=y, exog=X).fit()
predictions = model.predict(X)

# Leave the summary as the cell's last expression so the notebook renders it
model.summary()

0,1,2,3
Dep. Variable:,MEDV,R-squared:,0.959
Model:,OLS,Adj. R-squared:,0.958
Method:,Least Squares,F-statistic:,891.1
Date:,"Sun, 13 May 2018",Prob (F-statistic):,0.0
Time:,16:06:41,Log-Likelihood:,-1523.8
No. Observations:,506,AIC:,3074.0
Df Residuals:,493,BIC:,3129.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
CRIM,-0.0916,0.034,-2.675,0.008,-0.159,-0.024
ZN,0.0487,0.014,3.379,0.001,0.020,0.077
INDUS,-0.0038,0.064,-0.059,0.953,-0.130,0.123
CHAS,2.8564,0.904,3.160,0.002,1.080,4.633
NOX,-2.8808,3.359,-0.858,0.392,-9.481,3.720
RM,5.9252,0.309,19.168,0.000,5.318,6.533
AGE,-0.0072,0.014,-0.523,0.601,-0.034,0.020
DIS,-0.9680,0.196,-4.947,0.000,-1.352,-0.584
RAD,0.1704,0.067,2.554,0.011,0.039,0.302

0,1,2,3
Omnibus:,204.05,Durbin-Watson:,0.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1372.527
Skew:,1.609,Prob(JB):,9.11e-299
Kurtosis:,10.399,Cond. No.,8500.0
