In [4]:
# Load the Boston house-prices dataset from scikit-learn and print its description text.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, so this
# cell only runs on an older scikit-learn release — confirm the pinned version.
from sklearn import datasets
data = datasets.load_boston()
print(data.DESCR)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [5]:
# Display the raw feature array together with the array of feature names (shown as a tuple).
data.data, data.feature_names

(array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
         4.9800e+00],
        [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
         9.1400e+00],
        [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
         4.0300e+00],
        ...,
        [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         5.6400e+00],
        [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
         6.4800e+00],
        [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         7.8800e+00]]),
 array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
        'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7'))

In [6]:
import pandas as pd
# Wrap the feature array in a DataFrame, using the dataset's feature names as column labels.
df = pd.DataFrame(data.data, columns=data.feature_names)

In [7]:
# Hold the regression target in its own single-column frame, labelled "MEDV".
target = pd.DataFrame({"MEDV": data.target})

In [8]:
# Preview only the first ten rows of the target frame rather than rendering the whole frame.
target.head(10)

Unnamed: 0,MEDV
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2
5,28.7
6,22.9
7,27.1
8,16.5
9,18.9


In [9]:
import statsmodels.api as sm

In [10]:
# Predictor: the RM column only, selected with a list so it stays a one-column DataFrame.
x = df.loc[:, ["RM"]]
# Response: the MEDV column of the target frame, as a Series.
y = target.loc[:, "MEDV"]

In [12]:
# Fit ordinary least squares of y on x. `x` holds only the RM column and hasconst=False
# declares that no constant is present, so this is a regression through the origin.
# NOTE(review): for a model without an intercept statsmodels reports the uncentered
# R-squared, which is not comparable to the usual centered value — confirm that leaving
# out the intercept is intended.
model = sm.OLS(y, x, hasconst = False).fit()

In [14]:
# Render the fitted model's summary tables (fit statistics, coefficients, diagnostics).
model.summary()

0,1,2,3
Dep. Variable:,MEDV,R-squared:,0.901
Model:,OLS,Adj. R-squared:,0.901
Method:,Least Squares,F-statistic:,4615.0
Date:,"Mon, 30 Jul 2018",Prob (F-statistic):,3.7399999999999996e-256
Time:,21:04:37,Log-Likelihood:,-1747.1
No. Observations:,506,AIC:,3496.0
Df Residuals:,505,BIC:,3500.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
RM,3.6534,0.054,67.930,0.000,3.548,3.759

0,1,2,3
Omnibus:,83.295,Durbin-Watson:,0.493
Prob(Omnibus):,0.0,Jarque-Bera (JB):,152.507
Skew:,0.955,Prob(JB):,7.649999999999999e-34
Kurtosis:,4.894,Cond. No.,1.0


In [15]:
# In-sample predictions: the model is evaluated on the same `x` it was fitted on.
y_pred=model.predict(x)

In [16]:
# Put observed values ("y") and fitted values ("y_pred") side by side for inspection.
compare = pd.DataFrame({"y":y, "y_pred":y_pred})

In [17]:
# Preview only the first ten observed/predicted pairs rather than rendering the whole frame.
compare.head(10)

Unnamed: 0,y,y_pred
0,24.0,24.020779
1,21.6,23.458163
2,34.7,26.249323
3,33.4,25.566146
4,36.2,26.110495
5,28.7,23.491043
6,22.9,21.963943
7,27.1,22.548479
8,16.5,20.572016
9,18.9,21.934716


In [18]:
# Inspect rows 100 through 119 of the comparison frame, selected explicitly by position.
compare.iloc[100:120]

Unnamed: 0,y,y_pred
100,27.5,24.576088
101,26.5,24.773369
102,18.6,23.399709
103,19.3,22.420611
104,20.1,22.530212
105,19.5,21.375753
106,19.5,21.320953
107,20.4,22.384078
108,19.8,23.65179
109,19.4,22.75672
