In [1]:
import pandas as pd
import numpy as np

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
%matplotlib inline

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_boston

In [5]:
# Load the Boston housing dataset bundle shipped with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on older scikit-learn releases.
boston = load_boston()

In [6]:
# List the fields available in the dataset bundle.
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR'])

In [7]:
# Print the dataset's bundled description; print() is used so the multi-line
# text renders with its line breaks.
print(boston['DESCR'])

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [8]:
# Wrap the feature matrix in a DataFrame labelled with the dataset's feature
# names, then append the target values as the trailing MEDV column.
df_boston = (
    pd.DataFrame(data=boston['data'], columns=boston['feature_names'])
    .assign(MEDV=pd.Series(boston['target']))
)

In [9]:
# Summarise column dtypes and non-null counts, then preview the first rows.
df_boston.info()
df_boston.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
CRIM       506 non-null float64
ZN         506 non-null float64
INDUS      506 non-null float64
CHAS       506 non-null float64
NOX        506 non-null float64
RM         506 non-null float64
AGE        506 non-null float64
DIS        506 non-null float64
RAD        506 non-null float64
TAX        506 non-null float64
PTRATIO    506 non-null float64
B          506 non-null float64
LSTAT      506 non-null float64
MEDV       506 non-null float64
dtypes: float64(14)
memory usage: 55.4 KB


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [15]:
# Features are every column except the target (MEDV). Deriving them by
# dropping the target keeps the column order and avoids maintaining a
# hand-copied list of column names.
Xb = df_boston.drop('MEDV', axis=1)
yb = df_boston['MEDV']

# Hold out 40% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
Xb_train, Xb_test, yb_train, yb_test = train_test_split(
    Xb, yb, test_size=0.4, random_state=101
)

In [16]:
# Create an unfitted linear regression estimator with default settings.
lm = LinearRegression()

In [17]:
# Fit on the training split; the call is the cell's last expression, so the
# fitted estimator's repr is displayed.
lm.fit(Xb_train,yb_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [18]:
# Show the fitted coefficients (one per feature column).
lm.coef_

array([ -7.75583711e-02,   4.20310157e-02,   9.11529473e-02,
         4.13304932e+00,  -1.99765575e+01,   2.89019042e+00,
         1.61533256e-02,  -1.26474745e+00,   2.60170760e-01,
        -1.11251993e-02,  -8.80555502e-01,   7.02445445e-03,
        -6.43482813e-01])

In [19]:
# Label each fitted coefficient with the feature it belongs to, as a
# single-column table named 'Coeff'.
clm = pd.DataFrame(
    data={'Coeff': lm.coef_},
    index=Xb_train.columns,
)

In [20]:
# Display the per-feature coefficient table.
clm

Unnamed: 0,Coeff
CRIM,-0.077558
ZN,0.042031
INDUS,0.091153
CHAS,4.133049
NOX,-19.976557
RM,2.89019
AGE,0.016153
DIS,-1.264747
RAD,0.260171
TAX,-0.011125


In [21]:
# Repeat the train/test split on the arrays held in the dataset bundle
# instead of on the DataFrame, using the same test_size and random_state.
Xb2, yb2 = boston['data'], boston['target']
Xb2_train, Xb2_test, yb2_train, yb2_test = train_test_split(
    Xb2,
    yb2,
    test_size=0.4,
    random_state=101,
)

In [22]:
# Fit a second, separate estimator on the array-based training split; the
# fit call is left as the last expression so the estimator repr is displayed.
lm2 = LinearRegression()
lm2.fit(Xb2_train,yb2_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [23]:
# The array-based fit should reproduce the coefficients of the
# DataFrame-based fit; assert that explicitly instead of relying on a visual
# comparison of the two printed arrays.
np.testing.assert_allclose(lm2.coef_, lm.coef_)
lm2.coef_

array([ -7.75583711e-02,   4.20310157e-02,   9.11529473e-02,
         4.13304932e+00,  -1.99765575e+01,   2.89019042e+00,
         1.61533256e-02,  -1.26474745e+00,   2.60170760e-01,
        -1.11251993e-02,  -8.80555502e-01,   7.02445445e-03,
        -6.43482813e-01])