In [1]:
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell only runs on older scikit-learn releases.
from sklearn.datasets import load_boston # Loader for the Boston house-price data set.

boston_bunch = load_boston()
print(type(boston_bunch))

<class 'sklearn.utils.Bunch'>


**NOTE**: The dataset loaders in the **scikit-learn** library return their data as a **Bunch** object, a dictionary-like container whose keys can also be accessed as attributes.

In [2]:
# List the keys (top-level fields) exposed by the loaded data set object.
boston_bunch.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

- All the features of the dataset are stored under the `data` key.
- The target values are stored under the `target` key.
- The column names are stored under the `feature_names` key.
- The description of the dataset is stored under the `DESCR` key.

In [3]:
# Print the data set's built-in description text.
print(boston_bunch.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

## Explore the Data

In [4]:
import pandas as pd

# Wrap the raw feature matrix in a DataFrame, labelling the columns with the feature names.
boston_df = pd.DataFrame(data=boston_bunch.data, columns=boston_bunch.feature_names)

boston_df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


In [5]:
# Append the target as a new "MEDV" column so features and target live in one frame.
boston_df["MEDV"] = boston_bunch.target
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [6]:
# (rows, columns) of the frame, including the added MEDV column.
boston_df.shape

(506, 14)

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
boston_df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


**Because there are many columns, the summary is wide and requires sideways scrolling, so we transpose it for easier reading.**

In [8]:
# Transposed summary: one row per column of the frame, one column per statistic.
boston_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CRIM,506.0,3.613524,8.601545,0.00632,0.082045,0.25651,3.677083,88.9762
ZN,506.0,11.363636,23.322453,0.0,0.0,0.0,12.5,100.0
INDUS,506.0,11.136779,6.860353,0.46,5.19,9.69,18.1,27.74
CHAS,506.0,0.06917,0.253994,0.0,0.0,0.0,0.0,1.0
NOX,506.0,0.554695,0.115878,0.385,0.449,0.538,0.624,0.871
RM,506.0,6.284634,0.702617,3.561,5.8855,6.2085,6.6235,8.78
AGE,506.0,68.574901,28.148861,2.9,45.025,77.5,94.075,100.0
DIS,506.0,3.795043,2.10571,1.1296,2.100175,3.20745,5.188425,12.1265
RAD,506.0,9.549407,8.707259,1.0,4.0,5.0,24.0,24.0
TAX,506.0,408.237154,168.537116,187.0,279.0,330.0,666.0,711.0


**Identify target variable in your dataset**
- Target variable is **MEDV**

**Split the data into X and y**

X holds all the features

y holds all the target

In [9]:
# Splitting X and y from the bunch object.
# Note: the next cell rebinds X and y from the DataFrame, so these assignments
# are shown as an alternative only and are superseded before the train/test split.

X = boston_bunch.data
y = boston_bunch.target

In [10]:
# Separate the feature columns (X) from the target column (y) using the DataFrame.

X = boston_df.drop(columns="MEDV")
y = boston_df["MEDV"]

**Split the dataset into Train set and Test set**

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# test_size (given as a float) is the fraction of the data held out for testing:
# values closer to 0 leave more data for training,
# values closer to 1 leave more data for testing.
# 0.2 - 20% of the data is taken for testing.
# No random_state is passed here, so the split differs from run to run.

In [13]:
# Report the shape of each split, in the order: X_train, X_test, y_train, y_test.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(404, 13)
(102, 13)
(404,)
(102,)


**Building regression Model**

In [14]:
from sklearn.linear_model import LinearRegression # Import the estimator class

model_lr = LinearRegression() # Instantiate the estimator with its default settings
                              # The estimator object is bound to model_lr

In [15]:
# Fit the linear regression on the training split only;
# the test split is held back for evaluation.
model_lr.fit(X_train, y_train)  # Supervised learning: fits features X_train to target y_train

LinearRegression()

In [16]:
# intercept_ is the fitted constant term of the linear model.
print("The intercept for the LR model is ", model_lr.intercept_)

The intercept for the LR model is  34.02915528720698


In [17]:
# coef_ holds one fitted coefficient per feature (13 values for the 13 feature columns).
print("The regression co-efficient values for all the features are ", model_lr.coef_)

The regression co-efficient values for all the features are  [-1.36076909e-01  3.75513234e-02  5.37275160e-02  1.53675148e+00
 -2.14348854e+01  4.30415617e+00  1.04601410e-02 -1.37732316e+00
  3.12976291e-01 -1.20211581e-02 -9.76929145e-01  9.64176522e-03
 -4.95507144e-01]


Because the rows assigned to the train and test sets are chosen randomly, the split (and therefore the fitted model) changes on every run. To make the split reproducible, pass **random_state = ...** when splitting the data set.

In [29]:
# Same 80/20 split as before, but random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 21)

In [30]:
# The re-import is redundant (LinearRegression was already imported earlier) but harmless.
# A fresh estimator is created so it can be refit on the reproducible split.
from sklearn.linear_model import LinearRegression
model_lr = LinearRegression()

In [31]:
# Refit on the training portion of the seeded split.
model_lr.fit(X_train, y_train)

LinearRegression()

In [32]:
# Intercept of the model refit on the seeded split.
print("The intercept for the LR model is ", model_lr.intercept_)

The intercept for the LR model is  40.653176529790514


**Evaluate the regression Model**

In [33]:
# Predict target values for the held-out test features.
y_test_pred = model_lr.predict(X_test)

In [34]:
# Side-by-side view of the true test targets and the model's predictions.
pd.DataFrame(
    {
        "Actual y_test": y_test,
        "Predicted y_test": y_test_pred,
    }
)

Unnamed: 0,Actual y_test,Predicted y_test
455,14.1,15.311568
142,13.4,15.324187
311,22.1,26.890855
232,41.7,37.384876
290,28.5,33.375220
...,...,...
486,19.1,20.027912
468,19.1,17.513802
302,26.4,29.172278
244,17.6,16.904621


In [35]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Root-mean-squared error of the predictions on the held-out test split:
# take the mean squared error first, then its square root.
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)

print("RMS value of testing dataset")
print(test_rmse)

RMS value of testing dataset
5.179324335658004


In [36]:
# Coefficient of determination (R2) of the fitted model on the test split.

r2_test = model_lr.score(X_test, y_test)
print(f"R2 score is {r2_test}")

R2 score is 0.714936416139223


This score tells us that our model explains about 71% of the variance in the target (MEDV) on the test set.