## Import the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Load the data

In [2]:
from sklearn.datasets import load_boston
boston = load_boston()
boston

{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
         4.9800e+00],
        [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
         9.1400e+00],
        [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
         4.0300e+00],
        ...,
        [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         5.6400e+00],
        [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
         6.4800e+00],
        [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         7.8800e+00]]),
 'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
        18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
        15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
        13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
        21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
        35.4, 24.7, 3

In [3]:
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [4]:
df_boston = pd.DataFrame(boston.data, columns = boston.feature_names)
df_boston['price'] = pd.Series(boston.target)
df_boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,price
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [5]:
df_boston.shape

(506, 14)

## Check for missing values

In [6]:
df_boston.isnull().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
price      0
dtype: int64

## Divide the data into X and y

In [7]:
X = df_boston.drop('price', axis = 1)
y = df_boston['price']

## Divide the data into train data and test data

In [43]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30)
X_train.shape, X_test.shape

((354, 13), (152, 13))

## Perform Feature Scaling

In [44]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()

ss.fit_transform(X_train)

array([[-0.42243455,  0.69964561,  0.61683909, ..., -0.12393825,
         0.4320693 , -0.92844216],
       [-0.41632593,  0.78458201, -0.88748801, ..., -0.8644301 ,
         0.24895288, -0.90705663],
       [-0.41084876, -0.48946401, -0.13160456, ..., -0.30906121,
        -0.19238798,  0.37750094],
       ...,
       [-0.40393688,  0.44483641, -0.74910777, ...,  0.29258841,
         0.35414742,  0.81804288],
       [-0.42172117,  2.90799204, -0.88451209, ...,  0.33886915,
         0.43823812, -1.33904436],
       [ 0.50176483, -0.48946401,  1.07215472, ...,  0.80167655,
         0.43823812,  1.15878568]])

In [45]:
ss.fit_transform(X_test)

array([[-0.26168504, -0.48383697,  1.43066898, ...,  1.27944251,
         0.44823387, -0.0313518 ],
       [-0.31346705, -0.48383697, -0.50003345, ...,  1.18716481,
         0.34682389,  0.04158913],
       [-0.20451336, -0.48383697,  1.10631097, ..., -1.71958288,
         0.17546819,  0.21718767],
       ...,
       [ 0.19519517, -0.48383697,  0.89849718, ...,  0.81805399,
         0.20625336, -1.24703404],
       [-0.30980242, -0.48383697, -0.50003345, ...,  1.18716481,
        -0.77309995, -0.10834501],
       [-0.40564507,  1.93824509, -1.11224164, ..., -0.38155617,
         0.44823387, -0.71753689]])

## Apply KNN Regressor on train data

In [46]:
from sklearn.neighbors import KNeighborsRegressor
model = KNeighborsRegressor(n_neighbors = 21, metric = 'minkowski', p = 2)
model.fit(X_train, y_train)

KNeighborsRegressor(n_neighbors=21)

## Perform prediction on test data

In [47]:
y_pred = model.predict(X_test)
y_pred

array([20.91428571, 23.01428571, 23.6047619 , 23.6047619 , 23.10952381,
       13.11904762, 12.35714286, 18.00952381, 25.92857143, 20.34285714,
       26.07619048, 26.20952381, 25.96666667, 27.17142857, 23.46190476,
       26.9047619 , 19.52857143, 28.3       , 20.        , 22.88571429,
       26.41428571, 26.75238095, 29.17619048, 20.68095238, 26.31428571,
       23.0952381 , 25.75238095, 17.57142857, 27.70952381, 20.91428571,
       25.33333333, 27.15238095, 31.22380952, 15.51428571, 26.80952381,
       17.16190476, 31.17142857, 24.1       , 22.56666667, 21.86190476,
       25.08095238, 12.35714286, 19.56190476, 16.11904762, 27.54761905,
       22.56666667, 16.81428571, 22.86190476, 12.32380952, 23.6047619 ,
       23.02380952, 20.91428571, 25.34761905, 24.34761905, 20.74285714,
       19.70952381, 12.94761905, 27.39047619, 21.88095238, 24.4047619 ,
       28.18095238, 23.79047619, 24.05238095, 25.19047619, 24.93809524,
       11.53809524, 29.77142857, 22.22857143, 23.5       , 22.16

## Comparision between the prediction and test set results

In [48]:
y_test = y_test.values
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

[[20.91428571 19.6       ]
 [23.01428571 18.4       ]
 [23.6047619  21.5       ]
 [23.6047619  15.3       ]
 [23.10952381 14.6       ]
 [13.11904762 27.5       ]
 [12.35714286  8.8       ]
 [18.00952381 14.9       ]
 [25.92857143 19.5       ]
 [20.34285714 23.2       ]
 [26.07619048 25.2       ]
 [26.20952381 21.9       ]
 [25.96666667 29.8       ]
 [27.17142857 36.2       ]
 [23.46190476 21.2       ]
 [26.9047619  22.        ]
 [19.52857143 19.9       ]
 [28.3        31.5       ]
 [20.         18.3       ]
 [22.88571429 19.3       ]
 [26.41428571 35.2       ]
 [26.75238095 20.7       ]
 [29.17619048 16.6       ]
 [20.68095238 19.1       ]
 [26.31428571 23.7       ]
 [23.0952381  22.6       ]
 [25.75238095 33.2       ]
 [17.57142857 15.2       ]
 [27.70952381 27.5       ]
 [20.91428571 14.4       ]
 [25.33333333 15.7       ]
 [27.15238095 21.6       ]
 [31.22380952 34.9       ]
 [15.51428571 13.1       ]
 [26.80952381 17.6       ]
 [17.16190476 13.8       ]
 [31.17142857 20.1       ]
 

## Evaluation Metrics

In [49]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_pred, y_test, squared = True)
mse

51.0968000358038

In [50]:
rmse = np.sqrt(mse)
rmse

7.1482025737806145

In [51]:
rmse = mean_squared_error(y_pred, y_test, squared = False)
rmse

7.1482025737806145

In [52]:
from sklearn.metrics import r2_score
r2 = r2_score(y_pred, y_test)
r2

-1.2691364274365688

In [54]:
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_pred, y_test)
mae

5.0548872180451125