## Import the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Load the data

In [4]:
from sklearn.datasets import load_boston
boston = load_boston()
boston

{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
         4.9800e+00],
        [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
         9.1400e+00],
        [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
         4.0300e+00],
        ...,
        [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         5.6400e+00],
        [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
         6.4800e+00],
        [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         7.8800e+00]]),
 'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
        18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
        15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
        13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
        21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
        35.4, 24.7, 3

In [5]:
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [6]:
df_boston = pd.DataFrame(boston.data, columns = boston.feature_names)
df_boston['price'] = pd.Series(boston.target)
df_boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,price
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [7]:
df_boston.shape

(506, 14)

## Check for missing values

In [8]:
df_boston.isnull().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
price      0
dtype: int64

## Divide the data into X and y

In [9]:
X = df_boston.drop('price', axis = 1)
y = df_boston['price']

## Divide the data into train data and test data

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
X_train.shape, X_test.shape

((379, 13), (127, 13))

## Perform Feature Scaling

In [11]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()

ss.fit_transform(X_train)

array([[-0.3906002 ,  0.42637011, -0.74491444, ...,  0.27878885,
         0.34049624,  0.82212111],
       [-0.40127639,  0.5525335 , -0.84901832, ...,  0.55287698,
         0.42774893, -0.46241699],
       [-0.40110543,  1.18335044, -0.66648002, ..., -0.40643148,
         0.34184377, -0.90310809],
       ...,
       [-0.3954927 , -0.49882807, -0.15309105, ..., -0.31506877,
         0.40091059, -0.31227617],
       [-0.38599992, -0.49882807, -0.59517599, ..., -0.26938741,
         0.38103449,  0.86938766],
       [-0.39692832, -0.49882807, -1.003035  , ..., -0.86324503,
         0.42774893,  0.29801844]])

In [12]:
ss.fit_transform(X_test)

array([[-0.43519802, -0.45439168, -1.26783377, ..., -0.69912236,
         0.25657172, -0.81096958],
       [ 0.52696116, -0.45439168,  1.07087019, ...,  0.88810506,
         0.1632353 , -0.49560953],
       [-0.43026285, -0.45439168,  0.40828986, ..., -0.93961137,
         0.44746838, -0.35592951],
       ...,
       [-0.40557651, -0.45439168, -0.21966632, ...,  0.02234465,
         0.47322347, -0.43224952],
       [-0.43191735, -0.45439168,  2.25910807, ...,  0.35902925,
         0.2995312 ,  0.66359066],
       [-0.43438021,  1.06362787, -1.43465922, ...,  0.02234465,
         0.48002281, -0.9362496 ]])

## Apply KNN Regressor on train data

In [15]:
from sklearn.neighbors import KNeighborsRegressor
model = KNeighborsRegressor(n_neighbors = 21, metric = 'minkowski', p = 2)
model.fit(X_train, y_train)

KNeighborsRegressor(n_neighbors=21)

## Perform prediction on test data

In [18]:
y_pred = model.predict(X_test)
y_pred

array([23.73333333, 21.73809524, 27.57619048, 11.92380952, 23.44761905,
       24.20952381, 21.01904762, 26.17619048, 25.73809524, 17.04761905,
       11.53333333, 11.53333333, 14.83333333, 11.8       , 23.80952381,
       27.3952381 , 20.24761905, 25.07619048, 26.03809524, 25.31428571,
       24.44285714, 23.71428571, 23.72380952, 28.26666667, 24.38095238,
       11.86666667, 21.07619048, 21.72857143, 24.07619048, 17.72380952,
       19.18571429, 17.65238095, 24.67142857, 22.73333333, 22.32380952,
       20.8       , 11.92380952, 20.82857143, 11.78571429, 14.37142857,
       26.22380952, 24.38095238, 23.37142857, 20.1952381 , 23.2       ,
       26.81904762, 22.92857143, 23.99047619, 20.58095238, 24.04285714,
       17.4952381 , 23.31904762, 24.42857143, 24.58095238, 22.34285714,
       24.48095238, 24.03333333, 22.72380952, 11.53333333, 25.18095238,
       26.63333333, 24.5       , 24.51904762, 26.27619048, 20.32857143,
       26.08571429, 20.25238095, 20.26190476, 12.92380952, 25.1 

## Comparision between the prediction and test set results

In [20]:
y_test = y_test.values
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

[[23.73333333 22.6       ]
 [21.73809524 50.        ]
 [27.57619048 23.        ]
 [11.92380952  8.3       ]
 [23.44761905 21.2       ]
 [24.20952381 19.9       ]
 [21.01904762 20.6       ]
 [26.17619048 18.7       ]
 [25.73809524 16.1       ]
 [17.04761905 18.6       ]
 [11.53333333  8.8       ]
 [11.53333333 17.2       ]
 [14.83333333 14.9       ]
 [11.8        10.5       ]
 [23.80952381 50.        ]
 [27.3952381  29.        ]
 [20.24761905 23.        ]
 [25.07619048 33.3       ]
 [26.03809524 29.4       ]
 [25.31428571 21.        ]
 [24.44285714 23.8       ]
 [23.71428571 19.1       ]
 [23.72380952 20.4       ]
 [28.26666667 29.1       ]
 [24.38095238 19.3       ]
 [11.86666667 23.1       ]
 [21.07619048 19.6       ]
 [21.72857143 19.4       ]
 [24.07619048 38.7       ]
 [17.72380952 18.7       ]
 [19.18571429 14.6       ]
 [17.65238095 20.        ]
 [24.67142857 20.5       ]
 [22.73333333 20.1       ]
 [22.32380952 23.6       ]
 [20.8        16.8       ]
 [11.92380952  5.6       ]
 