## Step 1 : Loading the standard libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Step 2: Load the data

In [7]:
data = pd.read_csv(r'https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv')
data.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


The Boston Housing Dataset

The Boston Housing Dataset is a derived from information collected by the U.S. Census Service concerning housing in the area of Boston MA. The following describes the dataset columns:  

- CRIM - per capita crime rate by town
- ZN - proportion of residential land zoned for lots over 25,000 sq.ft.
- INDUS - proportion of non-retail business acres per town.
- CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)
- NOX - nitric oxides concentration (parts per 10 million)
- RM - average number of rooms per dwelling
- AGE - proportion of owner-occupied units built prior to 1940
- DIS - weighted distances to five Boston employment centres
- RAD - index of accessibility to radial highways
- TAX - full-value property-tax rate per $10,000
- PTRATIO - pupil-teacher ratio by town
- B  1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
- LSTAT - % lower status of the population
- MEDV - Median value of owner-occupied homes in $1000's

## Step 3 : Data preprocessing 

In [8]:
data.isnull().sum()

crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
b          0
lstat      0
medv       0
dtype: int64

## Step 4 : Seperate X and y

In [9]:
X = data.drop('medv', axis = 1)
y = data['medv']

## Step 5 : Divide the data into train test split

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

## Step 6 : Fit the KNNRegressor on X_train and y_train

In [13]:
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()
knn

In [15]:
knn.fit(X_train, y_train)

## Step 7: Perform Predictions

In [18]:
y_pred = knn.predict(X_test)
y_pred

array([24.54, 29.34, 14.6 , 28.96, 16.74, 31.88, 19.74, 16.06, 18.02,
       21.18, 23.38, 20.04, 12.28, 21.6 , 23.7 , 22.22, 19.04, 14.06,
       35.14, 10.6 , 25.86, 29.34, 16.78, 21.08, 18.88, 23.16, 22.92,
       12.34, 23.38, 21.44, 22.68, 23.42, 10.3 , 30.04, 17.54, 21.12,
       21.66, 26.28, 22.52, 27.82, 21.18, 30.46, 38.16, 22.4 , 24.84,
       12.6 , 16.78, 28.96, 19.8 , 21.56, 22.44, 34.7 , 17.96, 21.18,
       30.36, 21.38, 12.5 , 35.14, 21.74, 20.6 , 25.3 , 38.82, 28.96,
       16.26, 30.46, 23.38, 12.26, 25.3 , 35.04, 12.34, 20.46, 22.52,
       14.82, 27.6 , 21.4 ,  8.98, 19.68, 38.16, 10.94, 15.04, 22.4 ,
       15.96, 26.3 , 12.24, 21.46, 34.  , 14.16, 22.58, 26.04, 17.86,
       23.24, 15.58, 17.48, 21.7 , 24.76, 17.86, 30.16, 10.76, 10.76,
       11.92, 21.8 , 21.4 , 23.16, 21.28, 22.22, 10.88, 23.96, 24.56,
       22.9 , 25.48,  9.98, 15.62, 23.28, 33.04, 38.6 , 13.46, 32.72,
       17.2 , 21.6 , 22.44, 24.78, 31.62,  9.78, 20.  , 23.8 , 23.7 ,
       25.5 ])

In [17]:
X_test

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
173,0.09178,0.0,4.05,0,0.510,6.416,84.1,2.6463,5,296,16.6,395.50,9.04
274,0.05644,40.0,6.41,1,0.447,6.758,32.9,4.0776,4,254,17.6,396.90,3.53
491,0.10574,0.0,27.74,0,0.609,5.983,98.8,1.8681,4,711,20.1,390.11,18.07
72,0.09164,0.0,10.81,0,0.413,6.065,7.8,5.2873,4,305,19.2,390.91,5.52
452,5.09017,0.0,18.10,0,0.713,6.297,91.8,2.3682,24,666,20.2,385.09,17.27
...,...,...,...,...,...,...,...,...,...,...,...,...,...
418,73.53410,0.0,18.10,0,0.679,5.957,100.0,1.8026,24,666,20.2,16.45,20.62
117,0.15098,0.0,10.01,0,0.547,6.021,82.6,2.7474,6,432,17.8,394.51,10.30
42,0.14150,0.0,6.91,0,0.448,6.169,6.6,5.7209,3,233,17.9,383.37,5.81
322,0.35114,0.0,7.38,0,0.493,6.041,49.9,4.7211,5,287,19.6,396.90,7.70


## Step 8 : Evaluation:

- For Regression problems r2_score is used as the accuracy metrics

In [19]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.639665439953224

#### Observations:

- The KNeighborsRegressor was able to perform 63.9% of correct values of houses

## Fit the KNeighborsRegressor with different value of k

In [28]:
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors = 3)
knn

In [29]:
knn.fit(X_train, y_train)

In [30]:
y_pred_k9 = knn.predict(X_test)

In [31]:
r2_score(y_pred_k9, y_test)

0.6201354132151049