# Step 1: Problem statement

- To predict the house price in boston city of Untied States given with total number of rooms per house, age of house, nox level in the city, crime rate in the city, etc

- To understand the column names meanings use the link: https://www.kaggle.com/c/boston-housing

# Step 2 : Load the necessary libraries and load the data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv("BostonHousing.csv")
data.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [3]:
data.shape

(506, 14)

# Step 3: Data Cleaning, Data Wrangling and Data Preprocessing 

In [4]:
data.isnull().sum()

crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
b          0
lstat      0
medv       0
dtype: int64

# Step 4 : Seperate X and y

In [5]:
X = data.drop('medv', axis = 1)
y = data['medv']

# Step 5: Split X and y into train and test set

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

# Step 6: Apply KNN Regression on X_train and y_train

In [7]:
from sklearn.neighbors import KNeighborsRegressor
knr = KNeighborsRegressor()
knr

In [8]:
knr.fit(X_train, y_train)

# Step 7: Perform predictions on X_test

In [9]:
y_pred = knr.predict(X_test)
y_pred

array([23.34, 29.54, 23.08, 10.76, 20.42, 21.4 , 22.96, 25.02, 29.32,
       18.26, 11.08, 13.4 , 16.76,  8.76, 38.84, 25.34, 21.98, 23.44,
       24.32, 27.28, 23.56, 20.32, 19.02, 29.18, 21.16, 12.64, 18.48,
       24.38, 24.02, 17.4 , 16.04, 20.16, 22.32, 25.  , 24.6 , 19.04,
        8.76, 20.08, 13.14, 14.7 , 25.14, 21.16, 20.18, 19.04, 20.2 ,
       24.12, 23.26, 21.6 , 18.38, 20.04, 18.26, 20.78, 29.64, 24.34,
       22.64, 22.46, 23.4 , 19.58, 10.48, 22.3 , 32.88, 20.72, 25.48,
       28.04, 22.52, 32.88, 19.04, 18.06, 13.9 , 29.34, 31.16, 22.22,
       24.72, 30.94, 22.52, 11.64, 32.32, 22.84, 25.04, 20.46, 25.22,
       18.22, 16.38, 32.68, 28.02, 29.34, 23.8 , 13.56, 32.24, 20.56,
       32.46, 12.64, 22.74, 25.02, 24.04, 22.28, 10.68, 25.06, 12.8 ,
       22.72, 29.42, 19.56, 26.5 , 28.04, 25.36, 20.08, 12.64, 16.76,
       22.8 , 25.48, 32.24, 16.76, 18.06, 19.66, 16.86, 22.12, 12.28,
       17.86, 11.96, 40.1 , 32.88, 11.48, 24.3 , 21.44, 23.04, 19.04,
       29.42, 19.62,

In [10]:
X_test

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
329,0.06724,0.0,3.24,0,0.460,6.333,17.2,5.2146,4,430,16.9,375.21,7.34
371,9.23230,0.0,18.10,0,0.631,6.216,100.0,1.1691,24,666,20.2,366.15,9.53
219,0.11425,0.0,13.89,1,0.550,6.373,92.4,3.3633,5,276,16.4,393.74,10.50
403,24.80170,0.0,18.10,0,0.693,5.349,96.0,1.7028,24,666,20.2,396.90,19.77
78,0.05646,0.0,12.83,0,0.437,6.232,53.7,5.0141,5,398,18.7,386.40,12.34
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33
428,7.36711,0.0,18.10,0,0.679,6.193,78.1,1.9356,24,666,20.2,96.73,21.52
385,16.81180,0.0,18.10,0,0.700,5.277,98.1,1.4261,24,666,20.2,396.90,30.81
308,0.49298,0.0,9.90,0,0.544,6.635,82.5,3.3175,4,304,18.4,396.90,4.54


# Step 8: Compare y_test and y_pred for accuracy

In [12]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.5166480241894266

In [26]:
from sklearn.neighbors import KNeighborsRegressor
knr = KNeighborsRegressor(n_neighbors = 3, p = 1)
knr

In [27]:
knr.fit(X_train, y_train)

In [28]:
y_pred_3 = knr.predict(X_test)

In [29]:
r2_score(y_test, y_pred_3)

0.5898239317936042