# House Price Prediction

## Problem statement

The real estate market is influenced by numerous factors such as location, size, number of bedrooms and bathrooms, neighborhood amenities, and economic conditions. Predicting house prices accurately is essential for various stakeholders including homebuyers, sellers, real estate agents, and investors.

## Importing libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# house pricing dataset
df = pd.read_csv('ParisHousing.csv')

In [3]:
df.head()

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price
0,75523,3,0,1,63,9373,3,8,2005,0,1,4313,9005,956,0,7,7559081.5
1,80771,39,1,1,98,39381,8,6,2015,1,0,3653,2436,128,1,2,8085989.5
2,55712,58,0,1,19,34457,6,8,2021,0,0,2937,8852,135,1,9,5574642.1
3,32316,47,0,0,6,27939,10,4,2012,0,1,659,7141,359,0,3,3232561.2
4,70429,19,1,1,90,38045,3,7,1990,1,0,8435,2429,292,1,4,7055052.0


## 2. Data Preprocessing

In [4]:
df.shape

(10000, 17)

In [5]:
df.describe()

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,49870.1312,50.3584,0.5087,0.4968,50.2763,50225.4861,5.5101,5.5217,2005.4885,0.4991,0.4999,5033.1039,5028.0106,553.1212,0.503,4.9946,4993448.0
std,28774.37535,28.816696,0.499949,0.500015,28.889171,29006.675799,2.872024,2.856667,9.30809,0.500024,0.500025,2876.729545,2894.33221,262.05017,0.500016,3.17641,2877424.0
min,89.0,1.0,0.0,0.0,1.0,3.0,1.0,1.0,1990.0,0.0,0.0,0.0,1.0,100.0,0.0,0.0,10313.5
25%,25098.5,25.0,0.0,0.0,25.0,24693.75,3.0,3.0,1997.0,0.0,0.0,2559.75,2512.0,327.75,0.0,2.0,2516402.0
50%,50105.5,50.0,1.0,0.0,50.0,50693.0,5.0,5.0,2005.5,0.0,0.0,5092.5,5045.0,554.0,1.0,5.0,5016180.0
75%,74609.75,75.0,1.0,1.0,76.0,75683.25,8.0,8.0,2014.0,1.0,1.0,7511.25,7540.5,777.25,1.0,8.0,7469092.0
max,99999.0,100.0,1.0,1.0,100.0,99953.0,10.0,10.0,2021.0,1.0,1.0,10000.0,10000.0,1000.0,1.0,10.0,10006770.0


In [6]:
df.isnull().sum()

squareMeters         0
numberOfRooms        0
hasYard              0
hasPool              0
floors               0
cityCode             0
cityPartRange        0
numPrevOwners        0
made                 0
isNewBuilt           0
hasStormProtector    0
basement             0
attic                0
garage               0
hasStorageRoom       0
hasGuestRoom         0
price                0
dtype: int64

## 3. Split into feature and target

In [7]:
## Dividing the dataset into independent and dependent feature
feature = df.iloc[:,:-1]
target = df.iloc[:,-1]

In [8]:
feature.shape

(10000, 16)

In [9]:
target.shape

(10000,)

In [10]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

std_data = scaler.fit_transform(feature)
std_data

array([[ 0.89156241, -1.64351831, -1.01755405, ...,  1.53748782,
        -1.00601811,  0.63137326],
       [ 1.07395603, -0.39418009,  0.98274878, ..., -1.62237047,
         0.99401789, -0.9428096 ],
       [ 0.20303346,  0.26519286, -1.01755405, ..., -1.59565669,
         0.99401789,  1.26104641],
       ...,
       [ 1.18065352, -1.64351831, -1.01755405, ..., -0.79424336,
         0.99401789,  1.26104641],
       [ 0.31855868,  0.68163893, -1.01755405, ..., -0.81714088,
         0.99401789, -0.31313646],
       [-1.68318347,  1.16749268, -1.01755405, ..., -1.04993237,
         0.99401789,  0.31653669]])

In [11]:
feature = std_data

## 4. Train test split

In [12]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(feature,target,test_size=0.2,random_state=42)

In [13]:
print(X_train.shape,y_train.shape)

(8000, 16) (8000,)


In [14]:
print(X_test.shape,y_test.shape)

(2000, 16) (2000,)


## 5. Training Model

In [15]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()

In [16]:
lin_reg.fit(X_train,y_train)

## 6. Model Evaluation

In [17]:
from sklearn.metrics import r2_score
y_pred = lin_reg.predict(X_test)
r2_score(y_pred,y_test)

0.9999995780585409

## 7. Predictive System

In [22]:
input = np.array([4.9795000e+04, 1.9000000e+01, 0.0000000e+00, 0.0000000e+00,
       4.3000000e+01, 9.6569000e+04, 1.0000000e+01, 1.0000000e+00,
       2.0130000e+03, 1.0000000e+00, 0.0000000e+00, 8.5200000e+02,
       9.6040000e+03, 6.1900000e+02, 0.0000000e+00, 8.0000000e+00,]).reshape(1,-1)
std_input = scaler.transform(input)
price = lin_reg.predict(std_input)
price[0]



4982639.157273832

In [48]:
df.iloc[6767].values

array([4.9795000e+04, 1.9000000e+01, 0.0000000e+00, 0.0000000e+00,
       4.3000000e+01, 9.6569000e+04, 1.0000000e+01, 1.0000000e+00,
       2.0130000e+03, 1.0000000e+00, 0.0000000e+00, 8.5200000e+02,
       9.6040000e+03, 6.1900000e+02, 0.0000000e+00, 8.0000000e+00,
       4.9832823e+06])

In [19]:
df.iloc[3421].values

array([1.6218000e+04, 6.6000000e+01, 1.0000000e+00, 1.0000000e+00,
       7.0000000e+01, 4.4999000e+04, 5.0000000e+00, 1.0000000e+01,
       2.0150000e+03, 1.0000000e+00, 1.0000000e+00, 6.6910000e+03,
       5.4800000e+02, 2.2000000e+02, 1.0000000e+00, 8.0000000e+00,
       1.6301485e+06])

In [21]:
input = np.array([1.6218000e+04, 6.6000000e+01, 1.0000000e+00, 1.0000000e+00,
       7.0000000e+01, 4.4999000e+04, 5.0000000e+00, 1.0000000e+01,
       2.0150000e+03, 1.0000000e+00, 1.0000000e+00, 6.6910000e+03,
       5.4800000e+02, 2.2000000e+02, 1.0000000e+00, 8.0000000e+00]).reshape(1,-1)
std_input = scaler.transform(input)
price = lin_reg.predict(std_input)
price[0]



1632347.3903009188