# Bangalore House Price Prediction

# CampusX

In [53]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("Bengaluru_House_Data.csv")

In [3]:
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
df.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [6]:
df.isna().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

- bathroom column has a value with 40 bathrooms (may be outlier)
- location column has one null value
- size has 16 nulls
- society has 5502 nulls(very much) may be dropped
- bath has 73 nulls and balcony has 609 nulls

In [7]:
for col in df.columns:
    print(df[col].value_counts())
    print("-"*30)

Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: area_type, dtype: int64
------------------------------
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
15-Aug               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: availability, Length: 81, dtype: int64
------------------------------
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: location, Length: 1305, dtype: int64
------------------------------
2 

# Observations:

- Availability 
- Location has many values which has occured only once (it'll create too many columns while one hot encoding, so will replace it with "other"
- size column has 2 types of values like BHK and Bedroom so needed to fix
- society column will be dropped
- total_sqft column has few values with range like(100-120), we'll take it's mean
- bathroom has a value like 40, may be an outlier

In [8]:
# dropping few unused columns

df.drop(columns = ['area_type','availability','society','balcony'], inplace = True)

In [9]:
df.describe()

Unnamed: 0,bath,price
count,13247.0,13320.0
mean,2.69261,112.565627
std,1.341458,148.971674
min,1.0,8.0
25%,2.0,50.0
50%,2.0,72.0
75%,3.0,120.0
max,40.0,3600.0


In [10]:
df.location.value_counts()

Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: location, Length: 1305, dtype: int64

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13319 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [12]:
# Filling up missing  values

df['location'] = df['location'].fillna('Sarjapur  Road')

df['size'] = df['size'].fillna('2 BHK')

df['bath'] = df['bath'].fillna(df['bath'].median())

In [13]:
# total_sqft column-

def rangeColumn(x):
    temp = x.split("-")
    if(len(temp) == 2):
        return (float(temp[0]) + float(temp[1]))/2
    try:
        return float(x) # if the record is not range value i.e return the same value
    except:
        return None  # if any problem occurs while converting to float

In [14]:
df['total_sqft'] = df['total_sqft'].apply(rangeColumn)

In [15]:
df['total_sqft'].unique()

array([1056. , 2600. , 1440. , ..., 1258.5,  774. , 4689. ])

In [16]:
# handling size column

df['new_size_in_bhk'] = df['size'].str.split(" ").str.get(0).astype(int)

In [17]:
df['new_size_in_bhk'] # we have extracted integer values from size column and converted it into int type
# now we can drop previous size column 

0        2
1        4
2        3
3        3
4        2
        ..
13315    5
13316    4
13317    2
13318    4
13319    1
Name: new_size_in_bhk, Length: 13320, dtype: int32

In [18]:
df[df['new_size_in_bhk']>20]  # 2 records with more then 20 BHK(may be outliers)

Unnamed: 0,location,size,total_sqft,bath,price,new_size_in_bhk
1718,2Electronic City Phase II,27 BHK,8000.0,27.0,230.0,27
4684,Munnekollal,43 Bedroom,2400.0,40.0,660.0,43


In [19]:
df.sample(10)

Unnamed: 0,location,size,total_sqft,bath,price,new_size_in_bhk
5904,Electronic City,2 BHK,1070.0,2.0,52.0,2
3760,Electronic City,3 BHK,1571.0,3.0,105.0,3
326,Haralur Road,3 BHK,1464.0,3.0,56.0,3
1775,IVC Road,2 BHK,3817.0,2.0,124.0,2
12397,Raja Rajeshwari Nagar,3 BHK,1400.0,2.0,86.0,3
251,Anand Nagar,2 BHK,1060.0,2.0,55.0,2
8747,Sarjapur,4 Bedroom,3300.0,4.0,430.0,4
3831,Vijaya Bank Colony,3 BHK,1400.0,2.0,62.0,3
10443,Hennur Road,3 BHK,1904.0,3.0,129.0,3
283,Electronics City Phase 1,3 BHK,1490.0,3.0,78.8,3


# Price per square feet calculation- 
- it'll help us in removing outliers (where price per sqft is too high)

In [20]:
df['price_per_sqft'] = (df['price']*100000)/df['total_sqft']

In [21]:
df.describe()

Unnamed: 0,total_sqft,bath,price,new_size_in_bhk,price_per_sqft
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1559.626694,2.688814,112.565627,2.802778,7907.501
std,1238.405258,1.338754,148.971674,1.294496,106429.6
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4266.865
50%,1276.0,2.0,72.0,3.0,5434.306
75%,1680.0,3.0,120.0,3.0,7311.746
max,52272.0,40.0,3600.0,43.0,12000000.0


# price_per_sqft analysis
- mean is around 8000
- 5400 is median
- max value is too high(may be an outlier)


In [22]:
# Location column has too many distinct values( creates too many columns while OneHotEncoding)

df['location'].value_counts()

Whitefield                        540
Sarjapur  Road                    400
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: location, Length: 1305, dtype: int64

In [23]:
df['location'] = df['location'].apply(lambda x: x.strip())  # removal of leading and trailing space
# strip() is an inbuilt function in Python programming language that returns a copy of the string with both leading and trailing characters removed (based on the string argument passed).
location_count = df['location'].value_counts()
location_count

Whitefield                        541
Sarjapur  Road                    400
Electronic City                   304
Kanakpura Road                    273
Thanisandra                       237
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: location, Length: 1294, dtype: int64

In [24]:
# we'll keep only those location which has occured 10 or more times and replace all other values of location with "other"

location_count_less_10 = location_count[location_count<10]
location_count_less_10  # 1040 records

Vishwanatha Nagenahalli           9
Chennammana Kere                  9
2nd Phase JP Nagar                9
Jakkur Plantation                 9
B Narayanapura                    9
                                 ..
Bapuji Layout                     1
1st Stage Radha Krishna Layout    1
BEML Layout 5th stage             1
singapura paradise                1
Abshot Layout                     1
Name: location, Length: 1040, dtype: int64

In [25]:
df['location'] =df['location'].apply(lambda x: 'other' if x in location_count_less_10 else x)

In [26]:
df['location'].value_counts() # now we are left with 255 diff values in location column only - that's cool

other                  2755
Whitefield              541
Sarjapur  Road          400
Electronic City         304
Kanakpura Road          273
                       ... 
BTM 1st Stage            10
Basapura                 10
Sector 1 HSR Layout      10
Kalkere                  10
Nagadevanahalli          10
Name: location, Length: 255, dtype: int64

# Outlier Detection and removal

In [27]:
df.describe()

Unnamed: 0,total_sqft,bath,price,new_size_in_bhk,price_per_sqft
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1559.626694,2.688814,112.565627,2.802778,7907.501
std,1238.405258,1.338754,148.971674,1.294496,106429.6
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4266.865
50%,1276.0,2.0,72.0,3.0,5434.306
75%,1680.0,3.0,120.0,3.0,7311.746
max,52272.0,40.0,3600.0,43.0,12000000.0


# Outliers
- total_sqft has a value with 1 sqft which is not possible obviously(outlier)


In [28]:
# df = df[df['total_sqft']>=500]  # 100 sqft se km me to kya hi ghr ya room bnega, so'll keep only those values where sqft>=100

In [29]:
df.shape

(13320, 7)

In [30]:
df

Unnamed: 0,location,size,total_sqft,bath,price,new_size_in_bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.00,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.00,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.00,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.00,2,4250.000000
...,...,...,...,...,...,...,...
13315,Whitefield,5 Bedroom,3453.0,4.0,231.00,5,6689.834926
13316,other,4 BHK,3600.0,5.0,400.00,4,11111.111111
13317,Raja Rajeshwari Nagar,2 BHK,1141.0,2.0,60.00,2,5258.545136
13318,Padmanabhanagar,4 BHK,4689.0,4.0,488.00,4,10407.336319


# An alternate

- we'll check sqft per bhk means 1 bhk me around kitne sqft hone chaiye and when we do it we see a value =0 but obviously 0 sqft me to ghr ni bnega to remove it

In [31]:
(df['total_sqft']/df['new_size_in_bhk']).describe()
df= df[((df['total_sqft']/df['new_size_in_bhk']) >= 300 )]
df.describe()

Unnamed: 0,total_sqft,bath,price,new_size_in_bhk,price_per_sqft
count,12530.0,12530.0,12530.0,12530.0,12530.0
mean,1594.564544,2.559537,111.382401,2.650838,6303.979357
std,1261.271296,1.077938,152.077329,0.976678,4162.237981
min,300.0,1.0,8.44,1.0,267.829813
25%,1116.0,2.0,49.0,2.0,4210.526316
50%,1300.0,2.0,70.0,3.0,5294.117647
75%,1700.0,3.0,115.0,3.0,6916.666667
max,52272.0,16.0,3600.0,16.0,176470.588235


In [32]:
df.shape

(12530, 7)

In [33]:
# Outlier detection in price_per_sqft

df['price_per_sqft'].describe()

# 176470 is the max value of price_per_sqft (outlier - as it is too too high)

count     12530.000000
mean       6303.979357
std        4162.237981
min         267.829813
25%        4210.526316
50%        5294.117647
75%        6916.666667
max      176470.588235
Name: price_per_sqft, dtype: float64

In [34]:
def remove_outliers_sqft(df):
    df_output = pd.DataFrame()
    for key, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
    
        st = np.std(subdf.price_per_sqft)
    
        gen_df = subdf[(subdf.price_per_sqft > (m-st)) & (subdf.price_per_sqft <= (m+st))]
        df_output = pd.concat([df_output , gen_df] , ignore_index=True)
    return df_output

df = remove_outliers_sqft(df)
df.describe()


# outliers managed

Unnamed: 0,total_sqft,bath,price,new_size_in_bhk,price_per_sqft
count,10282.0,10282.0,10282.0,10282.0,10282.0
mean,1509.996956,2.474032,91.718675,2.576347,5669.007067
std,883.210082,0.986908,88.327858,0.900592,2292.794131
min,300.0,1.0,10.0,1.0,1250.0
25%,1110.0,2.0,49.0,2.0,4249.917219
50%,1286.0,2.0,67.0,2.0,5176.565008
75%,1650.0,3.0,100.0,3.0,6431.808627
max,30400.0,16.0,2200.0,16.0,24509.803922


In [35]:
df['new_size_in_bhk'].describe()

count    10282.000000
mean         2.576347
std          0.900592
min          1.000000
25%          2.000000
50%          2.000000
75%          3.000000
max         16.000000
Name: new_size_in_bhk, dtype: float64

In [36]:
def bhk_outlier_remover(df):
    exclude_indices = np.array([])
    for location, location_df in df.groupby("location"):
        bhk_stats = {}
        for bhk, bhk_df in location_df.groupby('new_size_in_bhk'):
            bhk_stats[bhk] = {
                'mean' : np.mean(bhk_df.price_per_sqft),
                'std'  : np.std(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0]
            }
            
        for bhk, bhk_df in location_df.groupby('new_size_in_bhk'):
            stats = bhk_stats.get(bhk-1)
            if stats and stats['count'] > 5:
                exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft<(stats['mean'])].index.values)
                
    return df.drop(exclude_indices, axis='index')

In [37]:
df = bhk_outlier_remover(df)

In [38]:
df.shape

(7401, 7)

In [39]:
df.drop(columns = ['price_per_sqft' , 'size'] , inplace=True)

# price_per_sqft was only created to detect outlier

In [40]:
df.head()

Unnamed: 0,location,total_sqft,bath,price,new_size_in_bhk
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,235.0,3
3,1st Block Jayanagar,1200.0,2.0,130.0,3
4,1st Block Jayanagar,1235.0,2.0,148.0,2


In [41]:
df.to_csv('cleaned bengaluru price predict data.csv')

# Model Building

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [43]:
x = df.drop(columns = ['price'])
y=df['price']

In [44]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25,random_state=0)

In [45]:
print(x_train.shape , x_test.shape)

(5550, 4) (1851, 4)


# Linear Regression

In [46]:
colum_trans = make_column_transformer((OneHotEncoder(sparse = False),['location']) , remainder='passthrough')
# apply OneHotEncoder on Location(categorical column) 
# sparse = False


scaler = StandardScaler()

lr = LinearRegression(normalize = True)

pipe = make_pipeline(colum_trans, scaler, lr)
# data in pipeline will first go through column_trans then scaled then fit Linear regression


pipe.fit(x_train, y_train)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)




Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  ['location'])])),
                ('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression(normalize=True))])

In [47]:
y_predict_lr = pipe.predict(x_test)

In [48]:
r2_score(y_test, y_predict_lr)

0.8774425387697149

# Applying Lasso

In [49]:
lasso = Lasso()
pipe = make_pipeline(colum_trans, scaler, lasso)
pipe.fit(x_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  ['location'])])),
                ('standardscaler', StandardScaler()), ('lasso', Lasso())])

In [50]:
y_predict_lasso = pipe.predict(x_test)

r2_score(y_test, y_predict_lasso)

0.8588678522592751

# Applying Ridge

In [51]:
ridge = Ridge()
pipe = make_pipeline(colum_trans, scaler, ridge)
pipe.fit(x_train, y_train)

y_predict_ridge = pipe.predict(x_test)

r2_score(y_test, y_predict_ridge)

0.8774307430379548

In [52]:
print("Linear Regression : ", r2_score(y_test, y_predict_lr))
print("Lasso Regression : ", r2_score(y_test, y_predict_lasso))
print("Ridge Regression : ", r2_score(y_test, y_predict_ridge))


Linear Regression :  0.8774425387697149
Lasso Regression :  0.8588678522592751
Ridge Regression :  0.8774307430379548
