In [21]:
import pandas as pd
from copy import deepcopy
from collections import Counter
import random

import numpy as np
import statsmodels.api as sm

# EDA: Exploratory Data Analysis

In [22]:
df = pd.read_csv('Bengaluru_House_Data.csv')

In [23]:
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [24]:
df.dropna(subset=['size'], inplace = True)

In [25]:
df['size_num'] = [float(i.split()[0]) for i in list(df['size'].values)]

In [26]:
df['availability'].unique()

array(['19-Dec', 'Ready To Move', '18-May', '18-Feb', '18-Nov', '20-Dec',
       '17-Oct', '21-Dec', '19-Sep', '20-Sep', '18-Mar', '20-Feb',
       '18-Apr', '20-Aug', '18-Oct', '19-Mar', '17-Sep', '18-Dec',
       '17-Aug', '19-Apr', '18-Jun', '22-Dec', '22-Jan', '18-Aug',
       '19-Jan', '17-Jul', '18-Jul', '21-Jun', '20-May', '19-Aug',
       '18-Sep', '17-May', '17-Jun', '21-May', '18-Jan', '20-Mar',
       '17-Dec', '16-Mar', '19-Jun', '22-Jun', '19-Jul', '21-Feb',
       '19-May', '17-Nov', '20-Oct', '20-Jun', '19-Feb', '21-Oct',
       '21-Jan', '17-Mar', '17-Apr', '22-May', '19-Oct', '21-Jul',
       '21-Nov', '21-Mar', '16-Dec', '22-Mar', '20-Jan', '21-Sep',
       '21-Aug', '14-Nov', '19-Nov', '15-Nov', '16-Jul', '15-Jun',
       '17-Feb', '20-Nov', '20-Jul', '16-Sep', '15-Oct', '15-Dec',
       '16-Oct', '22-Nov', '15-Aug', '17-Jan', '16-Nov', '20-Apr',
       '16-Jan', '14-Jul'], dtype=object)

In [27]:
df['availability_num'] = [1 if x == 'Ready To Move' else 0 for x in list(df['availability'].values)]

In [28]:
df['society_num'] = [1 if str(x) != 'nan' else 0 for x in list(df['society'].values)]

In [29]:
Counter(df['society_num'].values)

Counter({1: 7805, 0: 5499})

In [30]:
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price,size_num,availability_num,society_num
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07,2.0,0,1
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0,4.0,1,1
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0,3.0,1,0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0,3.0,1,1
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0,2.0,1,0


In [31]:
req_cols = ['total_sqft', 'bath', 'balcony', 'price', 'size_num', 'availability_num', 'society_num']

In [32]:
df = df.loc[df['total_sqft'].str.isnumeric()]
print(df.shape)

(12978, 12)


# Data Cleaning for "total_sqft"

In [13]:
df['total_sqft'] = [x.split(" ")[0] for x in list(df['total_sqft'].values)]
df['total_sqft'] = [x.split(".")[0] for x in list(df['total_sqft'].values)]
df['total_sqft'] = [x.split("P")[0] for x in list(df['total_sqft'].values)]
df['total_sqft'] = [x.split("S")[0] for x in list(df['total_sqft'].values)]
df['total_sqft'] = [x.split("A")[0] for x in list(df['total_sqft'].values)]
df['total_sqft'] = [x.split("C")[0] for x in list(df['total_sqft'].values)]
df['total_sqft'] = [x.split("G")[0] for x in list(df['total_sqft'].values)]
df['total_sqft'] = [float(x) for x in list(df['total_sqft'].values)]

In [14]:
df_2 = df[req_cols]

In [15]:
print("Before dropping nulls:", df_2.shape[0])
df_2.dropna(inplace = True)
print("After dropping nulls:", df_2.shape[0])

Before dropping nulls: 12978
After dropping nulls: 12439


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2.dropna(inplace = True)


In [16]:
inx = list(range(df_2.shape[0]))

In [17]:
random.shuffle(inx)

In [18]:
df_train = df_2.iloc[inx[:round(len(inx) * 0.8)]]

In [123]:
df_test = df_2.iloc[inx[round(len(inx) * 0.8):]]

In [124]:
print(df_train.shape)
print(df_test.shape)

(9951, 7)
(2488, 7)


In [125]:
train_cols = ['total_sqft', 'bath', 'balcony', 'size_num', 'availability_num', 'society_num']

# LinearRegression from sklearn.linear_model 

In [126]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression().fit(df_train[train_cols], df_train['price'])
print(reg.score(df_train[train_cols], df_train['price']))
print(reg.coef_)
print(reg.intercept_)

0.39669894089032764
[  0.0556212   30.04993502  -0.70047741  -2.77368246   0.1812153
 -10.52677754]
-42.39508432022454


In [127]:
reg.predict(df_test[train_cols])

array([276.15026388, 212.72760966, 108.98275056, ...,  86.33045425,
        53.95151987,  71.19409146])

# SGDRegressor from sklearn.linear_model

In [129]:
from sklearn.linear_model import SGDRegressor
sgd_reg = SGDRegressor(penalty=None, eta0=0.1)
sgd_reg.fit(df_train[train_cols], df_train['price'])

print(sgd_reg.intercept_, sgd_reg.coef_)

[5.13437301e+12] [5.73756797e+12 8.93653775e+12 6.69762700e+12 1.00933118e+13
 4.48916723e+12 3.99501913e+12]


In [130]:
sgd_reg.predict(df_test[train_cols])

array([2.19085941e+16, 1.84571168e+16, 8.39114173e+15, ...,
       8.06962990e+15, 5.50906546e+15, 7.28771153e+15])

# SVR (Support Vector Regression) from sklearn.svm

In [132]:
%time
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

regr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
regr.fit(df_train[train_cols], df_train['price'])

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 8.82 µs


Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svr', SVR(epsilon=0.2))])

In [133]:
regr.predict(df_test[train_cols])

array([220.92429802, 184.23748477,  91.07650626, ...,  59.08687972,
        43.38189998,  59.85174899])

# DecisionTreeRegressor from sklearn.tree

In [136]:
from sklearn.tree import DecisionTreeRegressor

regressor = DecisionTreeRegressor(random_state=0).fit(df_train[train_cols], df_train['price'])

In [137]:
regressor.predict(df_test[train_cols])

array([240.        , 165.        , 100.        , ...,  62.78      ,
        48.65071429,  73.458     ])