### Predicting Home Prices in Banglore-Kaggle Database

Dataset link: https://www.kaggle.com/amitabhajoy/bengaluru-house-price-data

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib 

In [2]:
# Importing data-set
df = pd.read_csv("Bengaluru_House_Data.csv")
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [3]:
df.shape # dataset- shape

(13320, 9)

In [4]:
df.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [5]:
df['area_type'].unique()

array(['Super built-up  Area', 'Plot  Area', 'Built-up  Area',
       'Carpet  Area'], dtype=object)

In [6]:
# Droping some columns
df1 = df.drop(['area_type','society','balcony','availability'],axis='columns')
df1.shape

(13320, 5)

#### Data Cleaning Section

In [7]:
# Missing Value NaN's values
df1.isnull().sum().sort_values(ascending=False) # missing_value_check_sorted_in_ascending_order

bath          73
size          16
location       1
price          0
total_sqft     0
dtype: int64

In [8]:
missing_values=df1.isnull().sum().sum() # missing_value in total dataset (in numbers)
missing_values/len(df1)*100 # missing values in total dataset (in percentage) 

0.6756756756756757

In [9]:
df2 = df1.dropna() # drop_missing_value(as in total missing values is less than one percent)
df2.isnull().sum() # missing_value_check_in_new_dataset

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [10]:
df2.shape 

(13246, 5)

# Feature Engineering-Section

### a- Feature Engineering- Size_feature

In [11]:
df2.head(5)

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [12]:
df2['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [13]:
# remove the string from the size_feature
df2['size'].apply(lambda x: int(x.split(' ')[0])).head(5)

0    2
1    4
2    3
3    3
4    2
Name: size, dtype: int64

In [14]:
df2['bhk'] = df2['size'].apply(lambda x: int(x.split(' ')[0])) # assiging the new feature that hold numeri value

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [15]:
df2['bhk'].unique()

array([ 2,  4,  3,  6,  1,  8,  7,  5, 11,  9, 27, 10, 19, 16, 43, 14, 12,
       13, 18], dtype=int64)

### b- Feature Engineering- Size_feature

In [16]:
df2['total_sqft'].unique()
# treatment needed as feature holds some range values(1133 - 1384)

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [17]:
df2.info() # change the feature into float type first

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13246 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13246 non-null  object 
 1   size        13246 non-null  object 
 2   total_sqft  13246 non-null  object 
 3   bath        13246 non-null  float64
 4   price       13246 non-null  float64
 5   bhk         13246 non-null  int64  
dtypes: float64(2), int64(1), object(3)
memory usage: 724.4+ KB


In [18]:
# conversion function for the changes
def is_float(x):
    try:
        float(x)
    except:
        return False
    return True

In [19]:
df2[~df2['total_sqft'].apply(is_float)].head(10) # return the values are in the ranges

Unnamed: 0,location,size,total_sqft,bath,price,bhk
30,Yelahanka,4 BHK,2100 - 2850,4.0,186.0,4
122,Hebbal,4 BHK,3067 - 8156,4.0,477.0,4
137,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,54.005,2
165,Sarjapur,2 BHK,1145 - 1340,2.0,43.49,2
188,KR Puram,2 BHK,1015 - 1540,2.0,56.8,2
410,Kengeri,1 BHK,34.46Sq. Meter,1.0,18.5,1
549,Hennur Road,2 BHK,1195 - 1440,2.0,63.77,2
648,Arekere,9 Bedroom,4125Perch,9.0,265.0,9
661,Yelahanka,2 BHK,1120 - 1145,2.0,48.13,2
672,Bettahalsoor,4 Bedroom,3090 - 5002,4.0,445.0,4


In [20]:
# treatment required to conver the range into the average values (((2100+2850)/2)=2475.0)

In [21]:
# function of the coversion 
def range_average(x):
    value=x.split('-') # split the range into two items 
    if len(value)==2: # if condtion holds two items 
        return (float(value[0])+float(value[1]))/2 # than conver them into average
    try:
        return float(x) # if holds one values than convert into float
    except:
        return None

In [22]:
#range_average("123")

In [23]:
#range_average("2100 - 2854")

In [24]:
#range_average("34.46Sq. Meter")

In [25]:
df3 = df2.copy() # create the deep copy()
df3.total_sqft = df3.total_sqft.apply(range_average)  # apply the conversion function 
df3 = df3[df3.total_sqft.notnull()] 
df3.head(2)

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4


In [26]:
df3['total_sqft'].unique() # re-check as the range are removed from the the feature 

array([1056. , 2600. , 1440. , ..., 1258.5,  774. , 4689. ])

### c- Feature Engineering- Price_feature 

creating the new feature

In [27]:
df4 = df3.copy()
df4['price_per_sqft'] = df4['price']*100000/df4['total_sqft'] # conver the price into price_per_sqft
df4.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0


In [28]:
# stats for new_feature
df4_stats = df4['price_per_sqft'].describe()
df4_stats

count    1.320000e+04
mean     7.920759e+03
std      1.067272e+05
min      2.678298e+02
25%      4.267701e+03
50%      5.438331e+03
75%      7.317073e+03
max      1.200000e+07
Name: price_per_sqft, dtype: float64

In [29]:
len(df4['location'].unique()) # Location feature unique values 

1298

In [30]:
df4.location = df4.location.apply(lambda x: x.strip()) # remove un-required spaces in the feature

In [31]:
location_stats = df4['location'].value_counts(ascending=False)
location_stats

Whitefield              533
Sarjapur  Road          392
Electronic City         304
Kanakpura Road          264
Thanisandra             235
                       ... 
Mullur                    1
Govindraja Nagar          1
Jagajyothi layout         1
2nd Block Hbr Layout      1
West of Chord Road        1
Name: location, Length: 1287, dtype: int64

### Dimension Reduction

Tagged the values to "other", if they are less than 10 in location_feautre

In [32]:
location_stats_less_than_10 = location_stats[location_stats<=10]
location_stats_less_than_10

Ganga Nagar             10
Sector 1 HSR Layout     10
Thyagaraja Nagar        10
Dairy Circle            10
Nagappa Reddy Layout    10
                        ..
Mullur                   1
Govindraja Nagar         1
Jagajyothi layout        1
2nd Block Hbr Layout     1
West of Chord Road       1
Name: location, Length: 1047, dtype: int64

In [33]:
len(df4.location.unique())

1287

In [34]:
df4.location = df4.location.apply(lambda x: 'other' if x in location_stats_less_than_10 else x) # assgined "other"
len(df4.location.unique()) 

241

In [35]:
df4.head(10)

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0
5,Whitefield,2 BHK,1170.0,2.0,38.0,2,3247.863248
6,Old Airport Road,4 BHK,2732.0,4.0,204.0,4,7467.057101
7,Rajaji Nagar,4 BHK,3300.0,4.0,600.0,4,18181.818182
8,Marathahalli,3 BHK,1310.0,3.0,63.25,3,4828.244275
9,other,6 Bedroom,1020.0,6.0,370.0,6,36274.509804


### Business-Domain_based_call

### 1-BHK related  total_sqft

In [36]:
# logic- ideally one bhk moves around 300 sqft
# check with the same logic in the dataset
df4[df4.total_sqft/df4.bhk<300].head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
9,other,6 Bedroom,1020.0,6.0,370.0,6,36274.509804
45,HSR Layout,8 Bedroom,600.0,9.0,200.0,8,33333.333333
58,Murugeshpalya,6 Bedroom,1407.0,4.0,150.0,6,10660.98081
68,Devarachikkanahalli,8 Bedroom,1350.0,7.0,85.0,8,6296.296296
70,other,3 Bedroom,500.0,3.0,100.0,3,20000.0


In [37]:
df5 = df4[~(df4.total_sqft/df4.bhk<300)] # negate those values where the sqft values are less than 300 per Bedroom
df5.shape

(12456, 7)

In [38]:
df5.price_per_sqft.describe() # still there is gap between the max & min of the sqft area

count     12456.000000
mean       6308.502826
std        4168.127339
min         267.829813
25%        4210.526316
50%        5294.117647
75%        6916.666667
max      176470.588235
Name: price_per_sqft, dtype: float64

In [39]:
# outlier removal

def remove_pps_outliers(df):
    df_out = pd.DataFrame()
    for key, subdf in df.groupby('location'): # grouping as per location & have the subdf as per the location 
        m = np.mean(subdf.price_per_sqft)  # mean for location 
        st = np.std(subdf.price_per_sqft) # std deviation for location 
        reduced_df = subdf[(subdf.price_per_sqft>(m-st)) & (subdf.price_per_sqft<=(m+st))] # condition for one std deviation 
        df_out = pd.concat([df_out,reduced_df],ignore_index=True) # create the new datafram 
    return df_out
df6 = remove_pps_outliers(df5)
df6.shape

(10242, 7)

In [40]:
df6.price_per_sqft.describe()

count    10242.000000
mean      5657.735993
std       2266.368844
min       1250.000000
25%       4244.796706
50%       5172.846776
75%       6426.000592
max      24509.803922
Name: price_per_sqft, dtype: float64

In [41]:
# bathroom feautre
df6.bath.unique()

array([ 4.,  3.,  2.,  5.,  8.,  1.,  6.,  7.,  9., 12., 16., 13.])

In [42]:
df6[df6.bath>10]

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
5277,Neeladri Nagar,10 BHK,4000.0,12.0,160.0,10,4000.0
8483,other,10 BHK,12000.0,12.0,525.0,10,4375.0
8572,other,16 BHK,10000.0,16.0,550.0,16,5500.0
9306,other,11 BHK,6000.0,12.0,150.0,11,2500.0
9637,other,13 BHK,5425.0,13.0,275.0,13,5069.124424


In [43]:
df6[df6.bath>df6.bhk+2] # take the buisness call to remove the record holds BHK+2 bathroom 

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
1626,Chikkabanavar,4 Bedroom,2460.0,7.0,80.0,4,3252.03252
5238,Nagasandra,4 Bedroom,7000.0,8.0,450.0,4,6428.571429
6711,Thanisandra,3 BHK,1806.0,6.0,116.0,3,6423.03433
8408,other,6 BHK,11338.0,9.0,1000.0,6,8819.897689


In [44]:
df7 = df6[df6.bath<df6.bhk+2] # outlier remove
df7.shape

(10148, 7)

In [45]:
df8 = df7.drop(['size','price_per_sqft'],axis='columns')
df8.head(3)

Unnamed: 0,location,total_sqft,bath,price,bhk
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,235.0,3


### Use One Hot Encoding For Location

In [46]:
dummies = pd.get_dummies(df8.location)
dummies.head(3)

Unnamed: 0,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,9th Phase JP Nagar,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
df9 = pd.concat([df8,dummies.drop('other',axis='columns')],axis='columns')
df9.head()

Unnamed: 0,location,total_sqft,bath,price,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1st Block Jayanagar,2850.0,4.0,428.0,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1st Block Jayanagar,1630.0,3.0,194.0,3,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1st Block Jayanagar,1875.0,2.0,235.0,3,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1st Block Jayanagar,1200.0,2.0,130.0,3,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1st Block Jayanagar,1235.0,2.0,148.0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
df10 = df9.drop('location',axis='columns')
df10.head(3)

Unnamed: 0,total_sqft,bath,price,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,2850.0,4.0,428.0,4,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1630.0,3.0,194.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1875.0,2.0,235.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
df10.shape

(10148, 244)

### Model Building


In [50]:
X = df10.drop(['price'],axis='columns')
X.head(3)

Unnamed: 0,total_sqft,bath,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,2850.0,4.0,4,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1630.0,3.0,3,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1875.0,2.0,3,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
X.shape

(10148, 243)

In [52]:
y = df10.price
y.head(3)

0    428.0
1    194.0
2    235.0
Name: price, dtype: float64

In [53]:
len(y)

10148

# Train_test using the Model

In [54]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

# model-LinearRegression

In [55]:
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)

0.7783231781110631

### Score-1 (0.7783231781110631 )

# K-Fold using the Model

In [56]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

cross_val_score(LinearRegression(), X, y, cv=cv)

array([0.81567128, 0.87002069, 0.80226713, 0.82194915, 0.7501478 ])

In [57]:
score=([0.81567128, 0.87002069, 0.80226713, 0.82194915, 0.7501478 ]) # assgined the score

In [58]:
np.mean(score) # mean of the five score

0.81201121

### Score-2 (0.81201121)

In [61]:
def predict_price(location,sqft,bath,bhk):    
    loc_index = np.where(X.columns==location)[0][0]

    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk
    if loc_index >= 0:
        x[loc_index] = 1

    return lr_clf.predict([x])[0]