In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
df = pd.read_csv('data/datasets_20710_26737_Bengaluru_House_Data.csv')
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [3]:
# Data cleaning
df1 = df.drop(['area_type','society','balcony','availability'],axis='columns')
df1.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [4]:
df1.isnull().sum()
df2=df1.dropna()

In [5]:
df2['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [6]:
df2['bhk']= df2['size'].apply(lambda x: x.split(' ')[0])
df3 = df2.drop(['size'],axis = 'columns')
df3.tail()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,location,total_sqft,bath,price,bhk
13315,Whitefield,3453,4.0,231.0,5
13316,Richards Town,3600,5.0,400.0,4
13317,Raja Rajeshwari Nagar,1141,2.0,60.0,2
13318,Padmanabhanagar,4689,4.0,488.0,4
13319,Doddathoguru,550,1.0,17.0,1


In [7]:
df3['bhk'].value_counts()


2     5527
3     4832
4     1395
1      649
5      353
6      221
7      100
8       89
9       54
10      14
11       4
13       1
18       1
14       1
19       1
27       1
43       1
16       1
12       1
Name: bhk, dtype: int64

In [8]:
def checkFloat(x):
    try:
        float(x)
    except:
        return False
    return True
        
df3[~df3.total_sqft.apply(checkFloat)].head()

Unnamed: 0,location,total_sqft,bath,price,bhk
30,Yelahanka,2100 - 2850,4.0,186.0,4
122,Hebbal,3067 - 8156,4.0,477.0,4
137,8th Phase JP Nagar,1042 - 1105,2.0,54.005,2
165,Sarjapur,1145 - 1340,2.0,43.49,2
188,KR Puram,1015 - 1540,2.0,56.8,2
410,Kengeri,34.46Sq. Meter,1.0,18.5,1
549,Hennur Road,1195 - 1440,2.0,63.77,2
648,Arekere,4125Perch,9.0,265.0,9
661,Yelahanka,1120 - 1145,2.0,48.13,2
672,Bettahalsoor,3090 - 5002,4.0,445.0,4


In [9]:
# Not going to tackle the unit conversions, however will fix the range problem by taking avg of two
def isSqft(x):
    try:
        if len(x.split('-'))>1:
            return float(x.split('-')[0]) + float(x.split('-')[1])/2
        else:
            return float(x)
    except:
        return None       


In [10]:
#df3['total_sqft']=df3.total_sqft.apply(lambda x: (float(x.split('-')[0]) + float(x.split('-')[1])/2) if len(x.split('-'))>1 else x)
df4 = df3.copy()
df4['total_sqft'] = df4.total_sqft.apply(isSqft)

In [25]:
# Feature Engineering

# Calculdate price/sqft
df5 =  df4.copy()
df5['price_sqft'] = df4['price']*100000/df4['total_sqft']
df5['price_sqft'] = df5['price_sqft'].astype(float)

In [26]:
df5.head()

Unnamed: 0,location,total_sqft,bath,price,bhk,price_sqft
0,Electronic City Phase II,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,1200.0,2.0,51.0,2,4250.0


In [38]:
df5['location'] = df5['location'].apply(lambda x: x.strip())
location_stats = df5.groupby('location')['location'].agg('count').sort_values(ascending =False)
location_stats[location_stats<=10]

In [41]:
df5.location = df5.location.apply(lambda x: 'other' if x in location_stats[location_stats<=10] else x)

In [45]:
len(df5.location.unique()) # This should help when we are gonna apply Onehotencoding

242