In [3]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams["figure.figsize"]=(20,10)

In [4]:
data_frame=pd.read_csv(".//data//Bengaluru_House_Data.csv")
data_frame.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [5]:
data_frame.shape

(13320, 9)

In [6]:
#Grouping data based on area

data_frame.groupby('area_type')['area_type'].agg('count')

area_type
Built-up  Area          2418
Carpet  Area              87
Plot  Area              2025
Super built-up  Area    8790
Name: area_type, dtype: int64

In [7]:
data_frame.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [8]:
#Data cleaning
"""
In data cleaning most basic part is to handle null value. So first of all we will find
null values in all the columns and then we will replace it with some numeric data(like we
can taken median of all value and fill it with them) or we can drop them as well and for
non-numeric data either we can drop them or we can fill thwm with most common data which is
avilable in that column i.e. 
"""
#fetching all the null values in each columns
data_frame.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [9]:
data_frame['bath'] = data_frame['bath'].fillna(data_frame['bath'].median())
data_frame['balcony'] = data_frame['balcony'].fillna(data_frame['balcony'].median())


In [10]:
data_frame.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath               0
balcony            0
price              0
dtype: int64

In [11]:
"""
For non-numeric data we are finding the list of most used value and then we are replacing 
value at 0 index with 
"""
data_frame['location'] = data_frame['location'].fillna(data_frame['location'].mode()[0])
data_frame['society'] = data_frame['society'].fillna(data_frame['society'].mode()[0])
data_frame['size'] = data_frame['size'].fillna(data_frame['size'].mode()[0])


In [12]:
data_frame.isnull().sum()

area_type       0
availability    0
location        0
size            0
society         0
total_sqft      0
bath            0
balcony         0
price           0
dtype: int64

In [29]:
data_frame['size'].unique() 
"""
If we see carefully in our data few of the values are same but they are written in 2 
different ways i.e. 6BHK and 6 Bedroom.
So we will solve this first.
"""

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [33]:
"""
Now we will divide the size column in 2 section first will contain numeric value and 
second will non-numeric. i.e 4 BHK can be split into 4 and BHK.
Then we will store this values in new column.
"""
data_frame['bhk']=data_frame['size'].apply(lambda x: int(x.split(' ')[0]))

In [37]:
data_frame.head()
"""
Now every value for bhk is same so we can drop our size column
"""

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price,bhk
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07,2
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0,4
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,GrrvaGr,1440,2.0,3.0,62.0,3
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0,3
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,GrrvaGr,1200,2.0,1.0,51.0,2


In [None]:
data_frame.drop(columns=['size'], inplace=True)


In [51]:
data_frame.head()
"""
Now the size column has been removed and we have store it's value in bhk column now we 
have clean data
"""

Unnamed: 0,area_type,availability,location,society,total_sqft,bath,balcony,price,bhk
0,Super built-up Area,19-Dec,Electronic City Phase II,Coomee,1056,2.0,1.0,39.07,2
1,Plot Area,Ready To Move,Chikka Tirupathi,Theanmp,2600,5.0,3.0,120.0,4
2,Built-up Area,Ready To Move,Uttarahalli,GrrvaGr,1440,2.0,3.0,62.0,3
3,Super built-up Area,Ready To Move,Lingadheeranahalli,Soiewre,1521,3.0,1.0,95.0,3
4,Super built-up Area,Ready To Move,Kothanur,GrrvaGr,1200,2.0,1.0,51.0,2


In [53]:
#Doing some operation on our data
data_frame[data_frame.bhk>20]

"""
If we notice carefully in our total_sqft column data is not that accurate
for row-4684 if we see carefully it is written we have 34 bhk in just 2400 sq.ft data 
which definetly not correct 
"""

Unnamed: 0,area_type,availability,location,society,total_sqft,bath,balcony,price,bhk
1718,Super built-up Area,Ready To Move,2Electronic City Phase II,GrrvaGr,8000,27.0,0.0,230.0,27
4684,Plot Area,Ready To Move,Munnekollal,GrrvaGr,2400,40.0,0.0,660.0,43


In [55]:
data_frame['total_sqft'].unique()
"""
if we notice few of our data are in range which is not correct so first of all we will try
to improve this by converting range into 1 fixed value i.e we have a range 1133 - 1384
we will take median of both the values and store it only. 
"""

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [76]:
def convert_sq_to_num(x):
    tokens = x.split('-')
    if len(tokens) == 2:
        return (float(tokens[0]) + float(tokens[1])) / 2
    try:
        return float(x)  
    except:
        return None #for values like 34.46Sq. Meter
    
data_frame['total_sqft'].apply(convert_sq_to_num)
data_frame['total_sqft'].head()

None


0    1056
1    2600
2    1440
3    1521
4    1200
Name: total_sqft, dtype: object