In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

%matplotlib inline
# Show the full results: lift pandas' truncation limits for both columns and rows.
# Full option names are spelled out because abbreviated keys are resolved by
# pattern matching and may break if pandas adds similarly named options.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Default size, as (width, height), applied to every figure created after this point.
matplotlib.rcParams['figure.figsize'] = (20, 10)

In [62]:
# Load the raw housing listings from CSV and preview the first five rows.
house = pd.read_csv('../data/Bengaluru_House_Data.csv')
house.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [63]:
# Summarise column dtypes and non-null counts to see which columns have missing values.
house.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [64]:
# (rows, columns) of the raw frame.
house.shape

(13320, 9)

In [65]:
# Number of listings per area_type category.
house.groupby('area_type')['area_type'].count()

area_type
Built-up  Area          2418
Carpet  Area              87
Plot  Area              2025
Super built-up  Area    8790
Name: area_type, dtype: int64

In [66]:
# Discard the columns that are not used in the rest of the cleaning
# (leaves location, size, total_sqft, bath and price).
house.drop(columns=['area_type', 'society', 'availability', 'balcony'], inplace=True)

In [67]:
# Missing values per remaining column.
house.isna().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [68]:
# Remove every row that has at least one missing value, then confirm
# that no nulls are left in any column.
house.dropna(axis=0, how='any', inplace=True)
house.isna().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [69]:
# Work on an independent copy so the transformations below do not modify `house`.
df = house.copy()

In [70]:
# Preview the working copy before transforming it.
df.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [71]:
# Inspect the distinct raw `size` labels (e.g. '2 BHK', '4 Bedroom') before parsing them.
df['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [72]:
# Keep only the leading token of each label ('2 BHK' -> '2', '4 Bedroom' -> '4').
df['size'] = df['size'].str.split(' ').str[0]

In [73]:
# Check the extracted tokens; they are still strings at this point.
df['size'].unique()

array(['2', '4', '3', '6', '1', '8', '7', '5', '11', '9', '27', '10',
       '19', '16', '43', '14', '12', '13', '18'], dtype=object)

In [74]:
# Convert the extracted tokens from strings to integers so they can be compared numerically.
df['size'] = df['size'].astype('int')

In [75]:
# Look at the listing(s) whose parsed size is 43, the largest value seen above.
df.loc[df['size'].eq(43)]

Unnamed: 0,location,size,total_sqft,bath,price
4684,Munnekollal,43,2400,40.0,660.0


In [76]:
# total_sqft is stored as strings; print the distinct values to spot non-numeric entries.
print(df['total_sqft'].unique())

['1056' '2600' '1440' ... '1133 - 1384' '774' '4689']


In [77]:
# Show the first ten rows whose total_sqft is not made up purely of digits
# (ranges, decimals, values carrying a unit suffix).
df.loc[~df['total_sqft'].str.isdigit()].head(10)

Unnamed: 0,location,size,total_sqft,bath,price
30,Yelahanka,4,2100 - 2850,4.0,186.0
44,Kanakpura Road,2,1330.74,2.0,91.79
122,Hebbal,4,3067 - 8156,4.0,477.0
137,8th Phase JP Nagar,2,1042 - 1105,2.0,54.005
142,Kasavanhalli,3,1563.05,3.0,105.0
165,Sarjapur,2,1145 - 1340,2.0,43.49
188,KR Puram,2,1015 - 1540,2.0,56.8
373,Gopalapura,3,2023.71,3.0,275.0
393,Electronics City Phase 1,2,1113.27,2.0,53.0
410,Kengeri,1,34.46Sq. Meter,1.0,18.5


In [78]:
def convert_range_to_num(x):
    """Convert a raw ``total_sqft`` entry to a single float.

    A plain number (``'1330.74'``) is parsed directly. A range written as
    ``'low - high'`` (``'2100 - 2850'``) is replaced by its midpoint.

    Parameters
    ----------
    x : str or number
        Raw cell value. Values that are already numeric are passed through
        ``float`` so the conversion can be re-applied safely.

    Returns
    -------
    float or None
        The parsed number or range midpoint, or ``None`` when the value cannot
        be interpreted (e.g. ``'34.46Sq. Meter'``).
    """
    try:
        if isinstance(x, str):
            tokens = x.split('-')
            if len(tokens) == 2:
                # Exactly one '-' means a 'low - high' range: use the midpoint.
                return (float(tokens[0]) + float(tokens[1])) / 2
        return float(x)
    except (TypeError, ValueError):
        # Only parsing failures are treated as "unparseable"; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return None
# Parse every total_sqft entry into a float (ranges become their midpoint).
df['total_sqft'] = df['total_sqft'].apply(convert_range_to_num)

In [79]:
# Drop the rows whose total_sqft is missing after the numeric conversion.
df.dropna(subset = ['total_sqft'], inplace = True)

In [81]:
# Confirm that no column has missing values left.
df.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [82]:
# (rows, columns) of the cleaned frame.
df.shape

(13200, 5)

In [83]:
# Persist the cleaned frame without the index column.
# NOTE(review): the input was read from '../data/' (lowercase) but this writes to
# '../Data/'; the two are different directories on case-sensitive filesystems — confirm
# which spelling is intended.
df.to_csv('../Data/Cleaned.csv', index=False)