In [696]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
plt.rcParams['figure.figsize'] = (20,10)

'''%matplotlib inline --> without it the code will be run but no plot will appeare
to control the defaults of almost every property in Matplotlib: figure size and DPI, line width,
color and style, axes, axis and grid properties, text and font properties and so on.'''

In [697]:
# read csv file
df1 = pd.read_csv('DataSet/bengaluru_house_prices.csv')
df1.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [698]:
# number of columns and rows 
df1.shape
# shape() --> TypeError: 'tuple' object is not callable

(13320, 9)

In [699]:
# examine area_type featuer
df1.groupby('area_type')['area_type'].agg('count')
# df1.groupby('area_type')['area_type'].count()

area_type
Built-up  Area          2418
Carpet  Area              87
Plot  Area              2025
Super built-up  Area    8790
Name: area_type, dtype: int64

In [700]:
# to simplify the model --> drop some columns
df2 = df1.drop(['area_type','availability', 'society' , 'balcony'] , axis = 'columns')
df2.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [701]:
# data cleaning process
# tell me the number of rows where particulare columns is null --> isnull()
# tell me the number of rows where particulare columns is not null --> notnull()
# sum of isnull() and notnull() == count from shape
df2.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [702]:
# drop the null values 
# i have 2 choises here : if the number of null is too small in comparesion with the number of columns 
# (in my case 90 from 13320) -- > drop it 
# else if it represent a high pergentage --> take the median and fill the null with it
df3 = df2.dropna()
# --> check if they are droped
df3.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [703]:
# navigate the size column 
df3['size'].unique()
# give me all the unique values in the column

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [704]:
# BHK = Bedroom --> so i must keep the number only to avoid under or over estimation in the model
# by split the string 'which give me a list of char then keep in char with index 0 only' and save them in a new column
# the new column 'bhk'
# by .apply() i can apply any method of function between the () to a selected elements
df3['bhk'] = df3['size'].apply(lambda x : int(x.split(' ')[0]))
df3.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['bhk'] = df3['size'].apply(lambda x : int(x.split(' ')[0]))


Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,51.0,2


In [705]:
# df3.drop(['size'], axis = 'columns')
# df3.head()
df3['bhk'].unique()

array([ 2,  4,  3,  6,  1,  8,  7,  5, 11,  9, 27, 10, 19, 16, 43, 14, 12,
       13, 18])

In [706]:
df3 = df3.drop(['size'] , axis = 'columns')
df3.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,Electronic City Phase II,1056,2.0,39.07,2
1,Chikka Tirupathi,2600,5.0,120.0,4
2,Uttarahalli,1440,2.0,62.0,3
3,Lingadheeranahalli,1521,3.0,95.0,3
4,Kothanur,1200,2.0,51.0,2


In [707]:
df3['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [708]:
from numpy import mean
def clean_total_sqft(x):
    y = x.split('-')
    if len(y) == 2 :
        return (float(y[0]) + float(y[1]))/2
    
    try:
        return float(x)
    except:
        # numbers = [int(i) for i in z if type(i)==type(5)]
        return None    

In [709]:
clean_total_sqft('50a')

In [710]:
df4 = df3.copy()
df4.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,Electronic City Phase II,1056,2.0,39.07,2
1,Chikka Tirupathi,2600,5.0,120.0,4
2,Uttarahalli,1440,2.0,62.0,3
3,Lingadheeranahalli,1521,3.0,95.0,3
4,Kothanur,1200,2.0,51.0,2


In [711]:
# df4['total_sqft']= df4['total_sqft'].apply(clean_total_sqft)
df4.total_sqft= df4.total_sqft.apply(clean_total_sqft)

In [712]:
df4.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,Electronic City Phase II,1056.0,2.0,39.07,2
1,Chikka Tirupathi,2600.0,5.0,120.0,4
2,Uttarahalli,1440.0,2.0,62.0,3
3,Lingadheeranahalli,1521.0,3.0,95.0,3
4,Kothanur,1200.0,2.0,51.0,2


In [713]:
df4.loc[648]

location      Arekere
total_sqft        NaN
bath              9.0
price           265.0
bhk                 9
Name: 648, dtype: object

In [714]:
df4.isnull().sum()

location       0
total_sqft    46
bath           0
price          0
bhk            0
dtype: int64

In [715]:
df4 = df4.dropna()
df4.isnull().sum()

location      0
total_sqft    0
bath          0
price         0
bhk           0
dtype: int64

In [716]:
df5 = df4.copy()
df5['price_per_sqft'] = df5['price']*100000 /df5['total_sqft']
df5.location= df5.location.apply(lambda x : x.strip())
df5.head()

Unnamed: 0,location,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,1200.0,2.0,51.0,2,4250.0


In [717]:
len(df5['location'].unique())

1287

In [718]:
location_state = df5.groupby('location')['location'].count().sort_values(ascending= False)
location_state

location
Whitefield               533
Sarjapur  Road           392
Electronic City          304
Kanakpura Road           264
Thanisandra              235
                        ... 
1 Giri Nagar               1
Kanakapura Road,           1
Kanakapura main  Road      1
Kannur                     1
whitefiled                 1
Name: location, Length: 1287, dtype: int64

In [719]:
location_less_than_10 = location_state[location_state <= 10]
len(location_less_than_10)

1047

In [720]:
df5.location = df5.location.apply(lambda x : 'other' if x in location_less_than_10 else x )
len(df5.location.unique())

241

In [721]:
df5.head()

Unnamed: 0,location,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,1200.0,2.0,51.0,2,4250.0


In [722]:
df5.shape

(13200, 6)

In [723]:
df6 = df5[~(df5.total_sqft/ df5.bhk < 300)]
df6.shape

(12456, 6)