In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('Train.csv')

In [3]:
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


# Cleaning Data 

In [4]:
df.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [5]:
df1 = df.drop(['availability','society','balcony'],axis=1)

In [6]:
df1.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,price
0,Super built-up Area,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Built-up Area,Uttarahalli,3 BHK,1440,2.0,62.0
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Super built-up Area,Kothanur,2 BHK,1200,2.0,51.0


In [7]:
df1['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', nan, '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [8]:
df1.dropna(inplace=True)

In [9]:
df1.isnull().sum()

area_type     0
location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [10]:
df1['bhk'] = df1['size'].apply(lambda x: int(x.split(' ')[0]))

In [11]:
df1.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,price,bhk
0,Super built-up Area,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Built-up Area,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Super built-up Area,Kothanur,2 BHK,1200,2.0,51.0,2


In [12]:
df1.total_sqft.unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [13]:
def convert_sqft(x):
    tokens = x.split('-')
    if len(tokens) == 2:
        return(float(tokens[0])+float(tokens[1]))/2
    try:
        return float(x)
    except:
        return None

In [14]:
convert_sqft('1133 - 1384')

1258.5

In [15]:
df2 = df1.copy()

In [16]:
df2['sqft'] = df2['total_sqft'].apply(convert_sqft)
df2.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,price,bhk,sqft
0,Super built-up Area,Electronic City Phase II,2 BHK,1056,2.0,39.07,2,1056.0
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4,2600.0
2,Built-up Area,Uttarahalli,3 BHK,1440,2.0,62.0,3,1440.0
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3,1521.0
4,Super built-up Area,Kothanur,2 BHK,1200,2.0,51.0,2,1200.0


In [17]:
df2.area_type.unique()

array(['Super built-up  Area', 'Plot  Area', 'Built-up  Area',
       'Carpet  Area'], dtype=object)

In [18]:
df3 = df2.copy()

In [19]:
df3['per_sqft_price'] = df3['price']*100000/df3['sqft']
df3.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,price,bhk,sqft,per_sqft_price
0,Super built-up Area,Electronic City Phase II,2 BHK,1056,2.0,39.07,2,1056.0,3699.810606
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4,2600.0,4615.384615
2,Built-up Area,Uttarahalli,3 BHK,1440,2.0,62.0,3,1440.0,4305.555556
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3,1521.0,6245.890861
4,Super built-up Area,Kothanur,2 BHK,1200,2.0,51.0,2,1200.0,4250.0


In [20]:
df3.location = df3.location.apply(lambda x: x.strip())
loc_stats = df3.groupby('location')['location'].agg('count').sort_values(ascending = False)
loc_stats

location
Whitefield           535
Sarjapur  Road       392
Electronic City      304
Kanakpura Road       266
Thanisandra          236
                    ... 
LIC Colony             1
Kuvempu Layout         1
Kumbhena Agrahara      1
Kudlu Village,         1
1 Annasandrapalya      1
Name: location, Length: 1293, dtype: int64

In [21]:
less_than_10 = loc_stats[loc_stats<10]

In [22]:
df3.location = df3.location.apply(lambda x: 'others' if x in less_than_10 else x)
len(df3.location.unique())

255

In [23]:
df3.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,price,bhk,sqft,per_sqft_price
0,Super built-up Area,Electronic City Phase II,2 BHK,1056,2.0,39.07,2,1056.0,3699.810606
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4,2600.0,4615.384615
2,Built-up Area,Uttarahalli,3 BHK,1440,2.0,62.0,3,1440.0,4305.555556
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3,1521.0,6245.890861
4,Super built-up Area,Kothanur,2 BHK,1200,2.0,51.0,2,1200.0,4250.0


In [24]:
df3.drop(['size','total_sqft'],axis='columns',inplace=True)

In [25]:
df3.head()

Unnamed: 0,area_type,location,bath,price,bhk,sqft,per_sqft_price
0,Super built-up Area,Electronic City Phase II,2.0,39.07,2,1056.0,3699.810606
1,Plot Area,Chikka Tirupathi,5.0,120.0,4,2600.0,4615.384615
2,Built-up Area,Uttarahalli,2.0,62.0,3,1440.0,4305.555556
3,Super built-up Area,Lingadheeranahalli,3.0,95.0,3,1521.0,6245.890861
4,Super built-up Area,Kothanur,2.0,51.0,2,1200.0,4250.0


# Removing Outliers

In [26]:
df3[df3.sqft/df3.bhk<300].head()

Unnamed: 0,area_type,location,bath,price,bhk,sqft,per_sqft_price
9,Plot Area,others,6.0,370.0,6,1020.0,36274.509804
45,Plot Area,HSR Layout,9.0,200.0,8,600.0,33333.333333
58,Plot Area,Murugeshpalya,4.0,150.0,6,1407.0,10660.98081
68,Plot Area,Devarachikkanahalli,7.0,85.0,8,1350.0,6296.296296
70,Plot Area,others,3.0,100.0,3,500.0,20000.0


In [27]:
df4 = df3[~(df3.sqft/df3.bhk<300)]

In [28]:
df4.shape

(12502, 7)

In [29]:
def remove_outliers(df):
    df_out = pd.DataFrame()
    for key,subdf in df.groupby('location'):
        m = np.mean(subdf.per_sqft_price)
        st = np.std(subdf.per_sqft_price)
        reduce_df = subdf[(subdf.per_sqft_price>(m-st))&(subdf.per_sqft_price<=(m+st))]
        df_out = pd.concat([df_out,reduce_df],ignore_index=True)
    return df_out

In [30]:
df5 = remove_outliers(df4)
df5.shape

(10222, 7)

In [31]:
def remove_bhk_outlier(df):
    exclude = np.array([])
    for location, location_df in df.groupby('location'):
        bhk_stats = {}
        for bhk, bhk_df in location_df.groupby('bhk'):
            bhk_stats[bhk]={
                'mean':np.mean(bhk_df.per_sqft_price),
                'std':np.std(bhk_df.per_sqft_price),
                'count':bhk_df.shape[0]
            }
        for bhk, bhk_df in location_df.groupby('bhk'):
            stats = bhk_stats.get(bhk-1)
            if stats and stats['count']>5:
                exclude = np.append(exclude,bhk_df[bhk_df.per_sqft_price<(stats['mean'])].index.values)        
    return df.drop(exclude,axis='index')


In [32]:
df6 = remove_bhk_outlier(df5)
df6.shape

(7369, 7)

In [33]:
df6.bath.unique()

array([ 4.,  3.,  2.,  5.,  8.,  1.,  6., 14.,  7.,  9., 12., 16., 13.])

In [34]:
df6[df6.bath>10]

Unnamed: 0,area_type,location,bath,price,bhk,sqft,per_sqft_price
585,Plot Area,BTM 1st Stage,14.0,500.0,9,3300.0,15151.515152
5344,Super built-up Area,Neeladri Nagar,12.0,160.0,10,4000.0,4000.0
8547,Super built-up Area,others,12.0,525.0,10,12000.0,4375.0
8633,Super built-up Area,others,16.0,550.0,16,10000.0,5500.0
9329,Super built-up Area,others,12.0,150.0,11,6000.0,2500.0
9642,Super built-up Area,others,13.0,275.0,13,5425.0,5069.124424


In [35]:
df6[df6.bath>df6.bhk+2]

Unnamed: 0,area_type,location,bath,price,bhk,sqft,per_sqft_price
585,Plot Area,BTM 1st Stage,14.0,500.0,9,3300.0,15151.515152
1642,Built-up Area,Chikkabanavar,7.0,80.0,4,2460.0,3252.03252
5305,Built-up Area,Nagasandra,8.0,450.0,4,7000.0,6428.571429
6792,Super built-up Area,Thanisandra,6.0,116.0,3,1806.0,6423.03433
8476,Super built-up Area,others,9.0,1000.0,6,11338.0,8819.897689


In [36]:
df7 = df6[df6.bath<df6.bhk+2]

In [37]:
df7.shape

(7291, 7)

In [38]:
df8 = df7.drop(['per_sqft_price','area_type'],axis='columns')

# Developing The model

In [39]:
df8.head(3)

Unnamed: 0,location,bath,price,bhk,sqft
0,1st Block Jayanagar,4.0,428.0,4,2850.0
1,1st Block Jayanagar,3.0,194.0,3,1630.0
2,1st Block Jayanagar,2.0,235.0,3,1875.0


In [40]:
dummies = pd.get_dummies(df8.location)

In [41]:
df9 = pd.concat([df8,dummies.drop('others',axis='columns')],axis='columns')

In [42]:
df9.drop('location',axis='columns',inplace=True)

In [43]:
X = df9.drop('price',axis='columns')
y = df9.price

In [44]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [45]:
from sklearn.model_selection import train_test_split

In [46]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [47]:
model.fit(X_train,y_train)

LinearRegression()

In [48]:
model.score(X_test,y_test)

0.8556662450161989

In [49]:
def predict_price(location, sqft , bath , bhk):
    loc = np.where(X.columns==location)[0][0]
    
    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk
    if loc >=0:
        x[loc] = 1
    return model.predict([x])[0]

In [55]:
X.columns

Index(['bath', 'bhk', 'sqft', '1st Block Jayanagar', '1st Block Koramangala',
       '1st Phase JP Nagar', '2nd Phase Judicial Layout',
       '2nd Stage Nagarbhavi', '5th Block Hbr Layout', '5th Phase JP Nagar',
       ...
       'Vijayanagar', 'Vishveshwarya Layout', 'Vishwapriya Layout',
       'Vittasandra', 'Whitefield', 'Yelachenahalli', 'Yelahanka',
       'Yelahanka New Town', 'Yelenahalli', 'Yeshwanthpur'],
      dtype='object', length=257)

In [None]:
predict_price('1st Block Jayanagar',2850,4,4)

In [None]:
predict_price('Vishveshwarya Layout',100,2,2)